├── .gitignore ├── README.md ├── code ├── 00_python_beginner_workshop.py ├── 00_python_intermediate_workshop.py ├── 02_command_line.md ├── 03_file_reading.py ├── 03_python_homework_chipotle.py ├── 03_python_homework_chipotle_explained_nb.py ├── 04_pandas.py ├── 05_pandas_homework_imdb.py ├── 05_pandas_merge_nb.py ├── 05_pandas_visualization_nb.py ├── 06_human_learning_iris_nb.py ├── 07_api.py ├── 07_web_scraping.py ├── 08_bias_variance_nb.py ├── 08_knn_sklearn_nb.py ├── 08_nba_knn_nb.py ├── 08_pandas_review_nb.py ├── 09_model_evaluation_nb.py ├── 10_linear_regression_nb.py ├── 10_yelp_votes_homework_nb.py ├── 12_e_log_examples_nb.py ├── 12_logistic_regression_nb.py ├── 12_titanic_confusion_nb.py ├── 13_advanced_model_evaluation_nb.py ├── 13_bank_exercise_nb.py ├── 13_cross_validation_nb.py ├── 14_bayes_theorem_iris_nb.py ├── 14_text_data_sklearn_nb.py ├── 14_types_of_naive_bayes_nb.py ├── 15_natural_language_processing_nb.py ├── 16_kaggle.py ├── 16_kaggle_minimal.py ├── 17_bikeshare_exercise_nb.py ├── 17_decision_trees_nb.py ├── 18_ensembling_nb.py ├── 19_advanced_sklearn_nb.py ├── 19_clustering_nb.py ├── 20_regex_exercise.py ├── 20_regex_reference.py └── 20_regularization_nb.py ├── data ├── airlines.csv ├── bank-additional.csv ├── beer.txt ├── bikeshare.csv ├── chipotle.tsv ├── drinks.csv ├── example.html ├── hitters.csv ├── homicides.txt ├── imdb_1000.csv ├── imdb_ids.txt ├── sms.tsv ├── titanic.csv ├── u.data ├── u.item ├── u.user ├── u.user_original ├── ufo.csv ├── vehicles_test.csv ├── vehicles_train.csv ├── yelp.csv └── yelp.json ├── homework ├── 02_command_line_chipotle.md ├── 09_bias_variance.md ├── 10_yelp_votes.md ├── 13_cross_validation.md ├── 13_roc_auc.md ├── 14_spam_filtering.md └── 14_yelp_review_text.md ├── notebooks ├── 03_python_homework_chipotle_explained.ipynb ├── 05_pandas_merge.ipynb ├── 05_pandas_visualization.ipynb ├── 06_human_learning_iris.ipynb ├── 08_bias_variance.ipynb ├── 08_knn_sklearn.ipynb ├── 08_nba_knn.ipynb ├── 08_pandas_review.ipynb ├── 09_model_evaluation.ipynb ├── 10_linear_regression.ipynb ├── 10_yelp_votes_homework.ipynb ├── 12_e_log_examples.ipynb ├── 12_logistic_regression.ipynb ├── 12_titanic_confusion.ipynb ├── 13_advanced_model_evaluation.ipynb ├── 13_bank_exercise.ipynb ├── 13_cross_validation.ipynb ├── 14_bayes_theorem_iris.ipynb ├── 14_naive_bayes_spam.ipynb ├── 14_text_data_sklearn.ipynb ├── 14_types_of_naive_bayes.ipynb ├── 14_yelp_review_text_homework.ipynb ├── 15_natural_language_processing.ipynb ├── 17_bikeshare_exercise.ipynb ├── 17_decision_trees.ipynb ├── 18_ensembling.ipynb ├── 19_advanced_sklearn.ipynb ├── 19_clustering.ipynb ├── 20_regularization.ipynb └── images │ ├── bias_variance.png │ ├── cross_validation_diagram.png │ ├── crowdflower_ensembling.jpg │ ├── driver_ensembling.png │ ├── estimating_coefficients.png │ ├── iris_01nn_map.png │ ├── iris_05nn_map.png │ ├── iris_15nn_map.png │ ├── iris_50nn_map.png │ ├── lasso_ridge_coefficients.png │ ├── lasso_ridge_path.png │ ├── logistic_betas.png │ ├── obama_clinton_tree.jpg │ ├── polynomial_overfitting.png │ ├── salary_color.png │ ├── salary_regions.png │ ├── salary_tree.png │ ├── salary_tree_annotated.png │ ├── salary_tree_deep.png │ ├── supervised_learning.png │ ├── train_test_split.png │ ├── training_testing_error.png │ ├── tree_bikeshare.png │ ├── tree_titanic.png │ ├── tree_vehicles.png │ └── tree_vs_linear.png ├── other ├── 02_exercise_output.png ├── 02_file_tree.png ├── advice.md ├── model_comparison.md ├── model_evaluation_comparison.md ├── python_packages.md └── 
setup_checklist.md ├── project ├── README.md ├── peer_review.md └── public_data.md ├── requirements.txt └── slides ├── 01_course_overview.pdf ├── 01_course_overview.pptx ├── 01_intro_to_data_science.pdf ├── 01_intro_to_data_science.pptx ├── 01_types_of_data.pdf ├── 01_types_of_data.pptx ├── 02_git_github.pdf ├── 02_git_github.pptx ├── 06_machine_learning.pdf ├── 06_machine_learning.pptx ├── 12_confusion_matrix.pdf ├── 12_confusion_matrix.pptx ├── 13_drawing_roc.pdf ├── 13_drawing_roc.pptx ├── 14_bayes_theorem.pdf ├── 14_bayes_theorem.pptx ├── 14_naive_bayes.pdf ├── 14_naive_bayes.pptx ├── 16_kaggle.pdf ├── 16_kaggle.pptx ├── 19_clustering.pdf └── 19_clustering.pptx /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .DS_Store 3 | *.pyc 4 | extras/ 5 | -------------------------------------------------------------------------------- /code/00_python_beginner_workshop.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Beginner Workshop 3 | ''' 4 | 5 | ''' 6 | Multi-line comments go between 3 quotation marks. 7 | You can use single or double quotes. 8 | ''' 9 | 10 | # One-line comments are preceded by the pound symbol 11 | 12 | 13 | # BASIC DATA TYPES 14 | 15 | x = 5 # creates an object 16 | print type(x) # check the type: int (not declared explicitly) 17 | type(x) # automatically prints 18 | type(5) # assigning it to a variable is not required 19 | 20 | type(5.0) # float 21 | type('five') # str 22 | type(True) # bool 23 | 24 | 25 | # LISTS 26 | 27 | nums = [5, 5.0, 'five'] # multiple data types 28 | nums # print the list 29 | type(nums) # check the type: list 30 | len(nums) # check the length: 3 31 | nums[0] # print first element 32 | nums[0] = 6 # replace a list element 33 | 34 | nums.append(7) # list 'method' that modifies the list 35 | help(nums.append) # help on this method 36 | help(nums) # help on a list object 37 | nums.remove('five') # another list method 38 | 39 | sorted(nums) # 'function' that does not modify the list 40 | nums # it was not affected 41 | nums = sorted(nums) # overwrite the original list 42 | sorted(nums, reverse=True) # optional argument 43 | 44 | 45 | # FUNCTIONS 46 | 47 | def give_me_five(): # function definition ends with colon 48 | return 5 # indentation required for function body 49 | 50 | give_me_five() # prints the return value (5) 51 | num = give_me_five() # assigns return value to a variable, doesn't print it 52 | 53 | def calc(x, y, op): # three parameters (without any defaults) 54 | if op == 'add': # conditional statement 55 | return x + y 56 | elif op == 'subtract': 57 | return x - y 58 | else: 59 | print 'Valid operations: add, subtract' 60 | 61 | calc(5, 3, 'add') 62 | calc(5, 3, 'subtract') 63 | calc(5, 3, 'multiply') 64 | calc(5, 3) 65 | 66 | 67 | # EXERCISE: Write a function that takes two parameters (hours and rate), and 68 | # returns the total pay. 
69 | 70 | def compute_pay(hours, rate): 71 | return hours * rate 72 | 73 | compute_pay(40, 10.50) 74 | 75 | 76 | # FOR LOOPS 77 | 78 | # print each list element in uppercase 79 | fruits = ['apple', 'banana', 'cherry'] 80 | for fruit in fruits: 81 | print fruit.upper() 82 | -------------------------------------------------------------------------------- /code/00_python_intermediate_workshop.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Intermediate Workshop 3 | ''' 4 | 5 | ''' 6 | LISTS 7 | ''' 8 | 9 | # creating 10 | a = [1, 2, 3, 4, 5] # create lists using brackets 11 | 12 | # slicing 13 | a[0] # returns 1 (Python is zero indexed) 14 | a[1:3] # returns [2, 3] (inclusive of first index but exclusive of second) 15 | a[-1] # returns 5 (last element) 16 | 17 | # appending 18 | a[5] = 6 # error because you can't assign outside the existing range 19 | a.append(6) # list method that appends 6 to the end 20 | a = a + [0] # use plus sign to combine lists 21 | 22 | # checking length 23 | len(a) # returns 7 24 | 25 | # checking type 26 | type(a) # returns list 27 | type(a[0]) # returns int 28 | 29 | # sorting 30 | sorted(a) # sorts the list 31 | sorted(a, reverse=True) # reverse=True is an 'optional argument' 32 | sorted(a, True) # error because optional arguments must be named 33 | 34 | 35 | ''' 36 | STRINGS 37 | ''' 38 | 39 | # creating 40 | a = 'hello' # can use single or double quotes 41 | 42 | # slicing 43 | a[0] # returns 'h' (works like list slicing) 44 | a[1:3] # returns 'el' 45 | a[-1] # returns 'o' 46 | 47 | # concatenating 48 | a + ' there' # use plus sign to combine strings 49 | 5 + ' there' # error because they are different types 50 | str(5) + ' there' # cast 5 to a string in order for this to work 51 | 52 | # uppercasing 53 | a[0] = 'H' # error because strings are immutable (can't overwrite characters) 54 | a.upper() # string method (this method doesn't exist for lists) 55 | 56 | # checking length 57 | len(a) # returns 5 (number of characters) 58 | 59 | 60 | ''' 61 | EXERCISE: 62 | 1. Create a list of the first names of your family members. 63 | 2. Print the name of the last person in the list. 64 | 3. Print the length of the name of the first person in the list. 65 | 4. Change one of the names from their real name to their nickname. 66 | 5. Append a new person to the list. 67 | 6. Change the name of the new person to lowercase using the string method 'lower'. 68 | 7. Sort the list in reverse alphabetical order. 69 | Bonus: Sort the list by the length of the names (shortest to longest). 
70 | ''' 71 | 72 | names = ['Wesley', 'Larry', 'Wan'] # list of names 73 | names[-1] # last element 74 | len(names[0]) # length of first string 75 | names[0] = 'Wes' # overwrite existing element 76 | names.append('Gabriel') # append new element 77 | names[-1] = names[-1].lower() # change last string to be lowercase 78 | sorted(names, reverse=True) # sort the list in reverse order 79 | sorted(names, key=len) # sort the list by length 80 | 81 | 82 | ''' 83 | FOR LOOPS AND LIST COMPREHENSIONS 84 | ''' 85 | 86 | # for loop to print 1 through 5 87 | nums = range(1, 6) # create a list of 1 through 5 88 | for num in nums: # num 'becomes' each list element for one loop 89 | print num 90 | 91 | # for loop to print 1, 3, 5 92 | other = [1, 3, 5] # create a different list 93 | for x in other: # name 'x' does not matter, not defined in advance 94 | print x # this loop only executes 3 times (not 5) 95 | 96 | # for loop to create a list of 2, 4, 6, 8, 10 97 | doubled = [] # create empty list to store results 98 | for num in nums: # loop through nums (will execute 5 times) 99 | doubled.append(num*2) # append the double of the current value of num 100 | 101 | # equivalent list comprehension 102 | doubled = [num*2 for num in nums] # expression (num*2) goes first, brackets 103 | # indicate we are storing results in a list 104 | 105 | 106 | ''' 107 | EXERCISE 1: 108 | Given that: letters = ['a', 'b', 'c'] 109 | Write a list comprehension that returns: ['A', 'B', 'C'] 110 | 111 | EXERCISE 2 (BONUS): 112 | Given that: word = 'abc' 113 | Write a list comprehension that returns: ['A', 'B', 'C'] 114 | 115 | EXERCISE 3 (BONUS): 116 | Given that: fruits = ['Apple', 'Banana', 'Cherry'] 117 | Write a list comprehension that returns: ['A', 'B', 'C'] 118 | ''' 119 | 120 | letters = ['a', 'b', 'c'] 121 | [letter.upper() for letter in letters] # iterate through a list of strings, 122 | # and each string has an 'upper' method 123 | word = 'abc' 124 | [letter.upper() for letter in word] # iterate through each character 125 | 126 | fruits = ['Apple', 'Banana', 'Cherry'] 127 | [fruit[0] for fruit in fruits] # slice the first character from each string 128 | 129 | 130 | ''' 131 | DICTIONARIES 132 | ''' 133 | 134 | # dictionaries are made of key-value pairs (like a real dictionary) 135 | family = {'dad':'Homer', 'mom':'Marge', 'size':2} 136 | 137 | # check the length 138 | len(family) # returns 3 (number of key-value pairs) 139 | 140 | # use the key to look up a value (fast operation regardless of dictionary size) 141 | family['dad'] # returns 'Homer' 142 | 143 | # can't use a value to look up a key 144 | family['Homer'] # error 145 | 146 | # dictionaries are unordered 147 | family[0] # error 148 | 149 | # add a new entry 150 | family['cat'] = 'snowball' 151 | 152 | # keys must be unique, so this edits an existing entry 153 | family['cat'] = 'snowball ii' 154 | 155 | # delete an entry 156 | del family['cat'] 157 | 158 | # keys can be strings or numbers or tuples, values can be any type 159 | family['kids'] = ['bart', 'lisa'] # value can be a list 160 | 161 | # accessing a list element within a dictionary 162 | family['kids'][0] # returns 'bart' 163 | 164 | # useful methods 165 | family.keys() # returns list: ['dad', 'kids', 'mom', 'size'] 166 | family.values() # returns list: ['Homer', ['bart', 'lisa'], 'Marge', 2] 167 | family.items() # returns list of tuples: 168 | # [('dad', 'Homer'), ('kids', ['bart', 'lisa']), ('mom', 'Marge'), ('size', 2)] 169 | 170 | 171 | ''' 172 | EXERCISE: 173 | 1. Print the name of the mom. 174 | 2. 
Change the size to 5. 175 | 3. Add 'Maggie' to the list of kids. 176 | 4. Fix 'bart' and 'lisa' so that the first letter is capitalized. 177 | Bonus: Do this last step using a list comprehension. 178 | ''' 179 | 180 | family['mom'] # returns 'Marge' 181 | family['size'] = 5 # replaces existing value for 'size' 182 | family['kids'].append('Maggie') # access a list, then append 'Maggie' to it 183 | family['kids'][0] = 'Bart' # capitalize names by overwriting them 184 | family['kids'][1] = 'Lisa' 185 | 186 | # or, capitalize using a list comprehension and the 'capitalize' string method 187 | family['kids'] = [kid.capitalize() for kid in family['kids']] 188 | 189 | # or, slice the string, uppercase the first letter, and concatenate with other letters 190 | family['kids'] = [kid[0].upper() + kid[1:] for kid in family['kids']] 191 | 192 | 193 | ''' 194 | REQUESTS 195 | ''' 196 | 197 | # import module (make its functions available) 198 | import requests 199 | 200 | # use requests to talk to the web 201 | r = requests.get('http://www.google.com') 202 | type(r) # special 'response' object 203 | r.text # HTML of web page stored as string 204 | type(r.text) # string is encoded as unicode 205 | r.text[0] # string can be sliced like any string 206 | 207 | 208 | ''' 209 | APIs 210 | 211 | What is an API? 212 | - Application Programming Interface 213 | - Structured way to expose specific functionality and data access to users 214 | - Web APIs usually follow the "REST" standard 215 | 216 | How to interact with a REST API: 217 | - Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response" 218 | - Most relevant request method for us is GET (other methods: POST, PUT, DELETE) 219 | - Response is often JSON format 220 | - Web console is sometimes available (allows you to explore an API) 221 | 222 | API Providers: https://apigee.com/providers 223 | Echo Nest API Console: https://apigee.com/console/echonest 224 | API key: http://bit.ly/myechonest 225 | ''' 226 | 227 | # request data from the Echo Nest API 228 | r = requests.get('http://developer.echonest.com/api/v4/artist/top_hottt?api_key=KBGUPZPJZS9PHWNIN&format=json') 229 | r.text # looks like a dictionary 230 | type(r.text) # actually stored as a string 231 | r.json() # decodes JSON 232 | type(r.json()) # JSON can be represented as a dictionary 233 | top = r.json() # store that dictionary 234 | 235 | # store the artist data 236 | artists = top['response']['artists'] # list of 15 dictionaries 237 | 238 | # create a list of artist names only 239 | names = [artist['name'] for artist in artists] # can iterate through list to access dictionaries 240 | -------------------------------------------------------------------------------- /code/02_command_line.md: -------------------------------------------------------------------------------- 1 | ## Introduction to the Command Line 2 | 3 | This document outlines basic usage of the command line. For Linux and Mac users, these commands should work in **Terminal**. For Windows users, these should work in **Git Bash**. 4 | 5 | ### What is the command line? 6 | 7 | The Command Line Interface (CLI) is a way of interacting with your computer using text-based commands. This is different from the way most people interact with their computers, using their mouse and a Graphical User Interface (GUI). 8 | 9 | ### Why should I use it? 10 | 11 | Once you become comfortable with the basics, it can be a more powerful way to use your computer. You're able to do many things more quickly and programatically. 
12 | 
13 | ### General format for commands
14 | 
15 | `<command> -<options> <arguments>`
16 | * `<command>` is the action we want the computer to take
17 | * `<options>` (or "flags") modify the behavior of the command
18 | * `<arguments>` are the things we want the command to act on
19 | 
20 | For Linux and Mac users, you can view the **man**ual for a command by typing `man <command>`. For Windows users, you can view the help page by typing `<command> --help`.
21 | 
22 | ### Tips
23 | 
24 | * If there are spaces in file or directory names, use a "\" to "escape" the space characters, or just put the entire file path in quotes.
25 | * After typing the first few letters of a file or directory name, you can hit Tab to auto-complete the name. (This often auto-escapes spaces for you.)
26 | * Use the up and down arrow keys to navigate previously entered commands.
27 | 
28 | ### File paths
29 | 
30 | A **relative file path** specifies the path to a file, taking into account your current working directory. For example, if you were to give someone "relative" directions to your house, you would give them directions from their current location (the relative path from where they are to where you are).
31 | 
32 | An **absolute file path** specifies the complete path to a file, ignoring your current working directory. For example, if you were to give someone "absolute" directions to your house, you would start by telling them to be on earth, then go to your continent, then go to your country, then go to your region, etc.
33 | 
34 | 
35 | ### Basic commands
36 | 
37 | ##### `pwd`
38 | * **p**rints **w**orking **d**irectory (the directory you are currently in)
39 | 
40 | ##### `ls`
41 | * **l**i**s**ts files and subdirectories in your working directory
42 | * `ls -a` lists **a**ll files, including hidden files
43 | * `ls -l` lists the files in a **l**ong format with extra information (permissions, size, last modified date, etc.)
44 | * `ls *` also lists the contents of subdirectories (one level deep) in your working directory
45 | * `ls <path>` lists files in a specific directory (without changing your working directory)
46 | 
47 | ##### `clear`
48 | * **clear**s all output from your console
49 | 
50 | ##### `cd`
51 | * `cd <path>` **c**hanges **d**irectory to the path you specify, which can be a relative path or an absolute path
52 | * `cd ..` moves you "up" one directory (to the parent directory)
53 | * `cd` moves you to your "home" directory
54 | 
55 | ##### `mkdir`
56 | * `mkdir <dirname>` **m**a**k**es a new **dir**ectory called `<dirname>`
57 | 
58 | ##### `touch`
59 | * `touch <filename>` creates an empty file called `<filename>`
60 | * This is useful for creating empty files to be edited at a later time.
61 | * You can create multiple empty files with a single command: `touch <file1> <file2> <file3> ...`
62 | 
63 | ##### `rm -i`
64 | * `rm <filename>` **r**e**m**oves (deletes) a file permanently
65 | * `rm -i <filename>` removes files in **i**nteractive mode, in which you are prompted to confirm that you really want to delete the file. It's best to always use `rm -i`.
66 | * `rm -ir <directoryname>` removes a directory and **r**ecursively deletes all of its contents
67 | 
68 | ##### `mv`
69 | * `mv <filename> <new location>` **m**o**v**es a file from its current location to `<new location>`
70 | * `mv <filename> <new filename>` renames a file without changing its location
71 | 
72 | ##### `cp`
73 | * `cp <filename> <new location>` **c**o**p**ies a file from its current location to `<new location>`, leaving the original file unchanged
74 | * `cp <filename> <new filename>` copies a file without changing its location
75 | 
76 | 
77 | ### Pre-class exercise
78 | * Open your command line interface.
79 | * Navigate to your Desktop, and confirm you are there:
80 | * Print your working directory (it should end with `Desktop`).
81 | * List your files and subdirectories (they should match what you see on your Desktop).
82 | * Create a directory called `project`.
83 | * Navigate to the `project` directory, and create the following files in it: `draft_paper.md`, `plot1.png`, `plot2.png`.
84 | * Create two subdirectories in the `project` directory: `code`, `data`
85 | * Navigate to the `code` subdirectory, and create the following files in it: `processing.py`, `exploration.py`.
86 | * Navigate to the `data` subdirectory, and create the following files in it: `original.csv`, `clean.csv`, `other.csv`.
87 | * Make a copy of `draft_paper.md` called `final_paper.md`.
88 | * Rename `plot1.png` as `scatterplot.png`, and rename `plot2.png` as `histogram.png`.
89 | * Create a subdirectory called `viz`, and then move `scatterplot.png` and `histogram.png` to `viz`.
90 | * Delete `other.csv` from the `data` subdirectory.
91 | * Navigate back to `project`, and then print out (with a single command) all of its files, subdirectories, and the contents of those subdirectories. The output should look similar to [this image](../other/02_exercise_output.png).
92 | * Viewing this [collapsible tree diagram](../other/02_file_tree.png) may help you to visualize the directory structure that we have created.
93 | 
94 | 
95 | ### Intermediate commands
96 | 
97 | ##### `head`
98 | * `head <filename>` prints the **head** (the first 10 lines) of the file
99 | * `head -n20 <filename>` prints the first 20 lines of the file
100 | * This is useful for previewing the contents of a large file without opening it.
101 | 
102 | ##### `tail`
103 | * `tail <filename>` prints the **tail** (the last 10 lines) of the file
104 | 
105 | ##### `cat`
106 | * `cat <filename>` prints the entire file
107 | 
108 | ##### `less`
109 | * `less <filename>` allows you to page or scroll through the file
110 | * Hit the spacebar to go down a page, use the arrow keys to scroll up and down, and hit `q` to exit.
111 | 
112 | ##### `wc`
113 | * `wc <filename>` returns the **c**ount of lines, **w**ords, and characters in a file
114 | * `wc -l <filename>` only counts lines, `wc -w <filename>` only counts words, and `wc -c <filename>` only counts characters
115 | * A "word" is defined as any set of characters delimited by a space.
116 | 
117 | ##### `find`
118 | * `find <path> -name <name>` will recursively search the specified path (and its subdirectories) and **find** files and directories with a given `<name>`
119 | * Use `.` for the `<path>` to refer to the working directory.
120 | * For the `<name>`, you can search for an exact match, or use wildcard characters to search for a partial match:
121 | * `*` specifies any number of any characters, such as `find . -name *.py` or `find . -name *data*.*`
122 | * `?` specifies one character, such as `find . -name ??_*.*`
123 | 
124 | ##### `grep`
125 | * `grep <pattern> <filename>` searches a file for a **r**egular **e**xpression **p**attern and prints the matching lines
126 | * The pattern should be in quotation marks to allow for multiple words.
127 | * The pattern is case-sensitive by default, but you can use the `-i` option to **i**gnore case.
128 | * You can use wildcards in the filename to search multiple files, but it only searches the working directory (not subdirectories).
129 | * `grep -r <pattern> <path>` does a **r**ecursive search of the path (checks subdirectories) for matches within files
130 | * Use `.` for the `<path>` to refer to the working directory.
131 | * `grep -r <pattern> /` does a **g**lobal search (of your entire computer) for matches
132 | * Hit `Ctrl + c` if you want to cancel the search.
133 | * Much more complex string-matching patterns can be used.
134 | 
135 | ##### `|`
136 | * `<command 1> | <command 2>` pipes the results from `<command 1>` into `<command 2>`, and then the results of `<command 2>` are printed to the console
137 | 
138 | ##### `>`
139 | * `<command> > <filename>` takes the output of `<command>` and saves it in `<filename>`
140 | * This will overwrite the file if it already exists.
141 | 
142 | ##### `>>`
143 | * `<command> >> <filename>` takes the output of `<command>` and appends it to `<filename>`
144 | * This will create the file if it does not yet exist.
145 | 
146 | 
147 | ### Advanced commands
148 | 
149 | ##### `cut`
150 | * `cut -f1,2 <filename>` **cut**s a tab-delimited file into columns and returns the first two **f**ields
151 | * `cut -f1,2 -d, <filename>` indicates that the file is **d**elimited by commas
152 | 
153 | ##### `sort`
154 | * `sort <filename>` **sort**s the lines of a file (alphabetically by default)
155 | 
156 | ##### `uniq`
157 | * `uniq <filename>` discards all but one of the successive identical lines (thus it only keeps **uniq**ue lines)
158 | * `uniq -c <filename>` also records the **c**ount of the number of occurrences
159 | * Because lines must be successive to be counted as identical, you will usually use `sort` before `uniq`.
160 | 
--------------------------------------------------------------------------------
/code/03_file_reading.py:
--------------------------------------------------------------------------------
1 | '''
2 | Lesson on file reading using Airline Safety Data
3 | https://github.com/fivethirtyeight/data/tree/master/airline-safety
4 | '''
5 | 
6 | # read the whole file at once, return a single string (including newlines)
7 | # 'rU' mode (read universal) converts different line endings into '\n'
8 | f = open('airlines.csv', mode='rU')
9 | file_string = f.read()
10 | f.close()
11 | 
12 | # use a context manager to automatically close your file
13 | with open('airlines.csv', mode='rU') as f:
14 |     file_string = f.read()
15 | 
16 | # read the file into a list (each list element is one row)
17 | with open('airlines.csv', mode='rU') as f:
18 |     file_list = []
19 |     for row in f:
20 |         file_list.append(row)
21 | 
22 | # do the same thing using a list comprehension
23 | with open('airlines.csv', mode='rU') as f:
24 |     file_list = [row for row in f]
25 | 
26 | # side note: splitting strings
27 | 'hello DAT students'.split()
28 | 'hello DAT students'.split('e')
29 | 
30 | # split each string (at the commas) into a list
31 | with open('airlines.csv', mode='rU') as f:
32 |     file_nested_list = [row.split(',') for row in f]
33 | 
34 | # do the same thing using the csv module
35 | import csv
36 | with open('airlines.csv', mode='rU') as f:
37 |     file_nested_list = [row for row in csv.reader(f)]
38 | 
39 | # separate the header and data
40 | header = file_nested_list[0]
41 | data = file_nested_list[1:]
42 | 
43 | '''
44 | EXERCISES:
45 | 
46 | 1. Create a list containing the average number of incidents per year for each airline.
47 | Example for Aer Lingus: (2 + 0)/30 = 0.07
48 | Expected output: [0.07, 2.73, 0.23, ...]
49 | 
50 | 2. Create a list of airline names (without the star).
51 | Expected output: ['Aer Lingus', 'Aeroflot', 'Aerolineas Argentinas', ...]
52 | 
53 | 3. Create a list (of the same length) that contains 1 if there's a star and 0 if not.
54 | Expected output: [0, 1, 0, ...]
55 | 
56 | 4. BONUS: Create a dictionary in which the key is the airline name (without the star)
57 | and the value is the average number of incidents.
58 | Expected output: {'Aer Lingus': 0.07, 'Aeroflot': 2.73, ...} 59 | ''' 60 | 61 | # Part 1 62 | incidents = [round((int(row[2]) + int(row[5])) / float(30), 2) for row in data] 63 | 64 | # Parts 2 and 3 65 | airlines = [] 66 | starred = [] 67 | for row in data: 68 | if row[0][-1] == '*': 69 | starred.append(1) 70 | airlines.append(row[0][:-1]) 71 | else: 72 | starred.append(0) 73 | airlines.append(row[0]) 74 | 75 | # Part 4 76 | airline_incidents = dict(zip(airlines, incidents)) 77 | 78 | ''' 79 | A few extra things that will help you with the homework 80 | ''' 81 | 82 | # 'set' data structure is useful for gathering unique elements 83 | my_list = [1, 2, 1] 84 | set(my_list) # returns a set of 1, 2 85 | len(set(my_list)) # count of unique elements 86 | 87 | # 'in' statement is useful for lists 88 | 1 in my_list # True 89 | 3 in my_list # False 90 | 91 | # 'in' is useful for strings (checks for substrings) 92 | my_string = 'hello there' 93 | 'the' in my_string # True 94 | 'then' in my_string # False 95 | 96 | # 'in' is useful for dictionaries (checks keys but not values) 97 | my_dict = {'name':'Kevin', 'title':'instructor'} 98 | 'name' in my_dict # True 99 | 'Kevin' in my_dict # False 100 | 101 | # 'count' method for strings counts how many times a character appears 102 | my_string.count('e') # 3 103 | -------------------------------------------------------------------------------- /code/03_python_homework_chipotle.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Homework with Chipotle data 3 | https://github.com/TheUpshot/chipotle 4 | ''' 5 | 6 | ''' 7 | BASIC LEVEL 8 | PART 1: Read in the file with csv.reader() and store it in an object called 'file_nested_list'. 9 | Hint: This is a TSV file, and csv.reader() needs to be told how to handle it. 10 | https://docs.python.org/2/library/csv.html 11 | ''' 12 | 13 | import csv 14 | 15 | # specify that the delimiter is a tab character 16 | with open('chipotle.tsv', mode='rU') as f: 17 | file_nested_list = [row for row in csv.reader(f, delimiter='\t')] 18 | 19 | 20 | ''' 21 | BASIC LEVEL 22 | PART 2: Separate 'file_nested_list' into the 'header' and the 'data'. 23 | ''' 24 | 25 | header = file_nested_list[0] 26 | data = file_nested_list[1:] 27 | 28 | 29 | ''' 30 | INTERMEDIATE LEVEL 31 | PART 3: Calculate the average price of an order. 32 | Hint: Examine the data to see if the 'quantity' column is relevant to this calculation. 33 | Hint: Think carefully about the simplest way to do this! 34 | ''' 35 | 36 | # count the number of unique order_id's 37 | # note: you could assume this is 1834 since that's the maximum order_id, but it's best to check 38 | num_orders = len(set([row[0] for row in data])) # 1834 39 | 40 | # create a list of prices 41 | # note: ignore the 'quantity' column because the 'item_price' takes quantity into account 42 | prices = [float(row[4][1:-1]) for row in data] # strip the dollar sign and trailing space 43 | 44 | # calculate the average price of an order and round to 2 digits 45 | round(sum(prices) / num_orders, 2) # $18.81 46 | 47 | 48 | ''' 49 | INTERMEDIATE LEVEL 50 | PART 4: Create a list (or set) of all unique sodas and soft drinks that they sell. 51 | Note: Just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze'. 
52 | ''' 53 | 54 | # if 'item_name' includes 'Canned', append 'choice_description' to 'sodas' list 55 | sodas = [] 56 | for row in data: 57 | if 'Canned' in row[2]: 58 | sodas.append(row[3][1:-1]) # strip the brackets 59 | 60 | # equivalent list comprehension (using an 'if' condition) 61 | sodas = [row[3][1:-1] for row in data if 'Canned' in row[2]] 62 | 63 | # create a set of unique sodas 64 | unique_sodas = set(sodas) 65 | 66 | 67 | ''' 68 | ADVANCED LEVEL 69 | PART 5: Calculate the average number of toppings per burrito. 70 | Note: Let's ignore the 'quantity' column to simplify this task. 71 | Hint: Think carefully about the easiest way to count the number of toppings! 72 | ''' 73 | 74 | # keep a running total of burritos and toppings 75 | burrito_count = 0 76 | topping_count = 0 77 | 78 | # calculate number of toppings by counting the commas and adding 1 79 | # note: x += 1 is equivalent to x = x + 1 80 | for row in data: 81 | if 'Burrito' in row[2]: 82 | burrito_count += 1 83 | topping_count += (row[3].count(',') + 1) 84 | 85 | # calculate the average topping count and round to 2 digits 86 | round(topping_count / float(burrito_count), 2) # 5.40 87 | 88 | 89 | ''' 90 | ADVANCED LEVEL 91 | PART 6: Create a dictionary in which the keys represent chip orders and 92 | the values represent the total number of orders. 93 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... } 94 | Note: Please take the 'quantity' column into account! 95 | Optional: Learn how to use 'defaultdict' to simplify your code. 96 | ''' 97 | 98 | # start with an empty dictionary 99 | chips = {} 100 | 101 | # if chip order is not in dictionary, then add a new key/value pair 102 | # if chip order is already in dictionary, then update the value for that key 103 | for row in data: 104 | if 'Chips' in row[2]: 105 | if row[2] not in chips: 106 | chips[row[2]] = int(row[1]) # this is a new key, so create key/value pair 107 | else: 108 | chips[row[2]] += int(row[1]) # this is an existing key, so add to the value 109 | 110 | # defaultdict saves you the trouble of checking whether a key already exists 111 | from collections import defaultdict 112 | dchips = defaultdict(int) 113 | for row in data: 114 | if 'Chips' in row[2]: 115 | dchips[row[2]] += int(row[1]) 116 | 117 | 118 | ''' 119 | BONUS: Think of a question about this data that interests you, and then answer it! 
120 | ''' 121 | -------------------------------------------------------------------------------- /code/05_pandas_homework_imdb.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pandas Homework with IMDb data 3 | ''' 4 | 5 | ''' 6 | BASIC LEVEL 7 | ''' 8 | 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # read in 'imdb_1000.csv' and store it in a DataFrame named movies 13 | movies = pd.read_csv('imdb_1000.csv') 14 | 15 | # check the number of rows and columns 16 | movies.shape 17 | 18 | # check the data type of each column 19 | movies.dtypes 20 | 21 | # calculate the average movie duration 22 | movies.duration.mean() 23 | 24 | # sort the DataFrame by duration to find the shortest and longest movies 25 | movies.sort('duration').head(1) 26 | movies.sort('duration').tail(1) 27 | 28 | # create a histogram of duration, choosing an "appropriate" number of bins 29 | movies.duration.plot(kind='hist', bins=20) 30 | 31 | # use a box plot to display that same data 32 | movies.duration.plot(kind='box') 33 | 34 | ''' 35 | INTERMEDIATE LEVEL 36 | ''' 37 | 38 | # count how many movies have each of the content ratings 39 | movies.content_rating.value_counts() 40 | 41 | # use a visualization to display that same data, including a title and x and y labels 42 | movies.content_rating.value_counts().plot(kind='bar', title='Top 1000 Movies by Content Rating') 43 | plt.xlabel('Content Rating') 44 | plt.ylabel('Number of Movies') 45 | 46 | # convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP 47 | movies.content_rating.replace(['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED', inplace=True) 48 | 49 | # convert the following content ratings to "NC-17": X, TV-MA 50 | movies.content_rating.replace(['X', 'TV-MA'], 'NC-17', inplace=True) 51 | 52 | # count the number of missing values in each column 53 | movies.isnull().sum() 54 | 55 | # if there are missing values: examine them, then fill them in with "reasonable" values 56 | movies[movies.content_rating.isnull()] 57 | movies.content_rating.fillna('UNRATED', inplace=True) 58 | 59 | # calculate the average star rating for movies 2 hours or longer, 60 | # and compare that with the average star rating for movies shorter than 2 hours 61 | movies[movies.duration >= 120].star_rating.mean() 62 | movies[movies.duration < 120].star_rating.mean() 63 | 64 | # use a visualization to detect whether there is a relationship between duration and star rating 65 | movies.plot(kind='scatter', x='duration', y='star_rating', alpha=0.2) 66 | 67 | # calculate the average duration for each genre 68 | movies.groupby('genre').duration.mean() 69 | 70 | ''' 71 | ADVANCED LEVEL 72 | ''' 73 | 74 | # visualize the relationship between content rating and duration 75 | movies.boxplot(column='duration', by='content_rating') 76 | movies.hist(column='duration', by='content_rating', sharex=True) 77 | 78 | # determine the top rated movie (by star rating) for each genre 79 | movies.sort('star_rating', ascending=False).groupby('genre').title.first() 80 | movies.groupby('genre').title.first() # equivalent, since DataFrame is already sorted by star rating 81 | 82 | # check if there are multiple movies with the same title, and if so, determine if they are actually duplicates 83 | dupe_titles = movies[movies.title.duplicated()].title 84 | movies[movies.title.isin(dupe_titles)] 85 | 86 | # calculate the average star rating for each genre, but only include genres with at least 10 movies 87 | 88 | # option 1: 
manually create a list of relevant genres, then filter using that list 89 | movies.genre.value_counts() 90 | top_genres = ['Drama', 'Comedy', 'Action', 'Crime', 'Biography', 'Adventure', 'Animation', 'Horror', 'Mystery'] 91 | movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean() 92 | 93 | # option 2: automatically create a list of relevant genres by saving the value_counts and then filtering 94 | genre_counts = movies.genre.value_counts() 95 | top_genres = genre_counts[genre_counts >= 10].index 96 | movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean() 97 | 98 | # option 3: calculate the average star rating for all genres, then filter using a boolean Series 99 | movies.groupby('genre').star_rating.mean()[movies.genre.value_counts() >= 10] 100 | 101 | # option 4: aggregate by count and mean, then filter using the count 102 | genre_ratings = movies.groupby('genre').star_rating.agg(['count', 'mean']) 103 | genre_ratings[genre_ratings['count'] >= 10] 104 | 105 | ''' 106 | BONUS 107 | ''' 108 | 109 | # Figure out something "interesting" using the actors data! 110 | -------------------------------------------------------------------------------- /code/05_pandas_merge_nb.py: -------------------------------------------------------------------------------- 1 | # # Joining (Merging) DataFrames 2 | 3 | # Using the [MovieLens 100k data](http://grouplens.org/datasets/movielens/), let's create two DataFrames: 4 | # 5 | # - **movies**: shows information about movies, namely a unique **movie_id** and its **title** 6 | # - **ratings**: shows the **rating** that a particular **user_id** gave to a particular **movie_id** at a particular **timestamp** 7 | 8 | # ### Movies 9 | 10 | import pandas as pd 11 | movie_url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item' 12 | movie_cols = ['movie_id', 'title'] 13 | movies = pd.read_table(movie_url, sep='|', header=None, names=movie_cols, usecols=[0, 1]) 14 | movies.head() 15 | 16 | 17 | # ### Ratings 18 | 19 | rating_url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.data' 20 | rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp'] 21 | ratings = pd.read_table(rating_url, sep='\t', header=None, names=rating_cols) 22 | ratings.head() 23 | 24 | 25 | # Let's pretend that you want to examine the ratings DataFrame, but you want to know the **title** of each movie rather than its **movie_id**. The best way to accomplish this objective is by "joining" (or "merging") the DataFrames using the Pandas `merge` function: 26 | 27 | movie_ratings = pd.merge(movies, ratings) 28 | movie_ratings.head() 29 | 30 | 31 | # Here's what just happened: 32 | # 33 | # - Pandas noticed that movies and ratings had one column in common, namely **movie_id**. This is the "key" on which the DataFrames will be joined. 34 | # - The first **movie_id** in movies is 1. Thus, Pandas looked through every row in the ratings DataFrame, searching for a movie_id of 1. Every time it found such a row, it recorded the **user_id**, **rating**, and **timestamp** listed in that row. In this case, it found 452 matching rows. 35 | # - The second **movie_id** in movies is 2. Again, Pandas did a search of ratings and found 131 matching rows. 36 | # - This process was repeated for all of the remaining rows in movies. 
37 | # 38 | # At the end of the process, the movie_ratings DataFrame is created, which contains the two columns from movies (**movie_id** and **title**) and the three other colums from ratings (**user_id**, **rating**, and **timestamp**). 39 | # 40 | # - **movie_id** 1 and its **title** are listed 452 times, next to the **user_id**, **rating**, and **timestamp** for each of the 452 matching ratings. 41 | # - **movie_id** 2 and its **title** are listed 131 times, next to the **user_id**, **rating**, and **timestamp** for each of the 131 matching ratings. 42 | # - And so on, for every movie in the dataset. 43 | 44 | print movies.shape 45 | print ratings.shape 46 | print movie_ratings.shape 47 | 48 | 49 | # Notice the shapes of the three DataFrames: 50 | # 51 | # - There are 1682 rows in the movies DataFrame. 52 | # - There are 100000 rows in the ratings DataFrame. 53 | # - The `merge` function resulted in a movie_ratings DataFrame with 100000 rows, because every row from ratings matched a row from movies. 54 | # - The movie_ratings DataFrame has 5 columns, namely the 2 columns from movies, plus the 4 columns from ratings, minus the 1 column in common. 55 | # 56 | # By default, the `merge` function joins the DataFrames using all column names that are in common (**movie_id**, in this case). The [documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) explains how you can override this behavior. 57 | 58 | # ## Four Types of Joins 59 | 60 | # There are actually four types of joins supported by the Pandas `merge` function. Here's how they are described by the documentation: 61 | # 62 | # - **inner:** use intersection of keys from both frames (SQL: inner join) 63 | # - **outer:** use union of keys from both frames (SQL: full outer join) 64 | # - **left:** use only keys from left frame (SQL: left outer join) 65 | # - **right:** use only keys from right frame (SQL: right outer join) 66 | # 67 | # The default is the "inner join", which was used when creating the movie_ratings DataFrame. 
68 | # 69 | # It's easiest to understand the different types by looking at some simple examples: 70 | 71 | # ### Example DataFrames A and B 72 | 73 | A = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num':[1, 2, 3]}) 74 | A 75 | 76 | 77 | B = pd.DataFrame({'color': ['green', 'yellow', 'pink'], 'size':['S', 'M', 'L']}) 78 | B 79 | 80 | 81 | # ### Inner join 82 | # 83 | # Only include observations found in both A and B: 84 | 85 | pd.merge(A, B, how='inner') 86 | 87 | 88 | # ### Outer join 89 | # 90 | # Include observations found in either A or B: 91 | 92 | pd.merge(A, B, how='outer') 93 | 94 | 95 | # ### Left join 96 | # 97 | # Include all observations found in A: 98 | 99 | pd.merge(A, B, how='left') 100 | 101 | 102 | # ### Right join 103 | # 104 | # Include all observations found in B: 105 | 106 | pd.merge(A, B, how='right') 107 | -------------------------------------------------------------------------------- /code/05_pandas_visualization_nb.py: -------------------------------------------------------------------------------- 1 | # # Visualization with Pandas (and Matplotlib) 2 | 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | 6 | # display plots in the notebook 7 | 8 | # increase default figure and font sizes for easier viewing 9 | plt.rcParams['figure.figsize'] = (8, 6) 10 | plt.rcParams['font.size'] = 14 11 | 12 | 13 | # read in the drinks data 14 | drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent'] 15 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv' 16 | drinks = pd.read_csv(url, header=0, names=drink_cols, na_filter=False) 17 | 18 | 19 | # ## Histogram: show the distribution of a numerical variable 20 | 21 | # sort the beer column and mentally split it into 3 groups 22 | drinks.beer.order().values 23 | 24 | 25 | # compare with histogram 26 | drinks.beer.plot(kind='hist', bins=3) 27 | 28 | 29 | # try more bins 30 | drinks.beer.plot(kind='hist', bins=20) 31 | 32 | 33 | # add title and labels 34 | drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings') 35 | plt.xlabel('Beer Servings') 36 | plt.ylabel('Frequency') 37 | 38 | 39 | # compare with density plot (smooth version of a histogram) 40 | drinks.beer.plot(kind='density', xlim=(0, 500)) 41 | 42 | 43 | # ## Scatter Plot: show the relationship between two numerical variables 44 | 45 | # select the beer and wine columns and sort by beer 46 | drinks[['beer', 'wine']].sort('beer').values 47 | 48 | 49 | # compare with scatter plot 50 | drinks.plot(kind='scatter', x='beer', y='wine') 51 | 52 | 53 | # add transparency 54 | drinks.plot(kind='scatter', x='beer', y='wine', alpha=0.3) 55 | 56 | 57 | # vary point color by spirit servings 58 | drinks.plot(kind='scatter', x='beer', y='wine', c='spirit', colormap='Blues') 59 | 60 | 61 | # scatter matrix of three numerical columns 62 | pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']]) 63 | 64 | 65 | # increase figure size 66 | pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8)) 67 | 68 | 69 | # ## Bar Plot: show a numerical comparison across different categories 70 | 71 | # count the number of countries in each continent 72 | drinks.continent.value_counts() 73 | 74 | 75 | # compare with bar plot 76 | drinks.continent.value_counts().plot(kind='bar') 77 | 78 | 79 | # calculate the mean alcohol amounts for each continent 80 | drinks.groupby('continent').mean() 81 | 82 | 83 | # side-by-side bar plots 84 | drinks.groupby('continent').mean().plot(kind='bar') 85 | 86 | 87 | # drop the 
liters column 88 | drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar') 89 | 90 | 91 | # stacked bar plots 92 | drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar', stacked=True) 93 | 94 | 95 | # ## Box Plot: show quartiles (and outliers) for one or more numerical variables 96 | # 97 | # **Five-number summary:** 98 | # 99 | # - min = minimum value 100 | # - 25% = first quartile (Q1) = median of the lower half of the data 101 | # - 50% = second quartile (Q2) = median of the data 102 | # - 75% = third quartile (Q3) = median of the upper half of the data 103 | # - max = maximum value 104 | # 105 | # (More useful than mean and standard deviation for describing skewed distributions) 106 | # 107 | # **Interquartile Range (IQR)** = Q3 - Q1 108 | # 109 | # **Outliers:** 110 | # 111 | # - below Q1 - 1.5 * IQR 112 | # - above Q3 + 1.5 * IQR 113 | 114 | # sort the spirit column 115 | drinks.spirit.order().values 116 | 117 | 118 | # show "five-number summary" for spirit 119 | drinks.spirit.describe() 120 | 121 | 122 | # compare with box plot 123 | drinks.spirit.plot(kind='box') 124 | 125 | 126 | # include multiple variables 127 | drinks.drop('liters', axis=1).plot(kind='box') 128 | 129 | 130 | # ## Line Plot: show the trend of a numerical variable over time 131 | 132 | # read in the ufo data 133 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/ufo.csv' 134 | ufo = pd.read_csv(url) 135 | ufo['Time'] = pd.to_datetime(ufo.Time) 136 | ufo['Year'] = ufo.Time.dt.year 137 | 138 | 139 | # count the number of ufo reports each year (and sort by year) 140 | ufo.Year.value_counts().sort_index() 141 | 142 | 143 | # compare with line plot 144 | ufo.Year.value_counts().sort_index().plot() 145 | 146 | 147 | # don't use a line plot when there is no logical ordering 148 | drinks.continent.value_counts().plot() 149 | 150 | 151 | # ## Grouped Box Plots: show one box plot for each group 152 | 153 | # reminder: box plot of beer servings 154 | drinks.beer.plot(kind='box') 155 | 156 | 157 | # box plot of beer servings grouped by continent 158 | drinks.boxplot(column='beer', by='continent') 159 | 160 | 161 | # box plot of all numeric columns grouped by continent 162 | drinks.boxplot(by='continent') 163 | 164 | 165 | # ## Grouped Histograms: show one histogram for each group 166 | 167 | # reminder: histogram of beer servings 168 | drinks.beer.plot(kind='hist') 169 | 170 | 171 | # histogram of beer servings grouped by continent 172 | drinks.hist(column='beer', by='continent') 173 | 174 | 175 | # share the x axes 176 | drinks.hist(column='beer', by='continent', sharex=True) 177 | 178 | 179 | # share the x and y axes 180 | drinks.hist(column='beer', by='continent', sharex=True, sharey=True) 181 | 182 | 183 | # change the layout 184 | drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3)) 185 | 186 | 187 | # ## Assorted Functionality 188 | 189 | # saving a plot to a file 190 | drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings') 191 | plt.xlabel('Beer Servings') 192 | plt.ylabel('Frequency') 193 | plt.savefig('beer_histogram.png') 194 | 195 | 196 | # list available plot styles 197 | plt.style.available 198 | 199 | 200 | # change to a different style 201 | plt.style.use('ggplot') 202 | -------------------------------------------------------------------------------- /code/06_human_learning_iris_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise: "Human learning" with iris data 
2 | # 3 | # **Question:** Can you predict the species of an iris using petal and sepal measurements? 4 | # 5 | # 1. Read the iris data into a Pandas DataFrame, including column names. 6 | # 2. Gather some basic information about the data. 7 | # 3. Use sorting, split-apply-combine, and/or visualization to look for differences between species. 8 | # 4. Write down a set of rules that could be used to predict species based on iris measurements. 9 | # 10 | # **BONUS:** Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions. 11 | 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | 15 | # display plots in the notebook 16 | 17 | # increase default figure and font sizes for easier viewing 18 | plt.rcParams['figure.figsize'] = (8, 6) 19 | plt.rcParams['font.size'] = 14 20 | 21 | 22 | # ## Task 1 23 | # 24 | # Read the iris data into a pandas DataFrame, including column names. 25 | 26 | # define a list of column names (as strings) 27 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 28 | 29 | # define the URL from which to retrieve the data (as a string) 30 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 31 | 32 | # retrieve the CSV file and add the column names 33 | iris = pd.read_csv(url, header=None, names=col_names) 34 | 35 | 36 | # ## Task 2 37 | # 38 | # Gather some basic information about the data. 39 | 40 | iris.shape 41 | 42 | 43 | iris.head() 44 | 45 | 46 | iris.dtypes 47 | 48 | 49 | iris.describe() 50 | 51 | 52 | iris.species.value_counts() 53 | 54 | 55 | iris.isnull().sum() 56 | 57 | 58 | # ## Task 3 59 | # 60 | # Use sorting, split-apply-combine, and/or visualization to look for differences between species. 61 | 62 | # ### sorting 63 | 64 | # sort the DataFrame by petal_width and display the NumPy array 65 | print iris.sort('petal_width').values 66 | 67 | 68 | # ### split-apply-combine 69 | 70 | # mean of sepal_length grouped by species 71 | iris.groupby('species').sepal_length.mean() 72 | 73 | 74 | # mean of all numeric columns grouped by species 75 | iris.groupby('species').mean() 76 | 77 | 78 | # description of all numeric columns grouped by species 79 | iris.groupby('species').describe() 80 | 81 | 82 | # ### visualization 83 | 84 | # histogram of petal_width grouped by species 85 | iris.hist(column='petal_width', by='species', sharex=True) 86 | 87 | 88 | # box plot of petal_width grouped by species 89 | iris.boxplot(column='petal_width', by='species') 90 | 91 | 92 | # box plot of all numeric columns grouped by species 93 | iris.boxplot(by='species') 94 | 95 | 96 | # map species to a numeric value so that plots can be colored by species 97 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}) 98 | 99 | # alternative method 100 | iris['species_num'] = iris.species.factorize()[0] 101 | 102 | 103 | # scatter plot of petal_length vs petal_width colored by species 104 | iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap='brg') 105 | 106 | 107 | # scatter matrix of all features colored by species 108 | pd.scatter_matrix(iris.drop('species_num', axis=1), c=iris.species_num, figsize=(12, 10)) 109 | 110 | 111 | # ## Task 4 112 | # 113 | # Write down a set of rules that could be used to predict species based on iris measurements. 
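# Before writing the rules down, a quick sanity check can help: compare a candidate threshold against the actual species labels. The snippet below is a minimal sketch (an addition, not part of the original exercise) using an example threshold on petal_length.

# count how many irises of each species have a petal_length below 2 (an example threshold)
(iris.petal_length < 2).groupby(iris.species).sum()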
114 | 115 | # define a new feature that represents petal area ("feature engineering") 116 | iris['petal_area'] = iris.petal_length * iris.petal_width 117 | 118 | 119 | # description of petal_area grouped by species 120 | iris.groupby('species').petal_area.describe().unstack() 121 | 122 | 123 | # box plot of petal_area grouped by species 124 | iris.boxplot(column='petal_area', by='species') 125 | 126 | 127 | # only show irises with a petal_area between 7 and 9 128 | iris[(iris.petal_area > 7) & (iris.petal_area < 9)].sort('petal_area') 129 | 130 | 131 | # My set of rules for predicting species: 132 | # 133 | # - If petal_area is less than 2, predict **setosa**. 134 | # - Else if petal_area is less than 7.4, predict **versicolor**. 135 | # - Otherwise, predict **virginica**. 136 | 137 | # ## Bonus 138 | # 139 | # Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions. 140 | 141 | # given a row of data, return a predicted species_num (0/1/2) 142 | def classify_iris(row): 143 | 144 | # calculate the petal_area 145 | petal_area = row[2] * row[3] 146 | 147 | # predict the species based on the rules above 148 | if petal_area < 2: 149 | prediction = 'setosa' 150 | elif petal_area < 7.4: 151 | prediction = 'versicolor' 152 | else: 153 | prediction = 'virginica' 154 | 155 | # map the species name to a numeric value 156 | species_to_num = {'setosa':0, 'versicolor':1, 'virginica':2} 157 | 158 | # return that value 159 | return species_to_num[prediction] 160 | 161 | 162 | # print the first row 163 | iris.iloc[0, :] 164 | 165 | 166 | # print the last row 167 | iris.iloc[149, :] 168 | 169 | 170 | # test the function on the first and last rows 171 | print classify_iris(iris.iloc[0, :]) 172 | print classify_iris(iris.iloc[149, :]) 173 | 174 | 175 | # make predictions for all rows and store them in the DataFrame 176 | iris['prediction'] = [classify_iris(row) for index, row in iris.iterrows()] 177 | 178 | 179 | # calculate the percentage of correct predictions 180 | sum(iris.species_num == iris.prediction) / 150. 181 | -------------------------------------------------------------------------------- /code/07_api.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Getting Data from APIs 3 | 4 | What is an API? 5 | - Application Programming Interface 6 | - Structured way to expose specific functionality and data access to users 7 | - Web APIs usually follow the "REST" standard 8 | 9 | How to interact with a REST API: 10 | - Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response" 11 | - Most relevant request method for us is GET (other methods: POST, PUT, DELETE) 12 | - Response is often JSON format 13 | - Web console is sometimes available (allows you to explore an API) 14 | ''' 15 | 16 | # read IMDb data into a DataFrame: we want a year column! 
17 | import pandas as pd 18 | movies = pd.read_csv('imdb_1000.csv') 19 | movies.head() 20 | 21 | # use requests library to interact with a URL 22 | import requests 23 | r = requests.get('http://www.omdbapi.com/?t=the shawshank redemption&r=json&type=movie') 24 | 25 | # check the status: 200 means success, 4xx means error 26 | r.status_code 27 | 28 | # view the raw response text 29 | r.text 30 | 31 | # decode the JSON response body into a dictionary 32 | r.json() 33 | 34 | # extracting the year from the dictionary 35 | r.json()['Year'] 36 | 37 | # what happens if the movie name is not recognized? 38 | r = requests.get('http://www.omdbapi.com/?t=blahblahblah&r=json&type=movie') 39 | r.status_code 40 | r.json() 41 | 42 | # define a function to return the year 43 | def get_movie_year(title): 44 | r = requests.get('http://www.omdbapi.com/?t=' + title + '&r=json&type=movie') 45 | info = r.json() 46 | if info['Response'] == 'True': 47 | return int(info['Year']) 48 | else: 49 | return None 50 | 51 | # test the function 52 | get_movie_year('The Shawshank Redemption') 53 | get_movie_year('blahblahblah') 54 | 55 | # create a smaller DataFrame for testing 56 | top_movies = movies.head().copy() 57 | 58 | # write a for loop to build a list of years 59 | from time import sleep 60 | years = [] 61 | for title in top_movies.title: 62 | years.append(get_movie_year(title)) 63 | sleep(1) 64 | 65 | # check that the DataFrame and the list of years are the same length 66 | assert(len(top_movies) == len(years)) 67 | 68 | # save that list as a new column 69 | top_movies['year'] = years 70 | 71 | ''' 72 | Bonus content: Updating the DataFrame as part of a loop 73 | ''' 74 | 75 | # enumerate allows you to access the item location while iterating 76 | letters = ['a', 'b', 'c'] 77 | for index, letter in enumerate(letters): 78 | print index, letter 79 | 80 | # iterrows method for DataFrames is similar 81 | for index, row in top_movies.iterrows(): 82 | print index, row.title 83 | 84 | # create a new column and set a default value 85 | movies['year'] = -1 86 | 87 | # loc method allows you to access a DataFrame element by 'label' 88 | movies.loc[0, 'year'] = 1994 89 | 90 | # write a for loop to update the year for the first three movies 91 | for index, row in movies.iterrows(): 92 | if index < 3: 93 | movies.loc[index, 'year'] = get_movie_year(row.title) 94 | sleep(1) 95 | else: 96 | break 97 | 98 | ''' 99 | Other considerations when accessing APIs: 100 | - Most APIs require you to have an access key (which you should store outside your code) 101 | - Most APIs limit the number of API calls you can make (per day, hour, minute, etc.) 
102 | - Not all APIs are free 103 | - Not all APIs are well-documented 104 | - Pay attention to the API version 105 | 106 | Python wrapper is another option for accessing an API: 107 | - Set of functions that "wrap" the API code for ease of use 108 | - Potentially simplifies your code 109 | - But, wrapper could have bugs or be out-of-date or poorly documented 110 | ''' 111 | -------------------------------------------------------------------------------- /code/08_bias_variance_nb.py: -------------------------------------------------------------------------------- 1 | # # Exploring the Bias-Variance Tradeoff 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import seaborn as sns 6 | 7 | # allow plots to appear in the notebook 8 | 9 | 10 | # ## Brain and body weight 11 | 12 | # This is a [dataset](http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt) of the average weight of the body and the brain for 62 mammal species. Let's read it into pandas and take a quick look: 13 | 14 | url = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt' 15 | col_names = ['id', 'brain', 'body'] 16 | mammals = pd.read_table(url, sep='\s+', skiprows=33, names=col_names, index_col='id') 17 | mammals.head() 18 | 19 | 20 | mammals.describe() 21 | 22 | 23 | # We're going to focus on a smaller subset in which the body weight is less than 200: 24 | 25 | # only keep rows in which the body weight is less than 200 26 | mammals = mammals[mammals.body < 200] 27 | mammals.shape 28 | 29 | 30 | # We're now going to pretend that there are only 51 mammal species in existence. In other words, we are pretending that this is the entire dataset of brain and body weights for **every known mammal species**. 31 | # 32 | # Let's create a scatterplot (using [Seaborn](http://stanford.edu/~mwaskom/software/seaborn/)) to visualize the relationship between brain and body weight: 33 | 34 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, fit_reg=False) 35 | sns.plt.xlim(-10, 200) 36 | sns.plt.ylim(-10, 250) 37 | 38 | 39 | # There appears to be a relationship between brain and body weight for mammals. 40 | 41 | # ## Making a prediction 42 | 43 | # Now let's pretend that a **new mammal species** is discovered. We measure the body weight of every member of this species that we can find, and calculate an **average body weight of 100**. We want to **predict the average brain weight** of this species (rather than measuring it directly). How might we do this? 44 | 45 | sns.lmplot(x='body', y='brain', data=mammals, ci=None) 46 | sns.plt.xlim(-10, 200) 47 | sns.plt.ylim(-10, 250) 48 | 49 | 50 | # We drew a straight line that appears to best capture the relationship between brain and body weight. So, we might predict that our new species has a brain weight of about 45, since that's the approximate y value when x=100. 51 | # 52 | # This is known as a "linear model" or a "linear regression model", which we will study in a future class. 53 | 54 | # ## Making a prediction from a sample 55 | 56 | # Earlier, I said that this dataset contained every known mammal species. That's very convenient, but **in the real world, all you ever have is a sample of data**. A more realistic situation would be to only have brain and body weights for (let's say) half of the 51 known mammals. 
57 | # 58 | # When that new mammal species (with a body weight of 100) is discovered, we still want to make an accurate prediction for the brain weight, but this task might be more difficult since we don't have all of the data that we would ideally like to have. 59 | # 60 | # Let's simulate this situation by assigning each of the 51 observations to **either universe 1 or universe 2**: 61 | 62 | # set a random seed for reproducibility 63 | np.random.seed(12345) 64 | 65 | # randomly assign every observation to either universe 1 or universe 2 66 | mammals['universe'] = np.random.randint(1, 3, len(mammals)) 67 | mammals.head() 68 | 69 | 70 | # **Important:** We only live in one of the two universes. Both universes have 51 known mammal species, but each universe knows the brain and body weight for different species. 71 | # 72 | # We can now tell Seaborn to create two plots, in which the left plot only uses the data from **universe 1** and the right plot only uses the data from **universe 2**: 73 | 74 | # col='universe' subsets the data by universe and creates two separate plots 75 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe') 76 | sns.plt.xlim(-10, 200) 77 | sns.plt.ylim(-10, 250) 78 | 79 | 80 | # The line looks pretty similar between the two plots, despite the fact that they used separate samples of data. In both cases, we would predict a brain weight of about 45. 81 | # 82 | # It's easier to see the degree of similarity by placing them on the same plot: 83 | 84 | # hue='universe' subsets the data by universe and creates a single plot 85 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, hue='universe') 86 | sns.plt.xlim(-10, 200) 87 | sns.plt.ylim(-10, 250) 88 | 89 | 90 | # What was the point of this exercise? This was a visual demonstration of a high bias, low variance model: 91 | # 92 | # - It's **high bias** because it doesn't fit the data particularly well. 93 | # - It's **low variance** because it doesn't change much depending on which observations happen to be available in that universe. 94 | 95 | # ## Let's try something completely different 96 | 97 | # What would a **low bias, high variance** model look like? Let's try polynomial regression, with an eighth order polynomial: 98 | 99 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe', order=8) 100 | sns.plt.xlim(-10, 200) 101 | sns.plt.ylim(-10, 250) 102 | 103 | 104 | # - It's **low bias** because the models match the data quite well! 105 | # - It's **high variance** because the models are widely different depending on which observations happen to be available in that universe. (For a body weight of 100, the brain weight prediction would be 40 in one universe and 0 in the other universe!) 106 | 107 | # ## Can we find a middle ground? 108 | 109 | # Perhaps we can create a model that has **less bias than the linear model**, and **less variance than the eighth order polynomial**? 110 | # 111 | # Let's try a second order polynomial instead: 112 | 113 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe', order=2) 114 | sns.plt.xlim(-10, 200) 115 | sns.plt.ylim(-10, 250) 116 | 117 | 118 | # This seems better. In both the left and right plots, **it fits the data pretty well, but not too well**. 119 | # 120 | # This is the essence of the **bias-variance tradeoff**: You are seeking a model that appropriately balances bias and variance, and thus will generalize to new data (known as "out-of-sample" data). 
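# [Added sketch, not part of the original notebook] A quick numeric companion to the plots above:
# fit polynomials of order 1, 2, and 8 to each universe separately with np.polyfit, and compare
# the predicted brain weight for a body weight of 100. The exact numbers depend on the random
# seed used earlier, so treat this as illustrative code rather than expected output.
for order in [1, 2, 8]:
    predictions = []
    for universe in [1, 2]:
        subset = mammals[mammals.universe == universe]
        coefs = np.polyfit(subset.body, subset.brain, deg=order)
        predictions.append(round(np.polyval(coefs, 100), 1))
    print 'order', order, 'predictions by universe:', predictions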
121 | -------------------------------------------------------------------------------- /code/08_nba_knn_nb.py: -------------------------------------------------------------------------------- 1 | # # KNN exercise with NBA player data 2 | 3 | # ## Introduction 4 | # 5 | # - NBA player statistics from 2014-2015 (partial season): [data](https://github.com/justmarkham/DAT4-students/blob/master/kerry/Final/NBA_players_2015.csv), [data dictionary](https://github.com/justmarkham/DAT-project-examples/blob/master/pdf/nba_paper.pdf) 6 | # - **Goal:** Predict player position using assists, steals, blocks, turnovers, and personal fouls 7 | 8 | # ## Step 1: Read the data into Pandas 9 | 10 | # read the data into a DataFrame 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT4-students/master/kerry/Final/NBA_players_2015.csv' 13 | nba = pd.read_csv(url, index_col=0) 14 | 15 | 16 | # examine the columns 17 | nba.columns 18 | 19 | 20 | # examine the positions 21 | nba.pos.value_counts() 22 | 23 | 24 | # ## Step 2: Create X and y 25 | # 26 | # Use the following features: assists, steals, blocks, turnovers, personal fouls 27 | 28 | # map positions to numbers 29 | nba['pos_num'] = nba.pos.map({'C':0, 'F':1, 'G':2}) 30 | 31 | 32 | # create feature matrix (X) 33 | feature_cols = ['ast', 'stl', 'blk', 'tov', 'pf'] 34 | X = nba[feature_cols] 35 | 36 | 37 | # alternative way to create X 38 | X = nba.loc[:, 'ast':'pf'] 39 | 40 | 41 | # create response vector (y) 42 | y = nba.pos_num 43 | 44 | 45 | # ## Step 3: Train a KNN model (K=5) 46 | 47 | # import class 48 | from sklearn.neighbors import KNeighborsClassifier 49 | 50 | 51 | # instantiate with K=5 52 | knn = KNeighborsClassifier(n_neighbors=5) 53 | 54 | 55 | # fit with data 56 | knn.fit(X, y) 57 | 58 | 59 | # ## Step 4: Predict player position and calculate predicted probability of each position 60 | # 61 | # Predict for a player with these statistics: 1 assist, 1 steal, 0 blocks, 1 turnover, 2 personal fouls 62 | 63 | # create a list to represent a player 64 | player = [1, 1, 0, 1, 2] 65 | 66 | 67 | # make a prediction 68 | knn.predict(player) 69 | 70 | 71 | # calculate predicted probabilities 72 | knn.predict_proba(player) 73 | 74 | 75 | # ## Step 5: Repeat steps 3 and 4 using K=50 76 | 77 | # repeat for K=50 78 | knn = KNeighborsClassifier(n_neighbors=50) 79 | knn.fit(X, y) 80 | knn.predict(player) 81 | 82 | 83 | # calculate predicted probabilities 84 | knn.predict_proba(player) 85 | 86 | 87 | # ## Bonus: Explore the features to decide which ones are predictive 88 | 89 | # allow plots to appear in the notebook 90 | import matplotlib.pyplot as plt 91 | 92 | # increase default figure and font sizes for easier viewing 93 | plt.rcParams['figure.figsize'] = (6, 4) 94 | plt.rcParams['font.size'] = 14 95 | 96 | 97 | # description of assists grouped by position 98 | nba.groupby('pos').ast.describe().unstack() 99 | 100 | 101 | # box plot of assists grouped by position 102 | nba.boxplot(column='ast', by='pos') 103 | 104 | 105 | # histogram of assists grouped by position 106 | nba.hist(column='ast', by='pos', sharex=True) 107 | -------------------------------------------------------------------------------- /code/08_pandas_review_nb.py: -------------------------------------------------------------------------------- 1 | # # Pandas Review 2 | 3 | import pandas as pd 4 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv' 5 | df = pd.read_csv(url).head(5).copy() 6 | df 7 | 8 | 9 | # For each of the 
following lines of code: 10 | # 11 | # - What the **data type** of the object that is returned? 12 | # - What is the **shape** of the object that is returned? 13 | # 14 | # 15 | # 1. `df` 16 | # 2. `df.continent` 17 | # 3. `df['continent']` 18 | # 4. `df[['country', 'continent']]` 19 | # 5. `df[[False, True, False, True, False]]` 20 | 21 | # ## Question 1 22 | 23 | df 24 | 25 | 26 | print type(df) 27 | print df.shape 28 | 29 | 30 | # ## Question 2 31 | 32 | df.continent 33 | 34 | 35 | print type(df.continent) 36 | print df.continent.shape 37 | 38 | 39 | # ## Question 3 40 | 41 | df['continent'] 42 | 43 | 44 | print type(df['continent']) 45 | print df['continent'].shape 46 | 47 | 48 | # ## Question 4 49 | 50 | df[['country', 'continent']] 51 | 52 | 53 | print type(df[['country', 'continent']]) 54 | print df[['country', 'continent']].shape 55 | 56 | 57 | # equivalent 58 | cols = ['country', 'continent'] 59 | df[cols] 60 | 61 | 62 | # ## Question 5 63 | 64 | df[[False, True, False, True, False]] 65 | 66 | 67 | print type(df[[False, True, False, True, False]]) 68 | print df[[False, True, False, True, False]].shape 69 | 70 | 71 | # equivalent 72 | df[df.continent=='EU'] 73 | -------------------------------------------------------------------------------- /code/10_yelp_votes_homework_nb.py: -------------------------------------------------------------------------------- 1 | # # Linear regression homework with Yelp votes 2 | 3 | # ## Introduction 4 | # 5 | # This assignment uses a small subset of the data from Kaggle's [Yelp Business Rating Prediction](https://www.kaggle.com/c/yelp-recsys-2013) competition. 6 | # 7 | # **Description of the data:** 8 | # 9 | # - `yelp.json` is the original format of the file. `yelp.csv` contains the same data, in a more convenient format. Both of the files are in this repo, so there is no need to download the data from the Kaggle website. 10 | # - Each observation in this dataset is a review of a particular business by a particular user. 11 | # - The "stars" column is the number of stars (1 through 5) assigned by the reviewer to the business. (Higher stars is better.) In other words, it is the rating of the business by the person who wrote the review. 12 | # - The "cool" column is the number of "cool" votes this review received from other Yelp users. All reviews start with 0 "cool" votes, and there is no limit to how many "cool" votes a review can receive. In other words, it is a rating of the review itself, not a rating of the business. 13 | # - The "useful" and "funny" columns are similar to the "cool" column. 14 | 15 | # ## Task 1 16 | # 17 | # Read `yelp.csv` into a DataFrame. 18 | 19 | # access yelp.csv using a relative path 20 | import pandas as pd 21 | yelp = pd.read_csv('../data/yelp.csv') 22 | yelp.head(1) 23 | 24 | 25 | # ## Task 1 (Bonus) 26 | # 27 | # Ignore the `yelp.csv` file, and construct this DataFrame yourself from `yelp.json`. This involves reading the data into Python, decoding the JSON, converting it to a DataFrame, and adding individual columns for each of the vote types. 
28 | 29 | # read the data from yelp.json into a list of rows 30 | # each row is decoded into a dictionary using using json.loads() 31 | import json 32 | with open('../data/yelp.json', 'rU') as f: 33 | data = [json.loads(row) for row in f] 34 | 35 | 36 | # show the first review 37 | data[0] 38 | 39 | 40 | # convert the list of dictionaries to a DataFrame 41 | yelp = pd.DataFrame(data) 42 | yelp.head(1) 43 | 44 | 45 | # add DataFrame columns for cool, useful, and funny 46 | yelp['cool'] = [row['votes']['cool'] for row in data] 47 | yelp['useful'] = [row['votes']['useful'] for row in data] 48 | yelp['funny'] = [row['votes']['funny'] for row in data] 49 | 50 | 51 | # drop the votes column 52 | yelp.drop('votes', axis=1, inplace=True) 53 | yelp.head(1) 54 | 55 | 56 | # ## Task 2 57 | # 58 | # Explore the relationship between each of the vote types (cool/useful/funny) and the number of stars. 59 | 60 | # treat stars as a categorical variable and look for differences between groups 61 | yelp.groupby('stars').mean() 62 | 63 | 64 | # correlation matrix 65 | import seaborn as sns 66 | sns.heatmap(yelp.corr()) 67 | 68 | 69 | # multiple scatter plots 70 | sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'], y_vars='stars', size=6, aspect=0.7, kind='reg') 71 | 72 | 73 | # ## Task 3 74 | # 75 | # Define cool/useful/funny as the features, and stars as the response. 76 | 77 | feature_cols = ['cool', 'useful', 'funny'] 78 | X = yelp[feature_cols] 79 | y = yelp.stars 80 | 81 | 82 | # ## Task 4 83 | # 84 | # Fit a linear regression model and interpret the coefficients. Do the coefficients make intuitive sense to you? Explore the Yelp website to see if you detect similar trends. 85 | 86 | from sklearn.linear_model import LinearRegression 87 | linreg = LinearRegression() 88 | linreg.fit(X, y) 89 | zip(feature_cols, linreg.coef_) 90 | 91 | 92 | # ## Task 5 93 | # 94 | # Evaluate the model by splitting it into training and testing sets and computing the RMSE. Does the RMSE make intuitive sense to you? 95 | 96 | from sklearn.cross_validation import train_test_split 97 | from sklearn import metrics 98 | import numpy as np 99 | 100 | 101 | # define a function that accepts a list of features and returns testing RMSE 102 | def train_test_rmse(feature_cols): 103 | X = yelp[feature_cols] 104 | y = yelp.stars 105 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 106 | linreg = LinearRegression() 107 | linreg.fit(X_train, y_train) 108 | y_pred = linreg.predict(X_test) 109 | return np.sqrt(metrics.mean_squared_error(y_test, y_pred)) 110 | 111 | 112 | # calculate RMSE with all three features 113 | train_test_rmse(['cool', 'useful', 'funny']) 114 | 115 | 116 | # ## Task 6 117 | # 118 | # Try removing some of the features and see if the RMSE improves. 119 | 120 | print train_test_rmse(['cool', 'useful']) 121 | print train_test_rmse(['cool', 'funny']) 122 | print train_test_rmse(['useful', 'funny']) 123 | 124 | 125 | # ## Task 7 (Bonus) 126 | # 127 | # Think of some new features you could create from the existing data that might be predictive of the response. Figure out how to create those features in Pandas, add them to your model, and see if the RMSE improves. 
128 | 129 | # new feature: review length (number of characters) 130 | yelp['length'] = yelp.text.apply(len) 131 | 132 | 133 | # new features: whether or not the review contains 'love' or 'hate' 134 | yelp['love'] = yelp.text.str.contains('love', case=False).astype(int) 135 | yelp['hate'] = yelp.text.str.contains('hate', case=False).astype(int) 136 | 137 | 138 | # add new features to the model and calculate RMSE 139 | train_test_rmse(['cool', 'useful', 'funny', 'length', 'love', 'hate']) 140 | 141 | 142 | # ## Task 8 (Bonus) 143 | # 144 | # Compare your best RMSE on the testing set with the RMSE for the "null model", which is the model that ignores all features and simply predicts the mean response value in the testing set. 145 | 146 | # split the data (outside of the function) 147 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 148 | 149 | 150 | # create a NumPy array with the same shape as y_test 151 | y_null = np.zeros_like(y_test, dtype=float) 152 | 153 | 154 | # fill the array with the mean of y_test 155 | y_null.fill(y_test.mean()) 156 | 157 | 158 | # calculate null RMSE 159 | print np.sqrt(metrics.mean_squared_error(y_test, y_null)) 160 | 161 | 162 | # ## Task 9 (Bonus) 163 | # 164 | # Instead of treating this as a regression problem, treat it as a classification problem and see what testing accuracy you can achieve with KNN. 165 | 166 | # import and instantiate KNN 167 | from sklearn.neighbors import KNeighborsClassifier 168 | knn = KNeighborsClassifier(n_neighbors=50) 169 | 170 | 171 | # classification models will automatically treat the response value (1/2/3/4/5) as unordered categories 172 | knn.fit(X_train, y_train) 173 | y_pred_class = knn.predict(X_test) 174 | print metrics.accuracy_score(y_test, y_pred_class) 175 | 176 | 177 | # ## Task 10 (Bonus) 178 | # 179 | # Figure out how to use linear regression for classification, and compare its classification accuracy with KNN's accuracy. 180 | 181 | # use linear regression to make continuous predictions 182 | linreg = LinearRegression() 183 | linreg.fit(X_train, y_train) 184 | y_pred = linreg.predict(X_test) 185 | 186 | 187 | # round its predictions to the nearest integer 188 | y_pred_class = y_pred.round() 189 | 190 | 191 | # calculate classification accuracy of the rounded predictions 192 | print metrics.accuracy_score(y_test, y_pred_class) 193 | -------------------------------------------------------------------------------- /code/12_e_log_examples_nb.py: -------------------------------------------------------------------------------- 1 | # # Exponential functions and logarithms 2 | 3 | import math 4 | import numpy as np 5 | 6 | 7 | # ## Exponential functions 8 | 9 | # What is **e**? It is simply a number (known as Euler's number): 10 | 11 | math.e 12 | 13 | 14 | # **e** is a significant number, because it is the base rate of growth shared by all continually growing processes. 15 | # 16 | # For example, if I have **10 dollars**, and it grows 100% in 1 year (compounding continuously), I end up with **10\*e^1 dollars**: 17 | 18 | # 100% growth for 1 year 19 | 10 * np.exp(1) 20 | 21 | 22 | # 100% growth for 2 years 23 | 10 * np.exp(2) 24 | 25 | 26 | # Side note: When e is raised to a power, it is known as **the exponential function**. Technically, any number can be the base, and it would still be known as **an exponential function** (such as 2^5). But in our context, the base of the exponential function is assumed to be e. 27 | # 28 | # Anyway, what if I only have 20% growth instead of 100% growth? 
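# [Added aside, not part of the original notebook] Where does np.exp come from? e**r is the
# limit of compounding a growth rate r more and more frequently: (1 + r/n)**n approaches e**r
# as n gets large. A quick numeric check for r = 0.20 (compare with the cell below):
10 * (1 + 0.20 / 100000) ** 100000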
29 | 30 | # 20% growth for 1 year 31 | 10 * np.exp(0.20) 32 | 33 | 34 | # 20% growth for 2 years 35 | 10 * np.exp(0.20 * 2) 36 | 37 | 38 | # ## Logarithms 39 | 40 | # What is the **(natural) logarithm**? It gives you the time needed to reach a certain level of growth. For example, if I want growth by a factor of 2.718, it will take me 1 unit of time (assuming a 100% growth rate): 41 | 42 | # time needed to grow 1 unit to 2.718 units 43 | np.log(2.718) 44 | 45 | 46 | # If I want growth by a factor of 7.389, it will take me 2 units of time: 47 | 48 | # time needed to grow 1 unit to 7.389 units 49 | np.log(7.389) 50 | 51 | 52 | # If I want growth by a factor of 1, it will take me 0 units of time: 53 | 54 | # time needed to grow 1 unit to 1 unit 55 | np.log(1) 56 | 57 | 58 | # If I want growth by a factor of 0.5, it will take me -0.693 units of time (which is like looking back in time): 59 | 60 | # time needed to grow 1 unit to 0.5 units 61 | np.log(0.5) 62 | 63 | 64 | # ## Connecting the concepts 65 | 66 | # As you can see, the exponential function and the natural logarithm are **inverses** of one another: 67 | 68 | np.log(np.exp(5)) 69 | 70 | 71 | np.exp(np.log(5)) 72 | -------------------------------------------------------------------------------- /code/12_titanic_confusion_nb.py: -------------------------------------------------------------------------------- 1 | # # Logistic regression exercise with Titanic data 2 | 3 | # ## Introduction 4 | # 5 | # - Data from Kaggle's Titanic competition: [data](https://github.com/justmarkham/DAT8/blob/master/data/titanic.csv), [data dictionary](https://www.kaggle.com/c/titanic/data) 6 | # - **Goal**: Predict survival based on passenger characteristics 7 | # - `titanic.csv` is already in our repo, so there is no need to download the data from the Kaggle website 8 | 9 | # ## Step 1: Read the data into Pandas 10 | 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv' 13 | titanic = pd.read_csv(url, index_col='PassengerId') 14 | titanic.head() 15 | 16 | 17 | # ## Step 2: Create X and y 18 | # 19 | # Define **Pclass** and **Parch** as the features, and **Survived** as the response. 20 | 21 | feature_cols = ['Pclass', 'Parch'] 22 | X = titanic[feature_cols] 23 | y = titanic.Survived 24 | 25 | 26 | # ## Step 3: Split the data into training and testing sets 27 | 28 | from sklearn.cross_validation import train_test_split 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 30 | 31 | 32 | # ## Step 4: Fit a logistic regression model and examine the coefficients 33 | # 34 | # Confirm that the coefficients make intuitive sense. 
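# [Added note, not part of the original exercise] One way to check whether a coefficient "makes
# intuitive sense": each coefficient is the change in the log-odds of survival for a one-unit
# increase in that feature (holding the other feature constant), so exponentiating it with
# np.exp() gives the multiplicative change in the odds. For example, a hypothetical coefficient
# of -0.7 would mean that each additional unit multiplies the odds of survival by about 0.50.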
35 | 36 | from sklearn.linear_model import LogisticRegression 37 | logreg = LogisticRegression(C=1e9) 38 | logreg.fit(X_train, y_train) 39 | zip(feature_cols, logreg.coef_[0]) 40 | 41 | 42 | # ## Step 5: Make predictions on the testing set and calculate the accuracy 43 | 44 | # class predictions (not predicted probabilities) 45 | y_pred_class = logreg.predict(X_test) 46 | 47 | 48 | # calculate classification accuracy 49 | from sklearn import metrics 50 | print metrics.accuracy_score(y_test, y_pred_class) 51 | 52 | 53 | # ## Step 6: Compare your testing accuracy to the null accuracy 54 | 55 | # this works regardless of the number of classes 56 | y_test.value_counts().head(1) / len(y_test) 57 | 58 | 59 | # this only works for binary classification problems coded as 0/1 60 | max(y_test.mean(), 1 - y_test.mean()) 61 | 62 | 63 | # # Confusion matrix of Titanic predictions 64 | 65 | # print confusion matrix 66 | print metrics.confusion_matrix(y_test, y_pred_class) 67 | 68 | 69 | # save confusion matrix and slice into four pieces 70 | confusion = metrics.confusion_matrix(y_test, y_pred_class) 71 | TP = confusion[1][1] 72 | TN = confusion[0][0] 73 | FP = confusion[0][1] 74 | FN = confusion[1][0] 75 | 76 | 77 | print 'True Positives:', TP 78 | print 'True Negatives:', TN 79 | print 'False Positives:', FP 80 | print 'False Negatives:', FN 81 | 82 | 83 | # calculate the sensitivity 84 | print TP / float(TP + FN) 85 | print 44 / float(44 + 51) 86 | 87 | 88 | # calculate the specificity 89 | print TN / float(TN + FP) 90 | print 105 / float(105 + 23) 91 | 92 | 93 | # store the predicted probabilities 94 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 95 | 96 | 97 | # histogram of predicted probabilities 98 | import matplotlib.pyplot as plt 99 | plt.hist(y_pred_prob) 100 | plt.xlim(0, 1) 101 | plt.xlabel('Predicted probability of survival') 102 | plt.ylabel('Frequency') 103 | 104 | 105 | # increase sensitivity by lowering the threshold for predicting survival 106 | import numpy as np 107 | y_pred_class = np.where(y_pred_prob > 0.3, 1, 0) 108 | 109 | 110 | # old confusion matrix 111 | print confusion 112 | 113 | 114 | # new confusion matrix 115 | print metrics.confusion_matrix(y_test, y_pred_class) 116 | 117 | 118 | # new sensitivity (higher than before) 119 | print 63 / float(63 + 32) 120 | 121 | 122 | # new specificity (lower than before) 123 | print 72 / float(72 + 56) 124 | -------------------------------------------------------------------------------- /code/13_advanced_model_evaluation_nb.py: -------------------------------------------------------------------------------- 1 | # # Data Preparation and Advanced Model Evaluation 2 | 3 | # ## Agenda 4 | # 5 | # **Data preparation** 6 | # 7 | # - Handling missing values 8 | # - Handling categorical features (review) 9 | # 10 | # **Advanced model evaluation** 11 | # 12 | # - ROC curves and AUC 13 | # - Bonus: ROC curve is only sensitive to rank order of predicted probabilities 14 | # - Cross-validation 15 | 16 | # ## Part 1: Handling missing values 17 | 18 | # scikit-learn models expect that all values are **numeric** and **hold meaning**. Thus, missing values are not allowed by scikit-learn. 
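# [Added aside, not part of the original lesson] Besides the pandas strategies shown below
# (dropna and fillna), scikit-learn itself provides an Imputer transformer that fills in missing
# values and can be used inside a modeling pipeline. A minimal sketch with made-up data:
import numpy as np
from sklearn.preprocessing import Imputer
imp = Imputer(strategy='median')
imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]])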
19 | 20 | # read the Titanic data 21 | import pandas as pd 22 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv' 23 | titanic = pd.read_csv(url, index_col='PassengerId') 24 | titanic.shape 25 | 26 | 27 | # check for missing values 28 | titanic.isnull().sum() 29 | 30 | 31 | # One possible strategy is to **drop missing values**: 32 | 33 | # drop rows with any missing values 34 | titanic.dropna().shape 35 | 36 | 37 | # drop rows where Age is missing 38 | titanic[titanic.Age.notnull()].shape 39 | 40 | 41 | # Sometimes a better strategy is to **impute missing values**: 42 | 43 | # mean Age 44 | titanic.Age.mean() 45 | 46 | 47 | # median Age 48 | titanic.Age.median() 49 | 50 | 51 | # most frequent Age 52 | titanic.Age.mode() 53 | 54 | 55 | # fill missing values for Age with the median age 56 | titanic.Age.fillna(titanic.Age.median(), inplace=True) 57 | 58 | 59 | # Another strategy would be to build a **KNN model** just to impute missing values. How would we do that? 60 | # 61 | # If values are missing from a categorical feature, we could treat the missing values as **another category**. Why might that make sense? 62 | # 63 | # How do we **choose** between all of these strategies? 64 | 65 | # ## Part 2: Handling categorical features (Review) 66 | 67 | # How do we include a categorical feature in our model? 68 | # 69 | # - **Ordered categories:** transform them to sensible numeric values (example: small=1, medium=2, large=3) 70 | # - **Unordered categories:** use dummy encoding (0/1) 71 | 72 | titanic.head(10) 73 | 74 | 75 | # encode Sex_Female feature 76 | titanic['Sex_Female'] = titanic.Sex.map({'male':0, 'female':1}) 77 | 78 | 79 | # create a DataFrame of dummy variables for Embarked 80 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked') 81 | embarked_dummies.drop(embarked_dummies.columns[0], axis=1, inplace=True) 82 | 83 | # concatenate the original DataFrame and the dummy DataFrame 84 | titanic = pd.concat([titanic, embarked_dummies], axis=1) 85 | 86 | 87 | titanic.head(1) 88 | 89 | 90 | # - How do we **interpret** the encoding for Embarked? 91 | # - Why didn't we just encode Embarked using a **single feature** (C=0, Q=1, S=2)? 92 | # - Does it matter which category we choose to define as the **baseline**? 93 | # - Why do we only need **two dummy variables** for Embarked? 
94 | 95 | # define X and y 96 | feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S'] 97 | X = titanic[feature_cols] 98 | y = titanic.Survived 99 | 100 | # train/test split 101 | from sklearn.cross_validation import train_test_split 102 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 103 | 104 | # train a logistic regression model 105 | from sklearn.linear_model import LogisticRegression 106 | logreg = LogisticRegression(C=1e9) 107 | logreg.fit(X_train, y_train) 108 | 109 | # make predictions for testing set 110 | y_pred_class = logreg.predict(X_test) 111 | 112 | # calculate testing accuracy 113 | from sklearn import metrics 114 | print metrics.accuracy_score(y_test, y_pred_class) 115 | 116 | 117 | # ## Part 3: ROC curves and AUC 118 | 119 | # predict probability of survival 120 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 121 | 122 | 123 | import matplotlib.pyplot as plt 124 | plt.rcParams['figure.figsize'] = (8, 6) 125 | plt.rcParams['font.size'] = 14 126 | 127 | 128 | # plot ROC curve 129 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob) 130 | plt.plot(fpr, tpr) 131 | plt.xlim([0.0, 1.0]) 132 | plt.ylim([0.0, 1.0]) 133 | plt.xlabel('False Positive Rate (1 - Specificity)') 134 | plt.ylabel('True Positive Rate (Sensitivity)') 135 | 136 | 137 | # calculate AUC 138 | print metrics.roc_auc_score(y_test, y_pred_prob) 139 | 140 | 141 | # Besides allowing you to calculate AUC, seeing the ROC curve can help you to choose a threshold that **balances sensitivity and specificity** in a way that makes sense for the particular context. 142 | 143 | # histogram of predicted probabilities grouped by actual response value 144 | df = pd.DataFrame({'probability':y_pred_prob, 'actual':y_test}) 145 | df.hist(column='probability', by='actual', sharex=True, sharey=True) 146 | 147 | 148 | # What would have happened if you had used **y_pred_class** instead of **y_pred_prob** when drawing the ROC curve or calculating AUC? 149 | 150 | # ROC curve using y_pred_class - WRONG! 151 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_class) 152 | plt.plot(fpr, tpr) 153 | 154 | 155 | # AUC using y_pred_class - WRONG! 156 | print metrics.roc_auc_score(y_test, y_pred_class) 157 | 158 | 159 | # If you use **y_pred_class**, it will interpret the zeros and ones as predicted probabilities of 0% and 100%. 
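# [Added check, not part of the original lesson] With class predictions, the ROC "curve" has
# only a single operating point, so its AUC collapses to the average of sensitivity and
# specificity. A quick verification (using y_test and y_pred_class from above):
sensitivity = metrics.recall_score(y_test, y_pred_class)
specificity = metrics.recall_score(1 - y_test, 1 - y_pred_class)
print (sensitivity + specificity) / 2
print metrics.roc_auc_score(y_test, y_pred_class)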
160 | 161 | # ## Bonus: ROC curve is only sensitive to rank order of predicted probabilities 162 | 163 | # print the first 10 predicted probabilities 164 | y_pred_prob[:10] 165 | 166 | 167 | # take the square root of predicted probabilities (to make them all bigger) 168 | import numpy as np 169 | y_pred_prob_new = np.sqrt(y_pred_prob) 170 | 171 | # print the modified predicted probabilities 172 | y_pred_prob_new[:10] 173 | 174 | 175 | # histogram of predicted probabilities has changed 176 | df = pd.DataFrame({'probability':y_pred_prob_new, 'actual':y_test}) 177 | df.hist(column='probability', by='actual', sharex=True, sharey=True) 178 | 179 | 180 | # ROC curve did not change 181 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob_new) 182 | plt.plot(fpr, tpr) 183 | 184 | 185 | # AUC did not change 186 | print metrics.roc_auc_score(y_test, y_pred_prob_new) 187 | 188 | 189 | # ## Part 4: Cross-validation 190 | 191 | # calculate cross-validated AUC 192 | from sklearn.cross_validation import cross_val_score 193 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 194 | 195 | 196 | # add Fare to the model 197 | feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S', 'Fare'] 198 | X = titanic[feature_cols] 199 | 200 | # recalculate AUC 201 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 202 | -------------------------------------------------------------------------------- /code/13_bank_exercise_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise with bank marketing data 2 | 3 | # ## Introduction 4 | # 5 | # - Data from the UCI Machine Learning Repository: [data](https://github.com/justmarkham/DAT8/blob/master/data/bank-additional.csv), [data dictionary](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) 6 | # - **Goal:** Predict whether a customer will purchase a bank product marketed over the phone 7 | # - `bank-additional.csv` is already in our repo, so there is no need to download the data from the UCI website 8 | 9 | # ## Step 1: Read the data into Pandas 10 | 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bank-additional.csv' 13 | bank = pd.read_csv(url, sep=';') 14 | bank.head() 15 | 16 | 17 | # ## Step 2: Prepare at least three features 18 | # 19 | # - Include both numeric and categorical features 20 | # - Choose features that you think might be related to the response (based on intuition or exploration) 21 | # - Think about how to handle missing values (encoded as "unknown") 22 | 23 | # list all columns (for reference) 24 | bank.columns 25 | 26 | 27 | # ### y (response) 28 | 29 | # convert the response to numeric values and store as a new column 30 | bank['outcome'] = bank.y.map({'no':0, 'yes':1}) 31 | 32 | 33 | # ### age 34 | 35 | # probably not a great feature 36 | bank.boxplot(column='age', by='outcome') 37 | 38 | 39 | # ### job 40 | 41 | # looks like a useful feature 42 | bank.groupby('job').outcome.mean() 43 | 44 | 45 | # create job_dummies (we will add it to the bank DataFrame later) 46 | job_dummies = pd.get_dummies(bank.job, prefix='job') 47 | job_dummies.drop(job_dummies.columns[0], axis=1, inplace=True) 48 | 49 | 50 | # ### default 51 | 52 | # looks like a useful feature 53 | bank.groupby('default').outcome.mean() 54 | 55 | 56 | # but only one person in the dataset has a status of yes 57 | bank.default.value_counts() 58 | 59 | 60 | # so, let's treat this as a 2-class feature rather than a 3-class 
feature 61 | bank['default'] = bank.default.map({'no':0, 'unknown':1, 'yes':1}) 62 | 63 | 64 | # ### contact 65 | 66 | # looks like a useful feature 67 | bank.groupby('contact').outcome.mean() 68 | 69 | 70 | # convert the feature to numeric values 71 | bank['contact'] = bank.contact.map({'cellular':0, 'telephone':1}) 72 | 73 | 74 | # ### month 75 | 76 | # looks like a useful feature at first glance 77 | bank.groupby('month').outcome.mean() 78 | 79 | 80 | # but, it looks like their success rate is actually just correlated with number of calls 81 | # thus, the month feature is unlikely to generalize 82 | bank.groupby('month').outcome.agg(['count', 'mean']).sort('count') 83 | 84 | 85 | # ### duration 86 | 87 | # looks like an excellent feature, but you can't know the duration of a call beforehand, thus it can't be used in your model 88 | bank.boxplot(column='duration', by='outcome') 89 | 90 | 91 | # ### previous 92 | 93 | # looks like a useful feature 94 | bank.groupby('previous').outcome.mean() 95 | 96 | 97 | # ### poutcome 98 | 99 | # looks like a useful feature 100 | bank.groupby('poutcome').outcome.mean() 101 | 102 | 103 | # create poutcome_dummies 104 | poutcome_dummies = pd.get_dummies(bank.poutcome, prefix='poutcome') 105 | poutcome_dummies.drop(poutcome_dummies.columns[0], axis=1, inplace=True) 106 | 107 | 108 | # concatenate bank DataFrame with job_dummies and poutcome_dummies 109 | bank = pd.concat([bank, job_dummies, poutcome_dummies], axis=1) 110 | 111 | 112 | # ### euribor3m 113 | 114 | # looks like an excellent feature 115 | bank.boxplot(column='euribor3m', by='outcome') 116 | 117 | 118 | # ## Step 3: Model building 119 | # 120 | # - Use cross-validation to evaluate the AUC of a logistic regression model with your chosen features 121 | # - Try to increase the AUC by selecting different sets of features 122 | 123 | # new list of columns (including dummy columns) 124 | bank.columns 125 | 126 | 127 | # create X (including 13 dummy columns) 128 | feature_cols = ['default', 'contact', 'previous', 'euribor3m'] + list(bank.columns[-13:]) 129 | X = bank[feature_cols] 130 | 131 | 132 | # create y 133 | y = bank.outcome 134 | 135 | 136 | # calculate cross-validated AUC 137 | from sklearn.linear_model import LogisticRegression 138 | from sklearn.cross_validation import cross_val_score 139 | logreg = LogisticRegression(C=1e9) 140 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 141 | -------------------------------------------------------------------------------- /code/14_bayes_theorem_iris_nb.py: -------------------------------------------------------------------------------- 1 | # # Applying Bayes' theorem to iris classification 2 | # 3 | # Can **Bayes' theorem** help us to solve a **classification problem**, namely predicting the species of an iris? 
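# [Added for reference, not part of the original notebook] Recall the general form of Bayes' theorem:
#
# $$P(A \ | \ B) = \frac {P(B \ | \ A) \times P(A)} {P(B)}$$
#
# Below, A will be a species and B will be a particular set of (rounded) measurements.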
4 | 5 | # ## Preparing the data 6 | # 7 | # We'll read the iris data into a DataFrame, and **round up** all of the measurements to the next integer: 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | # read the iris data into a DataFrame 14 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 15 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 16 | iris = pd.read_csv(url, header=None, names=col_names) 17 | iris.head() 18 | 19 | 20 | # apply the ceiling function to the numeric columns 21 | iris.loc[:, 'sepal_length':'petal_width'] = iris.loc[:, 'sepal_length':'petal_width'].apply(np.ceil) 22 | iris.head() 23 | 24 | 25 | # ## Deciding how to make a prediction 26 | # 27 | # Let's say that I have an **out-of-sample iris** with the following measurements: **7, 3, 5, 2**. How might I predict the species? 28 | 29 | # show all observations with features: 7, 3, 5, 2 30 | iris[(iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)] 31 | 32 | 33 | # count the species for these observations 34 | iris[(iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)].species.value_counts() 35 | 36 | 37 | # count the species for all observations 38 | iris.species.value_counts() 39 | 40 | 41 | # Let's frame this as a **conditional probability problem**: What is the probability of some particular species, given the measurements 7, 3, 5, and 2? 42 | # 43 | # $$P(species \ | \ 7352)$$ 44 | # 45 | # We could calculate the conditional probability for **each of the three species**, and then predict the species with the **highest probability**: 46 | # 47 | # $$P(setosa \ | \ 7352)$$ 48 | # $$P(versicolor \ | \ 7352)$$ 49 | # $$P(virginica \ | \ 7352)$$ 50 | 51 | # ## Calculating the probability of each species 52 | # 53 | # **Bayes' theorem** gives us a way to calculate these conditional probabilities. 54 | # 55 | # Let's start with **versicolor**: 56 | # 57 | # $$P(versicolor \ | \ 7352) = \frac {P(7352 \ | \ versicolor) \times P(versicolor)} {P(7352)}$$ 58 | # 59 | # We can calculate each of the terms on the right side of the equation: 60 | # 61 | # $$P(7352 \ | \ versicolor) = \frac {13} {50} = 0.26$$ 62 | # 63 | # $$P(versicolor) = \frac {50} {150} = 0.33$$ 64 | # 65 | # $$P(7352) = \frac {17} {150} = 0.11$$ 66 | # 67 | # Therefore, Bayes' theorem says the **probability of versicolor given these measurements** is: 68 | # 69 | # $$P(versicolor \ | \ 7352) = \frac {0.26 \times 0.33} {0.11} = 0.76$$ 70 | # 71 | # Let's repeat this process for **virginica** and **setosa**: 72 | # 73 | # $$P(virginica \ | \ 7352) = \frac {0.08 \times 0.33} {0.11} = 0.24$$ 74 | # 75 | # $$P(setosa \ | \ 7352) = \frac {0 \times 0.33} {0.11} = 0$$ 76 | # 77 | # We predict that the iris is a versicolor, since that species had the **highest conditional probability**. 78 | 79 | # ## Summary 80 | # 81 | # 1. We framed a **classification problem** as three conditional probability problems. 82 | # 2. We used **Bayes' theorem** to calculate those conditional probabilities. 83 | # 3. We made a **prediction** by choosing the species with the highest conditional probability. 
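# [Added sketch, not part of the original notebook] The same three conditional probabilities can
# be computed directly with pandas, using the rounded `iris` DataFrame from above:
obs = (iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)
for species in iris.species.unique():
    p_obs_given_species = obs[iris.species==species].mean()   # P(7352 | species)
    p_species = (iris.species==species).mean()                # P(species)
    p_obs = obs.mean()                                        # P(7352)
    print species, p_obs_given_species * p_species / p_obs    # P(species | 7352)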
84 | 85 | # ## Bonus: The intuition behind Bayes' theorem 86 | # 87 | # Let's make some hypothetical adjustments to the data, to demonstrate how Bayes' theorem makes intuitive sense: 88 | # 89 | # Pretend that **more of the existing versicolors had measurements of 7352:** 90 | # 91 | # - $P(7352 \ | \ versicolor)$ would increase, thus increasing the numerator. 92 | # - It would make sense that given an iris with measurements of 7352, the probability of it being a versicolor would also increase. 93 | # 94 | # Pretend that **most of the existing irises were versicolor:** 95 | # 96 | # - $P(versicolor)$ would increase, thus increasing the numerator. 97 | # - It would make sense that the probability of any iris being a versicolor (regardless of measurements) would also increase. 98 | # 99 | # Pretend that **17 of the setosas had measurements of 7352:** 100 | # 101 | # - $P(7352)$ would double, thus doubling the denominator. 102 | # - It would make sense that given an iris with measurements of 7352, the probability of it being a versicolor would be cut in half. 103 | -------------------------------------------------------------------------------- /code/14_text_data_sklearn_nb.py: -------------------------------------------------------------------------------- 1 | # # Working with Text Data and Naive Bayes in scikit-learn 2 | 3 | # ## Agenda 4 | # 5 | # **Working with text data** 6 | # 7 | # - Representing text as data 8 | # - Reading SMS data 9 | # - Vectorizing SMS data 10 | # - Examining the tokens and their counts 11 | # - Bonus: Calculating the "spamminess" of each token 12 | # 13 | # **Naive Bayes classification** 14 | # 15 | # - Building a Naive Bayes model 16 | # - Comparing Naive Bayes with logistic regression 17 | 18 | # ## Part 1: Representing text as data 19 | # 20 | # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction): 21 | # 22 | # > Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**. 23 | # 24 | # We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts": 25 | 26 | from sklearn.feature_extraction.text import CountVectorizer 27 | 28 | 29 | # start with a simple example 30 | simple_train = ['call you tonight', 'Call me a cab', 'please call me... 
PLEASE!'] 31 | 32 | 33 | # learn the 'vocabulary' of the training data 34 | vect = CountVectorizer() 35 | vect.fit(simple_train) 36 | vect.get_feature_names() 37 | 38 | 39 | # transform training data into a 'document-term matrix' 40 | simple_train_dtm = vect.transform(simple_train) 41 | simple_train_dtm 42 | 43 | 44 | # print the sparse matrix 45 | print simple_train_dtm 46 | 47 | 48 | # convert sparse matrix to a dense matrix 49 | simple_train_dtm.toarray() 50 | 51 | 52 | # examine the vocabulary and document-term matrix together 53 | import pandas as pd 54 | pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names()) 55 | 56 | 57 | # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction): 58 | # 59 | # > In this scheme, features and samples are defined as follows: 60 | # 61 | # > - Each individual token occurrence frequency (normalized or not) is treated as a **feature**. 62 | # > - The vector of all the token frequencies for a given document is considered a multivariate **sample**. 63 | # 64 | # > A **corpus of documents** can thus be represented by a matrix with **one row per document** and **one column per token** (e.g. word) occurring in the corpus. 65 | # 66 | # > We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. 67 | 68 | # transform testing data into a document-term matrix (using existing vocabulary) 69 | simple_test = ["please don't call me"] 70 | simple_test_dtm = vect.transform(simple_test) 71 | simple_test_dtm.toarray() 72 | 73 | 74 | # examine the vocabulary and document-term matrix together 75 | pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names()) 76 | 77 | 78 | # **Summary:** 79 | # 80 | # - `vect.fit(train)` learns the vocabulary of the training data 81 | # - `vect.transform(train)` uses the fitted vocabulary to build a document-term matrix from the training data 82 | # - `vect.transform(test)` uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before) 83 | 84 | # ## Part 2: Reading SMS data 85 | 86 | # read tab-separated file 87 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv' 88 | col_names = ['label', 'message'] 89 | sms = pd.read_table(url, sep='\t', header=None, names=col_names) 90 | print sms.shape 91 | 92 | 93 | sms.head(20) 94 | 95 | 96 | sms.label.value_counts() 97 | 98 | 99 | # convert label to a numeric variable 100 | sms['label'] = sms.label.map({'ham':0, 'spam':1}) 101 | 102 | 103 | # define X and y 104 | X = sms.message 105 | y = sms.label 106 | 107 | 108 | # split into training and testing sets 109 | from sklearn.cross_validation import train_test_split 110 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 111 | print X_train.shape 112 | print X_test.shape 113 | 114 | 115 | # ## Part 3: Vectorizing SMS data 116 | 117 | # instantiate the vectorizer 118 | vect = CountVectorizer() 119 | 120 | 121 | # learn training data vocabulary, then create document-term matrix 122 | vect.fit(X_train) 123 | X_train_dtm = vect.transform(X_train) 124 | X_train_dtm 125 | 126 | 127 | # alternative: combine fit and 
transform into a single step 128 | X_train_dtm = vect.fit_transform(X_train) 129 | X_train_dtm 130 | 131 | 132 | # transform testing data (using fitted vocabulary) into a document-term matrix 133 | X_test_dtm = vect.transform(X_test) 134 | X_test_dtm 135 | 136 | 137 | # ## Part 4: Examining the tokens and their counts 138 | 139 | # store token names 140 | X_train_tokens = vect.get_feature_names() 141 | 142 | 143 | # first 50 tokens 144 | print X_train_tokens[:50] 145 | 146 | 147 | # last 50 tokens 148 | print X_train_tokens[-50:] 149 | 150 | 151 | # view X_train_dtm as a dense matrix 152 | X_train_dtm.toarray() 153 | 154 | 155 | # count how many times EACH token appears across ALL messages in X_train_dtm 156 | import numpy as np 157 | X_train_counts = np.sum(X_train_dtm.toarray(), axis=0) 158 | X_train_counts 159 | 160 | 161 | X_train_counts.shape 162 | 163 | 164 | # create a DataFrame of tokens with their counts 165 | pd.DataFrame({'token':X_train_tokens, 'count':X_train_counts}).sort('count') 166 | 167 | 168 | # ## Bonus: Calculating the "spamminess" of each token 169 | 170 | # create separate DataFrames for ham and spam 171 | sms_ham = sms[sms.label==0] 172 | sms_spam = sms[sms.label==1] 173 | 174 | 175 | # learn the vocabulary of ALL messages and save it 176 | vect.fit(sms.message) 177 | all_tokens = vect.get_feature_names() 178 | 179 | 180 | # create document-term matrices for ham and spam 181 | ham_dtm = vect.transform(sms_ham.message) 182 | spam_dtm = vect.transform(sms_spam.message) 183 | 184 | 185 | # count how many times EACH token appears across ALL ham messages 186 | ham_counts = np.sum(ham_dtm.toarray(), axis=0) 187 | 188 | 189 | # count how many times EACH token appears across ALL spam messages 190 | spam_counts = np.sum(spam_dtm.toarray(), axis=0) 191 | 192 | 193 | # create a DataFrame of tokens with their separate ham and spam counts 194 | token_counts = pd.DataFrame({'token':all_tokens, 'ham':ham_counts, 'spam':spam_counts}) 195 | 196 | 197 | # add one to ham and spam counts to avoid dividing by zero (in the step that follows) 198 | token_counts['ham'] = token_counts.ham + 1 199 | token_counts['spam'] = token_counts.spam + 1 200 | 201 | 202 | # calculate ratio of spam-to-ham for each token 203 | token_counts['spam_ratio'] = token_counts.spam / token_counts.ham 204 | token_counts.sort('spam_ratio') 205 | 206 | 207 | # ## Part 5: Building a Naive Bayes model 208 | # 209 | # We will use [Multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html): 210 | # 211 | # > The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work. 
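# [Added for intuition, not part of the original lesson] Under the "naive" conditional
# independence assumption, multinomial Naive Bayes scores each class with the product of
# per-token likelihoods, weighted by the class prior:
#
# $$P(spam \ | \ message) \propto P(spam) \times \prod_i P(token_i \ | \ spam)^{count_i}$$
#
# which is why the per-token spam-to-ham ratios computed in the bonus section above are closely
# related to what the classifier learns.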
212 | 213 | # train a Naive Bayes model using X_train_dtm 214 | from sklearn.naive_bayes import MultinomialNB 215 | nb = MultinomialNB() 216 | nb.fit(X_train_dtm, y_train) 217 | 218 | 219 | # make class predictions for X_test_dtm 220 | y_pred_class = nb.predict(X_test_dtm) 221 | 222 | 223 | # calculate accuracy of class predictions 224 | from sklearn import metrics 225 | print metrics.accuracy_score(y_test, y_pred_class) 226 | 227 | 228 | # confusion matrix 229 | print metrics.confusion_matrix(y_test, y_pred_class) 230 | 231 | 232 | # predict (poorly calibrated) probabilities 233 | y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1] 234 | y_pred_prob 235 | 236 | 237 | # calculate AUC 238 | print metrics.roc_auc_score(y_test, y_pred_prob) 239 | 240 | 241 | # print message text for the false positives 242 | X_test[y_test < y_pred_class] 243 | 244 | 245 | # print message text for the false negatives 246 | X_test[y_test > y_pred_class] 247 | 248 | 249 | # what do you notice about the false negatives? 250 | X_test[3132] 251 | 252 | 253 | # ## Part 6: Comparing Naive Bayes with logistic regression 254 | 255 | # import/instantiate/fit 256 | from sklearn.linear_model import LogisticRegression 257 | logreg = LogisticRegression(C=1e9) 258 | logreg.fit(X_train_dtm, y_train) 259 | 260 | 261 | # class predictions and predicted probabilities 262 | y_pred_class = logreg.predict(X_test_dtm) 263 | y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1] 264 | 265 | 266 | # calculate accuracy and AUC 267 | print metrics.accuracy_score(y_test, y_pred_class) 268 | print metrics.roc_auc_score(y_test, y_pred_prob) 269 | -------------------------------------------------------------------------------- /code/14_types_of_naive_bayes_nb.py: -------------------------------------------------------------------------------- 1 | # # Comparing Multinomial and Gaussian Naive Bayes 2 | # 3 | # scikit-learn documentation: [MultinomialNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html) and [GaussianNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html) 4 | # 5 | # Dataset: [Pima Indians Diabetes](https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes) from the UCI Machine Learning Repository 6 | 7 | # read the data 8 | import pandas as pd 9 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data' 10 | col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label'] 11 | pima = pd.read_csv(url, header=None, names=col_names) 12 | 13 | 14 | # notice that all features are continuous 15 | pima.head() 16 | 17 | 18 | # create X and y 19 | X = pima.drop('label', axis=1) 20 | y = pima.label 21 | 22 | 23 | # split into training and testing sets 24 | from sklearn.cross_validation import train_test_split 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 26 | 27 | 28 | # import both Multinomial and Gaussian Naive Bayes 29 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 30 | from sklearn import metrics 31 | 32 | 33 | # testing accuracy of Multinomial Naive Bayes 34 | mnb = MultinomialNB() 35 | mnb.fit(X_train, y_train) 36 | y_pred_class = mnb.predict(X_test) 37 | print metrics.accuracy_score(y_test, y_pred_class) 38 | 39 | 40 | # testing accuracy of Gaussian Naive Bayes 41 | gnb = GaussianNB() 42 | gnb.fit(X_train, y_train) 43 | y_pred_class = gnb.predict(X_test) 44 | print metrics.accuracy_score(y_test, y_pred_class) 45 | 46 | 47 | # 
**Conclusion:** When applying Naive Bayes classification to a dataset with **continuous features**, it is better to use Gaussian Naive Bayes than Multinomial Naive Bayes. The latter is suitable for datasets containing **discrete features** (e.g., word counts). 48 | # 49 | # Wikipedia has a short [description](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes) of Gaussian Naive Bayes, as well as an excellent [example](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification) of its usage. 50 | -------------------------------------------------------------------------------- /code/16_kaggle_minimal.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Kaggle Stack Overflow competition (minimal code file) 3 | ''' 4 | 5 | import pandas as pd 6 | 7 | # define a function that takes a CSV file and returns a DataFrame (with new or modified features) 8 | def make_features(filename): 9 | df = pd.read_csv(filename, index_col=0) 10 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 11 | df['TitleLength'] = df.Title.apply(len) 12 | return df 13 | 14 | # apply function to both training and testing files 15 | train = make_features('train.csv') 16 | test = make_features('test.csv') 17 | 18 | 19 | ''' 20 | Create a model with three features 21 | ''' 22 | 23 | # define X and y 24 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength'] 25 | X = train[feature_cols] 26 | y = train.OpenStatus 27 | 28 | # fit a logistic regression model 29 | from sklearn.linear_model import LogisticRegression 30 | logreg = LogisticRegression(C=1e9) 31 | logreg.fit(X, y) 32 | 33 | # predict class probabilities for the actual testing data 34 | X_oos = test[feature_cols] 35 | oos_pred_prob = logreg.predict_proba(X_oos)[:, 1] 36 | 37 | 38 | ''' 39 | Create a submission file 40 | ''' 41 | 42 | # create a DataFrame that has 'id' as the index, then export to a CSV file 43 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 44 | sub.to_csv('sub1.csv') # 0.687 45 | 46 | 47 | ''' 48 | Update make_features and create another submission file 49 | ''' 50 | 51 | import numpy as np 52 | 53 | # update the function 54 | def make_features(filename): 55 | df = pd.read_csv(filename, index_col=0, parse_dates=['OwnerCreationDate', 'PostCreationDate']) 56 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 57 | df['TitleLength'] = df.Title.apply(len) 58 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 59 | df['OwnerAge'] = (df.PostCreationDate - df.OwnerCreationDate).dt.days 60 | df['OwnerAge'] = np.where(df.OwnerAge < 0, 0, df.OwnerAge) 61 | return df 62 | 63 | # apply function to both training and testing files 64 | train = make_features('train.csv') 65 | test = make_features('test.csv') 66 | 67 | # train the model on ALL data 68 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'NumTags', 'OwnerAge'] 69 | X = train[feature_cols] 70 | logreg.fit(X, y) 71 | 72 | # predict class probabilities for the actual testing data 73 | X_oos = test[feature_cols] 74 | oos_pred_prob = logreg.predict_proba(X_oos)[:, 1] 75 | 76 | # create submission file 77 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 78 | sub.to_csv('sub2.csv') # 0.650 79 | 80 | 81 | ''' 82 | Build a document-term matrix from Title using CountVectorizer 83 | ''' 84 | 85 | # build document-term matrix for the training data 
86 | from sklearn.feature_extraction.text import CountVectorizer 87 | vect = CountVectorizer(stop_words='english') 88 | dtm = vect.fit_transform(train.Title) 89 | 90 | # define X and y 91 | X = dtm 92 | y = train.OpenStatus 93 | 94 | # build document-term matrix for the actual testing data and make predictions 95 | oos_dtm = vect.transform(test.Title) 96 | from sklearn.naive_bayes import MultinomialNB 97 | nb = MultinomialNB() 98 | nb.fit(X, y) 99 | oos_pred_prob = nb.predict_proba(oos_dtm)[:, 1] 100 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 101 | sub.to_csv('sub3.csv') # 0.544 102 | 103 | 104 | ''' 105 | BONUS: Dummy encoding of Tag1 106 | ''' 107 | 108 | # convert Tag1 from strings to integers 109 | from sklearn.preprocessing import LabelEncoder 110 | le = LabelEncoder() 111 | train['Tag1_enc'] = le.fit_transform(train.Tag1) 112 | 113 | # create a dummy column for each value of Tag1_enc (returns a sparse matrix) 114 | from sklearn.preprocessing import OneHotEncoder 115 | ohe = OneHotEncoder() 116 | tag1_dummies = ohe.fit_transform(train[['Tag1_enc']]) 117 | 118 | # adjust Tag1 on testing set since LabelEncoder errors on new values during a transform 119 | test['Tag1'] = test['Tag1'].map(lambda s: '' if s not in le.classes_ else s) 120 | le.classes_ = np.append(le.classes_, '') 121 | 122 | # define X and y 123 | X = tag1_dummies 124 | y = train.OpenStatus 125 | 126 | # apply the same encoding to the actual testing data and make predictions 127 | test['Tag1_enc'] = le.transform(test.Tag1) 128 | oos_tag1_dummies = ohe.transform(test[['Tag1_enc']]) 129 | nb.fit(X, y) 130 | oos_pred_prob = nb.predict_proba(oos_tag1_dummies)[:, 1] 131 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 132 | sub.to_csv('sub4.csv') # 0.652 133 | -------------------------------------------------------------------------------- /code/17_bikeshare_exercise_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise with Capital Bikeshare data 2 | 3 | # ## Introduction 4 | # 5 | # - Capital Bikeshare dataset from Kaggle: [data](https://github.com/justmarkham/DAT8/blob/master/data/bikeshare.csv), [data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data) 6 | # - Each observation represents the bikeshare rentals initiated during a given hour of a given day 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.cross_validation import cross_val_score 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.tree import DecisionTreeRegressor, export_graphviz 13 | 14 | 15 | # read the data and set "datetime" as the index 16 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv' 17 | bikes = pd.read_csv(url, index_col='datetime', parse_dates=True) 18 | 19 | 20 | # "count" is a method, so it's best to rename that column 21 | bikes.rename(columns={'count':'total'}, inplace=True) 22 | 23 | 24 | # create "hour" as its own feature 25 | bikes['hour'] = bikes.index.hour 26 | 27 | 28 | bikes.head() 29 | 30 | 31 | bikes.tail() 32 | 33 | 34 | # - **hour** ranges from 0 (midnight) through 23 (11pm) 35 | # - **workingday** is either 0 (weekend or holiday) or 1 (non-holiday weekday) 36 | 37 | # ## Task 1 38 | # 39 | # Run these two `groupby` statements and figure out what they tell you about the data. 
40 | 41 | # mean rentals for each value of "workingday" 42 | bikes.groupby('workingday').total.mean() 43 | 44 | 45 | # mean rentals for each value of "hour" 46 | bikes.groupby('hour').total.mean() 47 | 48 | 49 | # ## Task 2 50 | # 51 | # Run this plotting code, and make sure you understand the output. Then, separate this plot into two separate plots conditioned on "workingday". (In other words, one plot should display the hourly trend for "workingday=0", and the other should display the hourly trend for "workingday=1".) 52 | 53 | # mean rentals for each value of "hour" 54 | bikes.groupby('hour').total.mean().plot() 55 | 56 | 57 | # hourly rental trend for "workingday=0" 58 | bikes[bikes.workingday==0].groupby('hour').total.mean().plot() 59 | 60 | 61 | # hourly rental trend for "workingday=1" 62 | bikes[bikes.workingday==1].groupby('hour').total.mean().plot() 63 | 64 | 65 | # combine the two plots 66 | bikes.groupby(['hour', 'workingday']).total.mean().unstack().plot() 67 | 68 | 69 | # ## Task 3 70 | # 71 | # Fit a linear regression model to the entire dataset, using "total" as the response and "hour" and "workingday" as the only features. Then, print the coefficients and interpret them. What are the limitations of linear regression in this instance? 72 | 73 | # create X and y 74 | feature_cols = ['hour', 'workingday'] 75 | X = bikes[feature_cols] 76 | y = bikes.total 77 | 78 | 79 | # fit a linear regression model and print coefficients 80 | linreg = LinearRegression() 81 | linreg.fit(X, y) 82 | linreg.coef_ 83 | 84 | 85 | # ## Task 4 86 | # 87 | # Use 10-fold cross-validation to calculate the RMSE for the linear regression model. 88 | 89 | # save the 10 MSE scores output by cross_val_score 90 | scores = cross_val_score(linreg, X, y, cv=10, scoring='mean_squared_error') 91 | 92 | 93 | # convert MSE to RMSE, and then calculate the mean of the 10 RMSE scores 94 | np.mean(np.sqrt(-scores)) 95 | 96 | 97 | # ## Task 5 98 | # 99 | # Use 10-fold cross-validation to evaluate a decision tree model with those same features (fit to any "max_depth" you choose). 100 | 101 | # evaluate a decision tree model with "max_depth=7" 102 | treereg = DecisionTreeRegressor(max_depth=7, random_state=1) 103 | scores = cross_val_score(treereg, X, y, cv=10, scoring='mean_squared_error') 104 | np.mean(np.sqrt(-scores)) 105 | 106 | 107 | # ## Task 6 108 | # 109 | # Fit a decision tree model to the entire dataset using "max_depth=3", and create a tree diagram using Graphviz. Then, figure out what each leaf represents. What did the decision tree learn that a linear regression model could not learn? 110 | 111 | # fit a decision tree model with "max_depth=3" 112 | treereg = DecisionTreeRegressor(max_depth=3, random_state=1) 113 | treereg.fit(X, y) 114 | 115 | 116 | # create a Graphviz file 117 | export_graphviz(treereg, out_file='tree_bikeshare.dot', feature_names=feature_cols) 118 | 119 | # At the command line, run this to convert to PNG: 120 | # dot -Tpng tree_bikeshare.dot -o tree_bikeshare.png 121 | 122 | 123 | # ![Tree for bikeshare data](images/tree_bikeshare.png) 124 | -------------------------------------------------------------------------------- /code/19_advanced_sklearn_nb.py: -------------------------------------------------------------------------------- 1 | # # Advanced scikit-learn 2 | 3 | # ## Agenda 4 | # 5 | # - StandardScaler 6 | # - Pipeline (bonus content) 7 | 8 | # ## StandardScaler 9 | # 10 | # ### What is the problem we're trying to solve? 
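# (in the fake data below, "rings" is measured on a much larger scale than "length"
# and "mass", so it dominates the distance calculations that KNN relies on)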
11 | 12 | # fake data 13 | import pandas as pd 14 | train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]}) 15 | test = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54]}) 16 | 17 | 18 | # training data 19 | train 20 | 21 | 22 | # testing data 23 | test 24 | 25 | 26 | # define X and y 27 | feature_cols = ['length', 'mass', 'rings'] 28 | X = train[feature_cols] 29 | y = train.id 30 | 31 | 32 | # KNN with K=1 33 | from sklearn.neighbors import KNeighborsClassifier 34 | knn = KNeighborsClassifier(n_neighbors=1) 35 | knn.fit(X, y) 36 | 37 | 38 | # what "should" it predict? 39 | knn.predict(test) 40 | 41 | 42 | # allow plots to appear in the notebook 43 | import matplotlib.pyplot as plt 44 | plt.rcParams['font.size'] = 14 45 | plt.rcParams['figure.figsize'] = (5, 5) 46 | 47 | 48 | # create a "colors" array for plotting 49 | import numpy as np 50 | colors = np.array(['red', 'green', 'blue']) 51 | 52 | 53 | # scatter plot of training data, colored by id (0=red, 1=green, 2=blue) 54 | plt.scatter(train.mass, train.rings, c=colors[train.id], s=50) 55 | 56 | # testing data 57 | plt.scatter(test.mass, test.rings, c='white', s=50) 58 | 59 | # add labels 60 | plt.xlabel('mass') 61 | plt.ylabel('rings') 62 | plt.title('How we interpret the data') 63 | 64 | 65 | # adjust the x-limits 66 | plt.scatter(train.mass, train.rings, c=colors[train.id], s=50) 67 | plt.scatter(test.mass, test.rings, c='white', s=50) 68 | plt.xlabel('mass') 69 | plt.ylabel('rings') 70 | plt.title('How KNN interprets the data') 71 | plt.xlim(0, 30) 72 | 73 | 74 | # ### How does StandardScaler solve the problem? 75 | # 76 | # [StandardScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) is used for the "standardization" of features, also known as "center and scale" or "z-score normalization". 
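# (standardization rescales each feature to z = (x - mean) / standard_deviation,
# so every feature ends up with mean 0 and standard deviation 1 on the training data)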
77 | 78 | # standardize the features 79 | from sklearn.preprocessing import StandardScaler 80 | scaler = StandardScaler() 81 | scaler.fit(X) 82 | X_scaled = scaler.transform(X) 83 | 84 | 85 | # original values 86 | X.values 87 | 88 | 89 | # standardized values 90 | X_scaled 91 | 92 | 93 | # figure out how it standardized 94 | print scaler.mean_ 95 | print scaler.std_ 96 | 97 | 98 | # manually standardize 99 | (X.values - scaler.mean_) / scaler.std_ 100 | 101 | 102 | # ### Applying StandardScaler to a real dataset 103 | # 104 | # - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine) 105 | # - **Goal:** Predict the origin of wine using chemical analysis 106 | 107 | # read three columns from the dataset into a DataFrame 108 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' 109 | col_names = ['label', 'color', 'proline'] 110 | wine = pd.read_csv(url, header=None, names=col_names, usecols=[0, 10, 13]) 111 | 112 | 113 | wine.head() 114 | 115 | 116 | wine.describe() 117 | 118 | 119 | # define X and y 120 | feature_cols = ['color', 'proline'] 121 | X = wine[feature_cols] 122 | y = wine.label 123 | 124 | 125 | # split into training and testing sets 126 | from sklearn.cross_validation import train_test_split 127 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 128 | 129 | 130 | # standardize X_train 131 | scaler.fit(X_train) 132 | X_train_scaled = scaler.transform(X_train) 133 | 134 | 135 | # check that it standardized properly 136 | print X_train_scaled[:, 0].mean() 137 | print X_train_scaled[:, 0].std() 138 | print X_train_scaled[:, 1].mean() 139 | print X_train_scaled[:, 1].std() 140 | 141 | 142 | # standardize X_test 143 | X_test_scaled = scaler.transform(X_test) 144 | 145 | 146 | # is this right? 147 | print X_test_scaled[:, 0].mean() 148 | print X_test_scaled[:, 0].std() 149 | print X_test_scaled[:, 1].mean() 150 | print X_test_scaled[:, 1].std() 151 | 152 | 153 | # KNN accuracy on original data 154 | knn = KNeighborsClassifier(n_neighbors=3) 155 | knn.fit(X_train, y_train) 156 | y_pred_class = knn.predict(X_test) 157 | from sklearn import metrics 158 | print metrics.accuracy_score(y_test, y_pred_class) 159 | 160 | 161 | # KNN accuracy on scaled data 162 | knn.fit(X_train_scaled, y_train) 163 | y_pred_class = knn.predict(X_test_scaled) 164 | print metrics.accuracy_score(y_test, y_pred_class) 165 | 166 | 167 | # ## Pipeline (bonus content) 168 | # 169 | # ### What is the problem we're trying to solve? 170 | 171 | # define X and y 172 | feature_cols = ['color', 'proline'] 173 | X = wine[feature_cols] 174 | y = wine.label 175 | 176 | 177 | # proper cross-validation on the original (unscaled) data 178 | knn = KNeighborsClassifier(n_neighbors=3) 179 | from sklearn.cross_validation import cross_val_score 180 | cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean() 181 | 182 | 183 | # why is this improper cross-validation on the scaled data? 184 | scaler = StandardScaler() 185 | X_scaled = scaler.fit_transform(X) 186 | cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean() 187 | 188 | 189 | # ### How does Pipeline solve the problem? 
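# (the cross-validation above was improper because the scaler was fit on the full
# dataset before splitting, leaking information about the held-out folds into the
# training folds; a Pipeline re-fits the scaler on only the training portion of each split)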
190 | # 191 | # [Pipeline](http://scikit-learn.org/stable/modules/pipeline.html) is used for chaining steps together: 192 | 193 | # fix the cross-validation process using Pipeline 194 | from sklearn.pipeline import make_pipeline 195 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3)) 196 | cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean() 197 | 198 | 199 | # Pipeline can also be used with [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) for parameter searching: 200 | 201 | # search for an optimal n_neighbors value using GridSearchCV 202 | neighbors_range = range(1, 21) 203 | param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range) 204 | from sklearn.grid_search import GridSearchCV 205 | grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy') 206 | grid.fit(X, y) 207 | print grid.best_score_ 208 | print grid.best_params_ 209 | -------------------------------------------------------------------------------- /code/19_clustering_nb.py: -------------------------------------------------------------------------------- 1 | # # Clustering 2 | 3 | # ## Agenda: 4 | # 5 | # 1. K-means clustering 6 | # 2. Clustering evaluation 7 | # 3. DBSCAN clustering 8 | 9 | # beer dataset 10 | import pandas as pd 11 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/beer.txt' 12 | beer = pd.read_csv(url, sep=' ') 13 | beer 14 | 15 | 16 | # How would you cluster these beers? 17 | 18 | # define X 19 | X = beer.drop('name', axis=1) 20 | 21 | 22 | # What happened to y? 23 | 24 | # ## Part 1: K-means clustering 25 | 26 | # K-means with 3 clusters 27 | from sklearn.cluster import KMeans 28 | km = KMeans(n_clusters=3, random_state=1) 29 | km.fit(X) 30 | 31 | 32 | # review the cluster labels 33 | km.labels_ 34 | 35 | 36 | # save the cluster labels and sort by cluster 37 | beer['cluster'] = km.labels_ 38 | beer.sort('cluster') 39 | 40 | 41 | # What do the clusters seem to be based on? Why? 
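# (each row of cluster_centers_ below is one cluster's centroid: the mean of every
# feature for the observations assigned to that cluster, in the original units)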
42 | 43 | # review the cluster centers 44 | km.cluster_centers_ 45 | 46 | 47 | # calculate the mean of each feature for each cluster 48 | beer.groupby('cluster').mean() 49 | 50 | 51 | # save the DataFrame of cluster centers 52 | centers = beer.groupby('cluster').mean() 53 | 54 | 55 | # allow plots to appear in the notebook 56 | import matplotlib.pyplot as plt 57 | plt.rcParams['font.size'] = 14 58 | 59 | 60 | # create a "colors" array for plotting 61 | import numpy as np 62 | colors = np.array(['red', 'green', 'blue', 'yellow']) 63 | 64 | 65 | # scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue) 66 | plt.scatter(beer.calories, beer.alcohol, c=colors[beer.cluster], s=50) 67 | 68 | # cluster centers, marked by "+" 69 | plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black') 70 | 71 | # add labels 72 | plt.xlabel('calories') 73 | plt.ylabel('alcohol') 74 | 75 | 76 | # scatter plot matrix (0=red, 1=green, 2=blue) 77 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 78 | 79 | 80 | # ### Repeat with scaled data 81 | 82 | # center and scale the data 83 | from sklearn.preprocessing import StandardScaler 84 | scaler = StandardScaler() 85 | X_scaled = scaler.fit_transform(X) 86 | 87 | 88 | # K-means with 3 clusters on scaled data 89 | km = KMeans(n_clusters=3, random_state=1) 90 | km.fit(X_scaled) 91 | 92 | 93 | # save the cluster labels and sort by cluster 94 | beer['cluster'] = km.labels_ 95 | beer.sort('cluster') 96 | 97 | 98 | # What are the "characteristics" of each cluster? 99 | 100 | # review the cluster centers 101 | beer.groupby('cluster').mean() 102 | 103 | 104 | # scatter plot matrix of new cluster assignments (0=red, 1=green, 2=blue) 105 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 106 | 107 | 108 | # Do you notice any cluster assignments that seem a bit odd? How might we explain those? 109 | 110 | # ## Part 2: Clustering evaluation 111 | # 112 | # The [Silhouette Coefficient](http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient) is a common metric for evaluating clustering "performance" in situations when the "true" cluster assignments are not known. 113 | # 114 | # A Silhouette Coefficient is calculated for **each observation**: 115 | # 116 | # $$SC = \frac{b-a} {max(a, b)}$$ 117 | # 118 | # - a = mean distance to all other points in **its cluster** 119 | # - b = mean distance to all other points in **the next nearest cluster** 120 | # 121 | # It ranges from -1 (worst) to 1 (best). A **global score** is calculated by taking the mean score for all observations. 
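# worked example: the Silhouette Coefficient for a single (made-up) observation,
# assuming a = 0.5 and b = 2.0
a = 0.5   # mean distance to the other points in its own cluster
b = 2.0   # mean distance to the points in the next nearest cluster
(b - a) / max(a, b)   # 0.75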
122 | 123 | # calculate SC for K=3 124 | from sklearn import metrics 125 | metrics.silhouette_score(X_scaled, km.labels_) 126 | 127 | 128 | # calculate SC for K=2 through K=19 129 | k_range = range(2, 20) 130 | scores = [] 131 | for k in k_range: 132 | km = KMeans(n_clusters=k, random_state=1) 133 | km.fit(X_scaled) 134 | scores.append(metrics.silhouette_score(X_scaled, km.labels_)) 135 | 136 | 137 | # plot the results 138 | plt.plot(k_range, scores) 139 | plt.xlabel('Number of clusters') 140 | plt.ylabel('Silhouette Coefficient') 141 | plt.grid(True) 142 | 143 | 144 | # K-means with 4 clusters on scaled data 145 | km = KMeans(n_clusters=4, random_state=1) 146 | km.fit(X_scaled) 147 | beer['cluster'] = km.labels_ 148 | beer.sort('cluster') 149 | 150 | 151 | # ## Part 3: DBSCAN clustering 152 | 153 | # DBSCAN with eps=1 and min_samples=3 154 | from sklearn.cluster import DBSCAN 155 | db = DBSCAN(eps=1, min_samples=3) 156 | db.fit(X_scaled) 157 | 158 | 159 | # review the cluster labels 160 | db.labels_ 161 | 162 | 163 | # save the cluster labels and sort by cluster 164 | beer['cluster'] = db.labels_ 165 | beer.sort('cluster') 166 | 167 | 168 | # review the cluster centers 169 | beer.groupby('cluster').mean() 170 | 171 | 172 | # scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow) 173 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 174 | -------------------------------------------------------------------------------- /code/20_regex_exercise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | EXERCISE: Regular Expressions 3 | ''' 4 | 5 | # open file and store each line as one list element 6 | with open('homicides.txt', mode='rU') as f: 7 | data = [row for row in f] 8 | 9 | 10 | ''' 11 | Create a list of ages 12 | ''' 13 | 14 | import re 15 | 16 | ages = [] 17 | for row in data: 18 | match = re.search(r'\d+ years? old', row) 19 | if match: 20 | ages.append(match.group()) 21 | else: 22 | ages.append('0') 23 | 24 | # split the string on spaces, only keep the first element, and convert to int 25 | ages = [int(element.split()[0]) for element in ages] 26 | 27 | # calculate average age 28 | sum(ages) / float(len(ages)) 29 | 30 | # check that 'data' and 'ages' are the same length 31 | assert(len(data)==len(ages)) 32 | 33 | 34 | ''' 35 | Create a list of ages (using match groups) 36 | ''' 37 | 38 | ages = [] 39 | for row in data: 40 | match = re.search(r'(\d+)( years? old)', row) 41 | if match: 42 | ages.append(int(match.group(1))) 43 | else: 44 | ages.append(0) 45 | 46 | 47 | ''' 48 | Create a list of causes 49 | ''' 50 | 51 | causes = [] 52 | for row in data: 53 | match = re.search(r'Cause: (.+?)<', row) 54 | if match: 55 | causes.append(match.group(1).lower()) 56 | else: 57 | causes.append('unknown') 58 | 59 | # tally the causes 60 | from collections import Counter 61 | Counter(causes) 62 | -------------------------------------------------------------------------------- /code/20_regex_reference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | REFERENCE GUIDE: Regular Expressions 3 | ''' 4 | 5 | ''' 6 | Rules for Searching: 7 | 8 | Search proceeds through string from start to end, stopping at first match 9 | All of the pattern must be matched 10 | 11 | Basic Patterns: 12 | 13 | Ordinary characters match themselves exactly 14 | . 
matches any single character except newline \n 15 | \w matches a word character (letter, digit, underscore) 16 | \W matches any non-word character 17 | \b matches boundary between word and non-word 18 | \s matches single whitespace character (space, newline, return, tab, form) 19 | \S matches single non-whitespace character 20 | \d matches single digit (0 through 9) 21 | \t matches tab 22 | \n matches newline 23 | \r matches return 24 | \ match a special character, such as period: \. 25 | 26 | Basic Python Usage: 27 | 28 | match = re.search(r'pattern', string_to_search) 29 | Returns match object 30 | If there is a match, access match using match.group() 31 | If there is no match, match is None 32 | Use 'r' in front of pattern to designate a raw string 33 | ''' 34 | 35 | import re 36 | 37 | s = 'my 1st string!!' 38 | 39 | match = re.search(r'my', s) # returns match object 40 | if match: # checks whether match was found 41 | print match.group() # if match was found, then print result 42 | 43 | re.search(r'my', s).group() # single-line version (without error handling) 44 | re.search(r'st', s).group() # 'st' 45 | re.search(r'sta', s).group() # error 46 | re.search(r'\w\w\w', s).group() # '1st' 47 | re.search(r'\W', s).group() # ' ' 48 | re.search(r'\W\W', s).group() # '!!' 49 | re.search(r'\s', s).group() # ' ' 50 | re.search(r'\s\s', s).group() # error 51 | re.search(r'..t', s).group() # '1st' 52 | re.search(r'\s\St', s).group() # ' st' 53 | re.search(r'\bst', s).group() # 'st' 54 | 55 | 56 | ''' 57 | Repetition: 58 | 59 | + 1 or more occurrences of the pattern to its left 60 | * 0 or more occurrences of the pattern to its left 61 | ? 0 or 1 occurrence of the pattern to its left 62 | 63 | + and * are 'greedy': they try to use up as much of the string as possible 64 | 65 | Add ? after + or * to make them 'lazy': +? or *? 66 | ''' 67 | 68 | s = 'sid is missing class' 69 | 70 | re.search(r'miss\w+', s).group() # 'missing' 71 | re.search(r'is\w+', s).group() # 'issing' 72 | re.search(r'is\w*', s).group() # 'is' 73 | 74 | s = '
<h1>my heading</h1>' 75 | 76 | re.search(r'<.+>', s).group() # '<h1>my heading</h1>' 77 | re.search(r'<.+?>', s).group() # '<h1>
' 78 | 79 | 80 | ''' 81 | Positions: 82 | 83 | ^ match start of a string 84 | $ match end of a string 85 | ''' 86 | 87 | s = 'sid is missing class' 88 | 89 | re.search(r'^miss', s).group() # error 90 | re.search(r'..ss', s).group() # 'miss' 91 | re.search(r'..ss$', s).group() # 'lass' 92 | 93 | 94 | ''' 95 | Brackets: 96 | 97 | [abc] match a or b or c 98 | \w, \s, etc. work inside brackets, except period just means a literal period 99 | [a-z] match any lowercase letter (dash indicates range unless it's last) 100 | [abc-] match a or b or c or - 101 | [^ab] match anything except a or b 102 | ''' 103 | 104 | s = 'my email is john-doe@gmail.com' 105 | 106 | re.search(r'\w+@\w+', s).group() # 'doe@gmail' 107 | re.search(r'[\w.-]+@[\w.-]+', s).group() # 'john-doe@gmail.com' 108 | 109 | 110 | ''' 111 | Lookarounds: 112 | 113 | Lookahead matches a pattern only if it is followed by another pattern 114 | 100(?= dollars) matches '100' only if it is followed by ' dollars' 115 | 116 | Lookbehind matches a pattern only if it is preceded by another pattern 117 | (?<=\$)100 matches '100' only if it is preceded by '$' 118 | ''' 119 | 120 | s = 'Name: Cindy, 30 years old' 121 | 122 | re.search(r'\d+(?= years? old)', s).group() # '30' 123 | re.search(r'(?<=Name: )\w+', s).group() # 'Cindy' 124 | 125 | 126 | ''' 127 | Match Groups: 128 | 129 | Parentheses create logical groups inside of match text 130 | match.group(1) corresponds to first group 131 | match.group(2) corresponds to second group 132 | match.group() corresponds to entire match text (as usual) 133 | ''' 134 | 135 | s = 'my email is john-doe@gmail.com' 136 | 137 | match = re.search(r'([\w.-]+)@([\w.-]+)', s) 138 | if match: 139 | match.group(1) # 'john-doe' 140 | match.group(2) # 'gmail.com' 141 | match.group() # 'john-doe@gmail.com' 142 | 143 | 144 | ''' 145 | Finding All Matches: 146 | 147 | re.findall() finds all matches and returns them as a list of strings 148 | list_of_strings = re.findall(r'pattern', string_to_search) 149 | 150 | If pattern includes parentheses, a list of tuples is returned 151 | ''' 152 | 153 | s = 'emails: joe@gmail.com, bob@gmail.com' 154 | 155 | re.findall(r'[\w.-]+@[\w.-]+', s) # ['joe@gmail.com', 'bob@gmail.com'] 156 | re.findall(r'([\w.-]+)@([\w.-]+)', s) # [('joe', 'gmail.com'), ('bob', 'gmail.com')] 157 | 158 | 159 | ''' 160 | Option Flags: 161 | 162 | Options flags modify the behavior of the pattern matching 163 | 164 | default: matching is case sensitive 165 | re.IGNORECASE: ignore uppercase/lowercase differences ('a' matches 'a' or 'A') 166 | 167 | default: period matches any character except newline 168 | re.DOTALL: allow period to match newline 169 | 170 | default: within a string of many lines, ^ and $ match start and end of entire string 171 | re.MULTILINE: allow ^ and $ to match start and end of each line 172 | 173 | Option flag is third argument to re.search() or re.findall(): 174 | re.search(r'pattern', string_to_search, re.IGNORECASE) 175 | re.findall(r'pattern', string_to_search, re.IGNORECASE) 176 | ''' 177 | 178 | s = 'emails: nicole@ga.co, joe@gmail.com, PAT@GA.CO' 179 | 180 | re.findall(r'\w+@ga\.co', s) # ['nicole@ga.co'] 181 | re.findall(r'\w+@ga\.co', s, re.IGNORECASE) # ['nicole@ga.co', 'PAT@GA.CO'] 182 | 183 | 184 | ''' 185 | Substitution: 186 | 187 | re.sub() finds all matches and replaces them with a specified string 188 | new_string = re.sub(r'pattern', r'replacement', string_to_search) 189 | 190 | Replacement string can refer to text from matching groups: 191 | \1 refers to group(1) 192 | \2 
refers to group(2) 193 | etc. 194 | ''' 195 | 196 | s = 'sid is missing class' 197 | 198 | re.sub(r'is ', r'was ', s) # 'sid was missing class' 199 | 200 | s = 'emails: joe@gmail.com, bob@gmail.com' 201 | 202 | re.sub(r'([\w.-]+)@([\w.-]+)', r'\1@yahoo.com', s) # 'emails: joe@yahoo.com, bob@yahoo.com' 203 | 204 | 205 | ''' 206 | Useful to know, but not covered above: 207 | 208 | re.split() splits a string by the occurrences of a pattern 209 | re.compile() compiles a pattern (for improved performance if it's used many times) 210 | A|B indicates a pattern that can match A or B 211 | ''' 212 | -------------------------------------------------------------------------------- /data/airlines.csv: -------------------------------------------------------------------------------- 1 | airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14 2 | Aer Lingus,320906734,2,0,0,0,0,0 3 | Aeroflot*,1197672318,76,14,128,6,1,88 4 | Aerolineas Argentinas,385803648,6,0,0,1,0,0 5 | Aeromexico*,596871813,3,1,64,5,0,0 6 | Air Canada,1865253802,2,0,0,2,0,0 7 | Air France,3004002661,14,4,79,6,2,337 8 | Air India*,869253552,2,1,329,4,1,158 9 | Air New Zealand*,710174817,3,0,0,5,1,7 10 | Alaska Airlines*,965346773,5,0,0,5,1,88 11 | Alitalia,698012498,7,2,50,4,0,0 12 | All Nippon Airways,1841234177,3,1,1,7,0,0 13 | American*,5228357340,21,5,101,17,3,416 14 | Austrian Airlines,358239823,1,0,0,1,0,0 15 | Avianca,396922563,5,3,323,0,0,0 16 | British Airways*,3179760952,4,0,0,6,0,0 17 | Cathay Pacific*,2582459303,0,0,0,2,0,0 18 | China Airlines,813216487,12,6,535,2,1,225 19 | Condor,417982610,2,1,16,0,0,0 20 | COPA,550491507,3,1,47,0,0,0 21 | Delta / Northwest*,6525658894,24,12,407,24,2,51 22 | Egyptair,557699891,8,3,282,4,1,14 23 | El Al,335448023,1,1,4,1,0,0 24 | Ethiopian Airlines,488560643,25,5,167,5,2,92 25 | Finnair,506464950,1,0,0,0,0,0 26 | Garuda Indonesia,613356665,10,3,260,4,2,22 27 | Gulf Air,301379762,1,0,0,3,1,143 28 | Hawaiian Airlines,493877795,0,0,0,1,0,0 29 | Iberia,1173203126,4,1,148,5,0,0 30 | Japan Airlines,1574217531,3,1,520,0,0,0 31 | Kenya Airways,277414794,2,0,0,2,2,283 32 | KLM*,1874561773,7,1,3,1,0,0 33 | Korean Air,1734522605,12,5,425,1,0,0 34 | LAN Airlines,1001965891,3,2,21,0,0,0 35 | Lufthansa*,3426529504,6,1,2,3,0,0 36 | Malaysia Airlines,1039171244,3,1,34,3,2,537 37 | Pakistan International,348563137,8,3,234,10,2,46 38 | Philippine Airlines,413007158,7,4,74,2,1,1 39 | Qantas*,1917428984,1,0,0,5,0,0 40 | Royal Air Maroc,295705339,5,3,51,3,0,0 41 | SAS*,682971852,5,0,0,6,1,110 42 | Saudi Arabian,859673901,7,2,313,11,0,0 43 | Singapore Airlines,2376857805,2,2,6,2,1,83 44 | South African,651502442,2,1,159,1,0,0 45 | Southwest Airlines,3276525770,1,0,0,8,0,0 46 | Sri Lankan / AirLanka,325582976,2,1,14,4,0,0 47 | SWISS*,792601299,2,1,229,3,0,0 48 | TACA,259373346,3,1,3,1,1,3 49 | TAM,1509195646,8,3,98,7,2,188 50 | TAP - Air Portugal,619130754,0,0,0,0,0,0 51 | Thai Airways,1702802250,8,4,308,2,1,1 52 | Turkish Airlines,1946098294,8,3,64,8,2,84 53 | United / Continental*,7139291291,19,8,319,14,2,109 54 | US Airways / America West*,2455687887,16,7,224,11,2,23 55 | Vietnam Airlines,625084918,7,3,171,1,0,0 56 | Virgin Atlantic,1005248585,1,0,0,0,0,0 57 | Xiamen Airlines,430462962,9,1,82,2,0,0 58 | -------------------------------------------------------------------------------- /data/beer.txt: -------------------------------------------------------------------------------- 1 | name calories sodium alcohol cost 2 | 
Budweiser 144 15 4.7 0.43 3 | Schlitz 151 19 4.9 0.43 4 | Lowenbrau 157 15 0.9 0.48 5 | Kronenbourg 170 7 5.2 0.73 6 | Heineken 152 11 5.0 0.77 7 | Old_Milwaukee 145 23 4.6 0.28 8 | Augsberger 175 24 5.5 0.40 9 | Srohs_Bohemian_Style 149 27 4.7 0.42 10 | Miller_Lite 99 10 4.3 0.43 11 | Budweiser_Light 113 8 3.7 0.40 12 | Coors 140 18 4.6 0.44 13 | Coors_Light 102 15 4.1 0.46 14 | Michelob_Light 135 11 4.2 0.50 15 | Becks 150 19 4.7 0.76 16 | Kirin 149 6 5.0 0.79 17 | Pabst_Extra_Light 68 15 2.3 0.38 18 | Hamms 139 19 4.4 0.43 19 | Heilemans_Old_Style 144 24 4.9 0.43 20 | Olympia_Goled_Light 72 6 2.9 0.46 21 | Schlitz_Light 97 7 4.2 0.47 22 | -------------------------------------------------------------------------------- /data/drinks.csv: -------------------------------------------------------------------------------- 1 | country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent 2 | Afghanistan,0,0,0,0.0,AS 3 | Albania,89,132,54,4.9,EU 4 | Algeria,25,0,14,0.7,AF 5 | Andorra,245,138,312,12.4,EU 6 | Angola,217,57,45,5.9,AF 7 | Antigua & Barbuda,102,128,45,4.9,NA 8 | Argentina,193,25,221,8.3,SA 9 | Armenia,21,179,11,3.8,EU 10 | Australia,261,72,212,10.4,OC 11 | Austria,279,75,191,9.7,EU 12 | Azerbaijan,21,46,5,1.3,EU 13 | Bahamas,122,176,51,6.3,NA 14 | Bahrain,42,63,7,2.0,AS 15 | Bangladesh,0,0,0,0.0,AS 16 | Barbados,143,173,36,6.3,NA 17 | Belarus,142,373,42,14.4,EU 18 | Belgium,295,84,212,10.5,EU 19 | Belize,263,114,8,6.8,NA 20 | Benin,34,4,13,1.1,AF 21 | Bhutan,23,0,0,0.4,AS 22 | Bolivia,167,41,8,3.8,SA 23 | Bosnia-Herzegovina,76,173,8,4.6,EU 24 | Botswana,173,35,35,5.4,AF 25 | Brazil,245,145,16,7.2,SA 26 | Brunei,31,2,1,0.6,AS 27 | Bulgaria,231,252,94,10.3,EU 28 | Burkina Faso,25,7,7,4.3,AF 29 | Burundi,88,0,0,6.3,AF 30 | Cote d'Ivoire,37,1,7,4.0,AF 31 | Cabo Verde,144,56,16,4.0,AF 32 | Cambodia,57,65,1,2.2,AS 33 | Cameroon,147,1,4,5.8,AF 34 | Canada,240,122,100,8.2,NA 35 | Central African Republic,17,2,1,1.8,AF 36 | Chad,15,1,1,0.4,AF 37 | Chile,130,124,172,7.6,SA 38 | China,79,192,8,5.0,AS 39 | Colombia,159,76,3,4.2,SA 40 | Comoros,1,3,1,0.1,AF 41 | Congo,76,1,9,1.7,AF 42 | Cook Islands,0,254,74,5.9,OC 43 | Costa Rica,149,87,11,4.4,NA 44 | Croatia,230,87,254,10.2,EU 45 | Cuba,93,137,5,4.2,NA 46 | Cyprus,192,154,113,8.2,EU 47 | Czech Republic,361,170,134,11.8,EU 48 | North Korea,0,0,0,0.0,AS 49 | DR Congo,32,3,1,2.3,AF 50 | Denmark,224,81,278,10.4,EU 51 | Djibouti,15,44,3,1.1,AF 52 | Dominica,52,286,26,6.6,NA 53 | Dominican Republic,193,147,9,6.2,NA 54 | Ecuador,162,74,3,4.2,SA 55 | Egypt,6,4,1,0.2,AF 56 | El Salvador,52,69,2,2.2,NA 57 | Equatorial Guinea,92,0,233,5.8,AF 58 | Eritrea,18,0,0,0.5,AF 59 | Estonia,224,194,59,9.5,EU 60 | Ethiopia,20,3,0,0.7,AF 61 | Fiji,77,35,1,2.0,OC 62 | Finland,263,133,97,10.0,EU 63 | France,127,151,370,11.8,EU 64 | Gabon,347,98,59,8.9,AF 65 | Gambia,8,0,1,2.4,AF 66 | Georgia,52,100,149,5.4,EU 67 | Germany,346,117,175,11.3,EU 68 | Ghana,31,3,10,1.8,AF 69 | Greece,133,112,218,8.3,EU 70 | Grenada,199,438,28,11.9,NA 71 | Guatemala,53,69,2,2.2,NA 72 | Guinea,9,0,2,0.2,AF 73 | Guinea-Bissau,28,31,21,2.5,AF 74 | Guyana,93,302,1,7.1,SA 75 | Haiti,1,326,1,5.9,NA 76 | Honduras,69,98,2,3.0,NA 77 | Hungary,234,215,185,11.3,EU 78 | Iceland,233,61,78,6.6,EU 79 | India,9,114,0,2.2,AS 80 | Indonesia,5,1,0,0.1,AS 81 | Iran,0,0,0,0.0,AS 82 | Iraq,9,3,0,0.2,AS 83 | Ireland,313,118,165,11.4,EU 84 | Israel,63,69,9,2.5,AS 85 | Italy,85,42,237,6.5,EU 86 | Jamaica,82,97,9,3.4,NA 87 | Japan,77,202,16,7.0,AS 88 | Jordan,6,21,1,0.5,AS 89 | 
Kazakhstan,124,246,12,6.8,AS 90 | Kenya,58,22,2,1.8,AF 91 | Kiribati,21,34,1,1.0,OC 92 | Kuwait,0,0,0,0.0,AS 93 | Kyrgyzstan,31,97,6,2.4,AS 94 | Laos,62,0,123,6.2,AS 95 | Latvia,281,216,62,10.5,EU 96 | Lebanon,20,55,31,1.9,AS 97 | Lesotho,82,29,0,2.8,AF 98 | Liberia,19,152,2,3.1,AF 99 | Libya,0,0,0,0.0,AF 100 | Lithuania,343,244,56,12.9,EU 101 | Luxembourg,236,133,271,11.4,EU 102 | Madagascar,26,15,4,0.8,AF 103 | Malawi,8,11,1,1.5,AF 104 | Malaysia,13,4,0,0.3,AS 105 | Maldives,0,0,0,0.0,AS 106 | Mali,5,1,1,0.6,AF 107 | Malta,149,100,120,6.6,EU 108 | Marshall Islands,0,0,0,0.0,OC 109 | Mauritania,0,0,0,0.0,AF 110 | Mauritius,98,31,18,2.6,AF 111 | Mexico,238,68,5,5.5,NA 112 | Micronesia,62,50,18,2.3,OC 113 | Monaco,0,0,0,0.0,EU 114 | Mongolia,77,189,8,4.9,AS 115 | Montenegro,31,114,128,4.9,EU 116 | Morocco,12,6,10,0.5,AF 117 | Mozambique,47,18,5,1.3,AF 118 | Myanmar,5,1,0,0.1,AS 119 | Namibia,376,3,1,6.8,AF 120 | Nauru,49,0,8,1.0,OC 121 | Nepal,5,6,0,0.2,AS 122 | Netherlands,251,88,190,9.4,EU 123 | New Zealand,203,79,175,9.3,OC 124 | Nicaragua,78,118,1,3.5,NA 125 | Niger,3,2,1,0.1,AF 126 | Nigeria,42,5,2,9.1,AF 127 | Niue,188,200,7,7.0,OC 128 | Norway,169,71,129,6.7,EU 129 | Oman,22,16,1,0.7,AS 130 | Pakistan,0,0,0,0.0,AS 131 | Palau,306,63,23,6.9,OC 132 | Panama,285,104,18,7.2,NA 133 | Papua New Guinea,44,39,1,1.5,OC 134 | Paraguay,213,117,74,7.3,SA 135 | Peru,163,160,21,6.1,SA 136 | Philippines,71,186,1,4.6,AS 137 | Poland,343,215,56,10.9,EU 138 | Portugal,194,67,339,11.0,EU 139 | Qatar,1,42,7,0.9,AS 140 | South Korea,140,16,9,9.8,AS 141 | Moldova,109,226,18,6.3,EU 142 | Romania,297,122,167,10.4,EU 143 | Russian Federation,247,326,73,11.5,AS 144 | Rwanda,43,2,0,6.8,AF 145 | St. Kitts & Nevis,194,205,32,7.7,NA 146 | St. Lucia,171,315,71,10.1,NA 147 | St. 
Vincent & the Grenadines,120,221,11,6.3,NA 148 | Samoa,105,18,24,2.6,OC 149 | San Marino,0,0,0,0.0,EU 150 | Sao Tome & Principe,56,38,140,4.2,AF 151 | Saudi Arabia,0,5,0,0.1,AS 152 | Senegal,9,1,7,0.3,AF 153 | Serbia,283,131,127,9.6,EU 154 | Seychelles,157,25,51,4.1,AF 155 | Sierra Leone,25,3,2,6.7,AF 156 | Singapore,60,12,11,1.5,AS 157 | Slovakia,196,293,116,11.4,EU 158 | Slovenia,270,51,276,10.6,EU 159 | Solomon Islands,56,11,1,1.2,OC 160 | Somalia,0,0,0,0.0,AF 161 | South Africa,225,76,81,8.2,AF 162 | Spain,284,157,112,10.0,EU 163 | Sri Lanka,16,104,0,2.2,AS 164 | Sudan,8,13,0,1.7,AF 165 | Suriname,128,178,7,5.6,SA 166 | Swaziland,90,2,2,4.7,AF 167 | Sweden,152,60,186,7.2,EU 168 | Switzerland,185,100,280,10.2,EU 169 | Syria,5,35,16,1.0,AS 170 | Tajikistan,2,15,0,0.3,AS 171 | Thailand,99,258,1,6.4,AS 172 | Macedonia,106,27,86,3.9,EU 173 | Timor-Leste,1,1,4,0.1,AS 174 | Togo,36,2,19,1.3,AF 175 | Tonga,36,21,5,1.1,OC 176 | Trinidad & Tobago,197,156,7,6.4,NA 177 | Tunisia,51,3,20,1.3,AF 178 | Turkey,51,22,7,1.4,AS 179 | Turkmenistan,19,71,32,2.2,AS 180 | Tuvalu,6,41,9,1.0,OC 181 | Uganda,45,9,0,8.3,AF 182 | Ukraine,206,237,45,8.9,EU 183 | United Arab Emirates,16,135,5,2.8,AS 184 | United Kingdom,219,126,195,10.4,EU 185 | Tanzania,36,6,1,5.7,AF 186 | USA,249,158,84,8.7,NA 187 | Uruguay,115,35,220,6.6,SA 188 | Uzbekistan,25,101,8,2.4,AS 189 | Vanuatu,21,18,11,0.9,OC 190 | Venezuela,333,100,3,7.7,SA 191 | Vietnam,111,2,1,2.0,AS 192 | Yemen,6,0,0,0.1,AS 193 | Zambia,32,19,4,2.5,AF 194 | Zimbabwe,64,18,4,4.7,AF 195 | -------------------------------------------------------------------------------- /data/example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example Web Page 6 | 7 | 8 | 9 | 10 |

<html>
<head>
<title>Example Web Page</title>
</head>
<body>

<h1>DAT8 Class 7</h1>

<p>First, we are covering APIs, which are useful for getting data.</p>
<p>Then, we are covering web scraping, which is a more flexible way to get data.</p>
<p>Finally, I will ask you to fill out yet another feedback form!</p>

<h2>Resource List</h2>

<p>Here are some helpful API resources:</p>

<ul>
<li>API resource 1</li>
<li>API resource 2</li>
</ul>

<p>Here are some helpful web scraping resources:</p>

<ul>
<li>Web scraping resource 1</li>
<li>Web scraping resource 2</li>
</ul>

</body>
</html>
-------------------------------------------------------------------------------- /data/imdb_ids.txt: -------------------------------------------------------------------------------- 1 | tt0111161 2 | tt1856010 3 | tt0096694 4 | tt0088763 5 | tt1375666 6 | -------------------------------------------------------------------------------- /data/u.item: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/data/u.item -------------------------------------------------------------------------------- /data/vehicles_test.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,vtype 2 | 3000,2003,130000,4,truck 3 | 6000,2005,82500,4,car 4 | 12000,2010,60000,2,car 5 | -------------------------------------------------------------------------------- /data/vehicles_train.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,vtype 2 | 22000,2012,13000,2,car 3 | 14000,2010,30000,2,car 4 | 13000,2010,73500,4,car 5 | 9500,2009,78000,4,car 6 | 9000,2007,47000,4,car 7 | 4000,2006,124000,2,car 8 | 3000,2004,177000,4,car 9 | 2000,2004,209000,4,truck 10 | 3000,2003,138000,2,car 11 | 1900,2003,160000,4,car 12 | 2500,2003,190000,2,truck 13 | 5000,2001,62000,4,car 14 | 1800,1999,163000,2,truck 15 | 1300,1997,138000,4,car 16 | -------------------------------------------------------------------------------- /homework/02_command_line_chipotle.md: -------------------------------------------------------------------------------- 1 | ## Class 2 Homework: Command Line Chipotle 2 | 3 | #### Submitting Your Homework 4 | 5 | * Create a Markdown file that includes your answers **and** the code you used to arrive at those answers. 6 | * Add this Markdown file to a GitHub repo that you'll use for all of your coursework. 7 | * Submit a link to your repo using the homework submission form. 8 | 9 | #### Command Line Tasks 10 | 11 | 1. Look at the head and the tail of **chipotle.tsv** in the **data** subdirectory of this repo. Think for a minute about how the data is structured. What do you think each column means? What do you think each row means? Tell me! (If you're unsure, look at more of the file contents.) 12 | 2. How many orders do there appear to be? 13 | 3. How many lines are in this file? 14 | 4. Which burrito is more popular, steak or chicken? 15 | 5. Do chicken burritos more often have black beans or pinto beans? 16 | 6. Make a list of all of the CSV or TSV files in the DAT8 repo (using a single command). Think about how wildcard characters can help you with this task. 17 | 7. Count the approximate number of occurrences of the word "dictionary" (regardless of case) across all files in the DAT8 repo. 18 | 8. **Optional:** Use the command line to discover something "interesting" about the Chipotle data. Try using the commands from the "advanced" section! 19 | 20 | #### Solution 21 | 22 | 1. **order_id** is the unique identifier for each order. **quantity** is the number purchased of a particular item. **item_name** is the primary name for the item being purchased. **choice_description** is a list of modifiers for that item. **price** is the price for that entire line (taking **quantity** into account). A given order consists of one or more rows, depending upon the number of unique items being purchased in that order.
23 | * `head chipotle.tsv` 24 | * `tail chipotle.tsv` 25 | 2. There are 1834 orders (since 1834 is the highest **order_id** number). 26 | 3. The file has 4623 lines. 27 | * `wc -l chipotle.tsv` 28 | 4. Chicken burritos are more popular than steak burritos. 29 | * Compare `grep -i 'chicken burrito' chipotle.tsv | wc -l` with `grep -i 'steak burrito' chipotle.tsv | wc -l` 30 | * Alternatively, use the 'c' option of `grep` to skip the piping step: `grep -ic 'chicken burrito' chipotle.tsv` 31 | 5. Black beans are more popular than pinto beans (on chicken burritos). 32 | * Compare `grep -i 'chicken burrito' chipotle.tsv | grep -i 'black beans' | wc -l` with `grep -i 'chicken burrito' chipotle.tsv | grep -i 'pinto beans' | wc -l` 33 | * Alternatively, use the 'c' option of `grep` and a more complex regular expression pattern to skip the piping steps: `grep -ic 'chicken burrito.*black beans' chipotle.tsv` 34 | 6. At the moment, the CSV and TSV files in the DAT8 repo are **airlines.csv**, **chipotle.tsv**, and **sms.tsv**, all of which are in the **data** subdirectory. 35 | * Change your working directory to DAT8, and then use `find . -name *.?sv` 36 | 7. At the moment, there are 13 lines in DAT8 files that contain the word 'dictionary', which is a good approximation of the number of occurrences. 37 | * Change your working directory to DAT8, and then use `grep -ir 'dictionary' . | wc -l` 38 | * Alternatively, use the 'c' option of `grep` to skip the piping step: `grep -irc 'dictionary' .` 39 | -------------------------------------------------------------------------------- /homework/09_bias_variance.md: -------------------------------------------------------------------------------- 1 | ## Class 9 Pre-work: Bias-Variance Tradeoff 2 | 3 | Read this excellent article, [Understanding the Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html), and be prepared to **discuss it in class** on Tuesday. 4 | 5 | **Note:** You can ignore sections 4.2 and 4.3. 6 | 7 | Here are some questions to think about while you read: 8 | * In the Party Registration example, what are the features? What is the response? Is this a regression or classification problem? 9 | * The features are wealth and religiousness. The response is voter party registration. This is a classification problem. 10 | * Conceptually, how is KNN being applied to this problem to make a prediction? 11 | * Find the K most similar voters in the training data (in terms of wealth and religiousness), and use the majority party registration among those "neighbors" as the predicted party registration for the unknown individual. 12 | * How do the four visualizations in section 3 relate to one another? Change the value of K using the slider, and make sure you understand what changed in the visualizations (and why it changed). 13 | * First viz: training data colored by response value 14 | * Second viz: classification map for K=1 15 | * Third viz: out-of-sample data colored by predicted response value, and identification of the neighborhoods used to make that prediction 16 | * Fourth viz: predicted response value for each hexagon 17 | * Changing K changes the predictions in the third and fourth viz 18 | * In figures 4 and 5, what do the lighter colors versus the darker colors mean? How is the darkness calculated? 19 | * Darkness indicates confidence in the prediction, and is calculated using the proportion of nearest neighbors that have the same response value. 20 | * What does the black line in figure 5 represent? 
What predictions would the best possible machine learning model make, with respect to this line? 21 | * The black line is the underlying model that generated the training data. The best possible machine learning model would learn that line as its decision boundary. It would not be a perfect model, but it would be the best possible model. 22 | * Choose a very small value of K, and click the button "Generate New Training Data" a number of times. Do you "see" low variance or high variance, and low bias or high bias? 23 | * High variance, low bias 24 | * Repeat this with a very large value of K. Do you "see" low variance or high variance, and low bias or high bias? 25 | * Low variance, high bias 26 | * Try using other values of K. What value of K do you think is "best"? How do you define "best"? 27 | * A value of K in the middle is best. The best value is the value that results in a model whose predictions most consistently match the decision boundary. 28 | * Does a small value for K cause "overfitting" or "underfitting"? 29 | * Overfitting 30 | * Why should we care about variance at all? Shouldn't we just minimize bias and ignore variance? 31 | * If you had all of the possible data (past and future), a model with high complexity (and thus high variance) would be ideal because it would capture all of the complexity in the data and wouldn't need to generalize. But given that we only have a single sample of data, both bias and variance contribute to prediction error and should be appropriately balanced. 32 | -------------------------------------------------------------------------------- /homework/10_yelp_votes.md: -------------------------------------------------------------------------------- 1 | ## Class 10 Homework: Yelp Votes 2 | 3 | This assignment uses a small subset of the data from Kaggle's [Yelp Business Rating Prediction](https://www.kaggle.com/c/yelp-recsys-2013) competition. 4 | 5 | **Description of the data:** 6 | 7 | * `yelp.json` is the original format of the file. `yelp.csv` contains the same data, in a more convenient format. Both of the files are in this repo, so there is no need to download the data from the Kaggle website. 8 | * Each observation in this dataset is a review of a particular business by a particular user. 9 | * The "stars" column is the number of stars (1 through 5) assigned by the reviewer to the business. (Higher stars are better.) In other words, it is the rating of the business by the person who wrote the review. 10 | * The "cool" column is the number of "cool" votes this review received from other Yelp users. All reviews start with 0 "cool" votes, and there is no limit to how many "cool" votes a review can receive. In other words, it is a rating of the review itself, not a rating of the business. 11 | * The "useful" and "funny" columns are similar to the "cool" column. 12 | 13 | **Homework tasks:** 14 | 15 | 1. Read `yelp.csv` into a DataFrame. 16 | * **Bonus:** Ignore the `yelp.csv` file, and construct this DataFrame yourself from `yelp.json`. This involves reading the data into Python, decoding the JSON, converting it to a DataFrame, and adding individual columns for each of the vote types. 17 | 2. Explore the relationship between each of the vote types (cool/useful/funny) and the number of stars. 18 | 3. Define cool/useful/funny as the features, and stars as the response. 19 | 4. Fit a linear regression model and interpret the coefficients. Do the coefficients make intuitive sense to you? Explore the Yelp website to see if you detect similar trends.
20 | 5. Evaluate the model by splitting it into training and testing sets and computing the RMSE. Does the RMSE make intuitive sense to you? 21 | 6. Try removing some of the features and see if the RMSE improves. 22 | 7. **Bonus:** Think of some new features you could create from the existing data that might be predictive of the response. Figure out how to create those features in Pandas, add them to your model, and see if the RMSE improves. 23 | 8. **Bonus:** Compare your best RMSE on the testing set with the RMSE for the "null model", which is the model that ignores all features and simply predicts the mean response value in the testing set. 24 | 9. **Bonus:** Instead of treating this as a regression problem, treat it as a classification problem and see what testing accuracy you can achieve with KNN. 25 | 10. **Bonus:** Figure out how to use linear regression for classification, and compare its classification accuracy with KNN's accuracy. 26 | -------------------------------------------------------------------------------- /homework/13_cross_validation.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: Cross-validation 2 | 3 | Watch my video on [cross-validation](https://www.youtube.com/watch?v=6dbrR-WymjI) (36 minutes), and be prepared to **discuss it in class** on Tuesday. The [notebook](../notebooks/13_cross_validation.ipynb) shown in the video is also in this repository. 4 | 5 | Alternatively, read section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages). 6 | 7 | Here are some questions to think about: 8 | 9 | - What is the purpose of model evaluation? 10 | - The purpose is to estimate the likely performance of a model on out-of-sample data, so that we can choose the model that is most likely to generalize, and so that we can have an idea of how well that model will actually perform. 11 | - What is the drawback of training and testing on the same data? 12 | - Training accuracy is maximized for overly complex models which overfit the training data, and thus it's not a good measure of how well a model will generalize. 13 | - How does train/test split work, and what is its primary drawback? 14 | - It splits the data into two pieces, trains the model on the training set, and tests the model on the testing set. Testing accuracy can change a lot depending upon which observations happen to be in the training and testing sets. 15 | - How does K-fold cross-validation work, and what is the role of "K"? 16 | - First, it splits the data into K equal folds. Then, it trains the model on folds 2 through K, tests the model on fold 1, and calculates the requested evaluation metric. Then, it repeats that process K-1 more times, until every fold has been the testing set exactly once. 17 | - Why do we pass X and y, not X_train and y_train, to the `cross_val_score` function? 18 | - It will take care of splitting the data into the K folds, so we don't need to split it ourselves. 19 | - Why does `cross_val_score` need a "scoring" parameter? 20 | - It needs to know what evaluation metric to calculate, since many different metrics are available. 21 | - What does `cross_val_score` return, and what do we usually do with that object? 22 | - It returns a NumPy array containing the K scores. We usually calculate the mean score, though we might also be interested in the standard deviation. 23 | - Under what circumstances does `cross_val_score` return negative scores? 
24 | - The scores will be negative if the evaluation metric is a loss function (something you want to minimize) rather than a reward function (something you want to maximize). 25 | - When should you use train/test split, and when should you use cross-validation? 26 | - Train/test split is useful when you want to inspect your testing results (via confusion matrix or ROC curve) and when evaluation speed is a concern. Cross-validation is useful when you are most concerned with the accuracy of your estimation. 27 | -------------------------------------------------------------------------------- /homework/13_roc_auc.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: ROC Curves and AUC 2 | 3 | First, read these [lesson notes](http://ebp.uga.edu/courses/Chapter%204%20-%20Diagnosis%20I/8%20-%20ROC%20curves.html) from a university course for an excellent overview of ROC curves. 4 | 5 | Then, watch my video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Tuesday. (Feel free to play with the [visualization](http://www.navan.name/roc/) shown in the video, or view the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/).) 6 | 7 | **Optional:** If you would like to go even deeper, [An introduction to ROC analysis](http://people.inf.elte.hu/kiss/13dwhdm/roc.pdf) is a very readable paper on the topic. 8 | 9 | Here are some questions to think about: 10 | 11 | - What is the difference between the predict and predict_proba methods in scikit-learn? 12 | - The former outputs class predictions, and the latter outputs predicted probabilities of class membership. 13 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions? 14 | - Set a threshold, and classify everything above the threshold as a 1 and everything below the threshold as a 0. 15 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve? 16 | - Because an ROC curve is measuring the performance of a classifier at all possible thresholds, and thresholds only make sense in the context of predicted probabilities. 17 | - Could you use an ROC curve for a regression problem? Why or why not? 18 | - No, because ROC is a plot of TPR vs FPR, and those concepts have no meaning in a regression problem. 19 | - What's another term for True Positive Rate? 20 | - Sensitivity or recall. 21 | - If I wanted to increase specificity, how would I change the classification threshold? 22 | - Increase it. 23 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not? 24 | - No, because increasing either of those requires moving the threshold in opposite directions. 25 | - What are the primary benefits of ROC curves over classification accuracy? 26 | - Doesn't require setting a classification threshold, allows you to visualize the performance of your classifier, works well for unbalanced classes. 27 | - What should you do if your AUC is 0.2? 28 | - Reverse your predictions so that your AUC is 0.8. 29 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? 
(0 = not fraudulent, 1 = fraudulent) 30 | - Blues would be significantly larger, lots of overlap between blues and reds. 31 | - What's a real-world scenario in which you would prefer high specificity (rather than high sensitivity) for your classifier? 32 | - Speed cameras issuing speeding tickets. 33 | -------------------------------------------------------------------------------- /homework/14_spam_filtering.md: -------------------------------------------------------------------------------- 1 | ## Class 14 Pre-work: Spam Filtering 2 | 3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html). 4 | 5 | Here are some questions to think about: 6 | 7 | - Should a spam filter optimize for sensitivity or specificity, in Paul's opinion? 8 | - Specificity, in order to minimize false positives (non-spam being incorrectly marked as spam). 9 | - Before he tried the "statistical approach" to spam filtering, what was his approach? 10 | - He hand-engineered features and used those features to compute a score. 11 | - What are the key components of his statistical filtering system? In other words, how does it work? 12 | - Scan the entire text (including headers) and tokenize it. 13 | - Count the number of occurrences of each token in the ham corpus and the spam corpus (separately). 14 | - Assign each token a "spam score" based on its relative frequency in the corpora. 15 | - For new email, only take into account the 15 most "interesting" tokens. 16 | - What did Paul say were some of the benefits of the statistical approach? 17 | - It works better (almost no false positives). 18 | - It requires less work because it discovers features automatically. 19 | - The "spam score" is interpretable. 20 | - It can easily be tuned to the individual user. 21 | - It evolves with the spam. 22 | - It creates an implicit whitelist/blacklist of email addresses, server names, etc. 23 | - How good was his prediction of the "spam of the future"? 24 | - Great! 25 | -------------------------------------------------------------------------------- /homework/14_yelp_review_text.md: -------------------------------------------------------------------------------- 1 | ## Class 14 Homework: Yelp Review Text 2 | 3 | This assignment uses the same data as the [class 10 homework](10_yelp_votes.md). This time, we will attempt to classify reviews as either 5-star or 1-star using only the review text! 4 | 5 | After each task, I recommend that you check the **shape** and the **contents** of your objects, to confirm that they match your expectations. 6 | 7 | **Homework tasks:** 8 | 9 | 1. Read `yelp.csv` into a DataFrame. 10 | 2. Create a new DataFrame that only contains the 5-star and 1-star reviews. 11 | 3. Split the new DataFrame into training and testing sets, using the review text as the only feature and the star rating as the response. 12 | 4. Use CountVectorizer to create document-term matrices from X_train and X_test. 13 | - **Hint:** If you run into a decoding error, instantiate the vectorizer with the argument `decode_error='ignore'`. 14 | 5. Use Naive Bayes to predict the star rating for reviews in the testing set, and calculate the accuracy. 15 | 6. Calculate the AUC. 16 | - **Hint 1:** Make sure to pass the predicted probabilities to `roc_auc_score`, not the predicted classes. 17 | - **Hint 2:** `roc_auc_score` will get confused if y_test contains fives and ones, so you will need to create a new object that contains ones and zeros instead. 18 | 7. Plot the ROC curve. 19 | 8. 
Print the confusion matrix, and calculate the sensitivity and specificity. Comment on the results. 20 | 9. Browse through the review text for some of the false positives and false negatives. Based on your knowledge of how Naive Bayes works, do you have any theories about why the model is incorrectly classifying these reviews? 21 | 10. Let's pretend that you want to balance sensitivity and specificity. You can achieve this by changing the threshold for predicting a 5-star review. What threshold approximately balances sensitivity and specificity? 22 | 11. Let's see how well Naive Bayes performs when all reviews are included, rather than just 1-star and 5-star reviews: 23 | - Define X and y using the original DataFrame from step 1. (y should contain 5 different classes.) 24 | - Split the data into training and testing sets. 25 | - Calculate the testing accuracy of a Naive Bayes model. 26 | - Compare the testing accuracy with the null accuracy. 27 | - Print the confusion matrix. 28 | - Comment on the results. 29 | -------------------------------------------------------------------------------- /notebooks/12_e_log_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exponential functions and logarithms" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import math\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Exponential functions" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "What is **e**? 
It is simply a number (known as Euler's number):" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "2.718281828459045" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "math.e" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "**e** is a significant number, because it is the base rate of growth shared by all continually growing processes.\n", 63 | "\n", 64 | "For example, if I have **10 dollars**, and it grows 100% in 1 year (compounding continuously), I end up with **10\\*e^1 dollars**:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "27.18281828459045" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# 100% growth for 1 year\n", 87 | "10 * np.exp(1)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "73.890560989306508" 101 | ] 102 | }, 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "# 100% growth for 2 years\n", 110 | "10 * np.exp(2)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Side note: When e is raised to a power, it is known as **the exponential function**. Technically, any number can be the base, and it would still be known as **an exponential function** (such as 2^5). But in our context, the base of the exponential function is assumed to be e.\n", 118 | "\n", 119 | "Anyway, what if I only have 20% growth instead of 100% growth?" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "12.214027581601698" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# 20% growth for 1 year\n", 142 | "10 * np.exp(0.20)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "14.918246976412703" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "# 20% growth for 2 years\n", 165 | "10 * np.exp(0.20 * 2)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## Logarithms" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "What is the **(natural) logarithm**? It gives you the time needed to reach a certain level of growth. 
For example, if I want growth by a factor of 2.718, it will take me 1 unit of time (assuming a 100% growth rate):" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "0.99989631572895199" 193 | ] 194 | }, 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# time needed to grow 1 unit to 2.718 units\n", 202 | "np.log(2.718)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "If I want growth by a factor of 7.389, it will take me 2 units of time:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 8, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "1.9999924078065106" 223 | ] 224 | }, 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "# time needed to grow 1 unit to 7.389 units\n", 232 | "np.log(7.389)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "If I want growth by a factor of 1, it will take me 0 units of time:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 9, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "0.0" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "# time needed to grow 1 unit to 1 unit\n", 262 | "np.log(1)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "If I want growth by a factor of 0.5, it will take me -0.693 units of time (which is like looking back in time):" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 10, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "-0.69314718055994529" 283 | ] 284 | }, 285 | "execution_count": 10, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "# time needed to grow 1 unit to 0.5 units\n", 292 | "np.log(0.5)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Connecting the concepts" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "As you can see, the exponential function and the natural logarithm are **inverses** of one another:" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 11, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "5.0" 320 | ] 321 | }, 322 | "execution_count": 11, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "np.log(np.exp(5))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 12, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "4.9999999999999991" 342 | ] 343 | }, 344 | "execution_count": 12, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "np.exp(np.log(5))" 351 | ] 352 | } 
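Taken together, the compound-growth and inverse-function examples above fit in a short standalone script (a minimal sketch that assumes only math and numpy, mirroring the values shown in the cell outputs):

import math
import numpy as np

math.e                      # Euler's number: 2.718281828...

# continuous compounding: principal * e^(rate * time)
10 * np.exp(1)              # 100% growth for 1 year: about 27.18
10 * np.exp(0.20 * 2)       # 20% growth for 2 years: about 14.92

# the natural log answers the reverse question:
# how much time is needed to reach a given growth factor (at a 100% rate)?
np.log(2.718)               # about 1 unit of time
np.log(7.389)               # about 2 units of time
np.log(0.5)                 # about -0.693 units of time (looking back in time)

# the exponential function and the natural log are inverses
np.log(np.exp(5))           # 5.0
np.exp(np.log(5))           # 5.0 (up to floating-point error)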
353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 2", 357 | "language": "python", 358 | "name": "python2" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 2 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython2", 370 | "version": "2.7.6" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 0 375 | } 376 | -------------------------------------------------------------------------------- /notebooks/14_naive_bayes_spam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Applying Naive Bayes classification to spam filtering" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let's pretend we have an email with three words: \"Send money now.\" We'll use Naive Bayes to classify it as **ham or spam.**\n", 15 | "\n", 16 | "$$P(spam \\ | \\ \\text{send money now}) = \\frac {P(\\text{send money now} \\ | \\ spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 17 | "\n", 18 | "By assuming that the features (the words) are **conditionally independent**, we can simplify the likelihood function:\n", 19 | "\n", 20 | "$$P(spam \\ | \\ \\text{send money now}) \\approx \\frac {P(\\text{send} \\ | \\ spam) \\times P(\\text{money} \\ | \\ spam) \\times P(\\text{now} \\ | \\ spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 21 | "\n", 22 | "We can calculate all of the values in the numerator by examining a corpus of **spam email**:\n", 23 | "\n", 24 | "$$P(spam \\ | \\ \\text{send money now}) \\approx \\frac {0.2 \\times 0.1 \\times 0.1 \\times 0.9} {P(\\text{send money now})} = \\frac {0.0018} {P(\\text{send money now})}$$\n", 25 | "\n", 26 | "We would repeat this process with a corpus of **ham email**:\n", 27 | "\n", 28 | "$$P(ham \\ | \\ \\text{send money now}) \\approx \\frac {0.05 \\times 0.01 \\times 0.1 \\times 0.1} {P(\\text{send money now})} = \\frac {0.000005} {P(\\text{send money now})}$$\n", 29 | "\n", 30 | "All we care about is whether spam or ham has the **higher probability**, and so we predict that the email is **spam**." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Key takeaways\n", 38 | "\n", 39 | "- The **\"naive\" assumption** of Naive Bayes (that the features are conditionally independent) is critical to making these calculations simple.\n", 40 | "- The **normalization constant** (the denominator) can be ignored since it's the same for all classes.\n", 41 | "- The **prior probability** is much less relevant once you have a lot of features." 
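The toy calculation above is easy to verify with a few lines of Python (a sketch that reuses the made-up likelihoods and class priors from the example; the denominator is dropped because it is identical for both classes):

# hypothetical likelihoods "estimated" from the spam and ham corpora
p_word_given_spam = {'send': 0.2, 'money': 0.1, 'now': 0.1}
p_word_given_ham = {'send': 0.05, 'money': 0.01, 'now': 0.1}
p_spam = 0.9        # prior probability of spam
p_ham = 0.1         # prior probability of ham

# numerator of Bayes' theorem for each class
email = ['send', 'money', 'now']
spam_score = p_spam
ham_score = p_ham
for word in email:
    spam_score = spam_score * p_word_given_spam[word]
    ham_score = ham_score * p_word_given_ham[word]

spam_score                  # 0.0018
ham_score                   # 0.000005
spam_score > ham_score      # True, so predict spam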
42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Comparing Naive Bayes with other models\n", 49 | "\n", 50 | "Advantages of Naive Bayes:\n", 51 | "\n", 52 | "- Model training and prediction are very fast\n", 53 | "- Somewhat interpretable\n", 54 | "- No tuning is required\n", 55 | "- Features don't need scaling\n", 56 | "- Insensitive to irrelevant features (with enough observations)\n", 57 | "- Performs better than logistic regression when the training set is very small\n", 58 | "\n", 59 | "Disadvantages of Naive Bayes:\n", 60 | "\n", 61 | "- Predicted probabilities are not well-calibrated\n", 62 | "- Correlated features can be problematic (due to the independence assumption)\n", 63 | "- Can't handle negative features (with Multinomial Naive Bayes)\n", 64 | "- Has a higher \"asymptotic error\" than logistic regression" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 2", 71 | "language": "python", 72 | "name": "python2" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 2 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython2", 84 | "version": "2.7.6" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 0 89 | } 90 | -------------------------------------------------------------------------------- /notebooks/14_types_of_naive_bayes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Comparing Multinomial and Gaussian Naive Bayes\n", 8 | "\n", 9 | "scikit-learn documentation: [MultinomialNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html) and [GaussianNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)\n", 10 | "\n", 11 | "Dataset: [Pima Indians Diabetes](https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes) from the UCI Machine Learning Repository" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "# read the data\n", 23 | "import pandas as pd\n", 24 | "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'\n", 25 | "col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']\n", 26 | "pima = pd.read_csv(url, header=None, names=col_names)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
   pregnant  glucose  bp  skin  insulin   bmi  pedigree  age  label
0         6      148  72    35        0  33.6     0.627   50      1
1         1       85  66    29        0  26.6     0.351   31      0
2         8      183  64     0        0  23.3     0.672   32      1
3         1       89  66    23       94  28.1     0.167   21      0
4         0      137  40    35      168  43.1     2.288   33      1
" 119 | ], 120 | "text/plain": [ 121 | " pregnant glucose bp skin insulin bmi pedigree age label\n", 122 | "0 6 148 72 35 0 33.6 0.627 50 1\n", 123 | "1 1 85 66 29 0 26.6 0.351 31 0\n", 124 | "2 8 183 64 0 0 23.3 0.672 32 1\n", 125 | "3 1 89 66 23 94 28.1 0.167 21 0\n", 126 | "4 0 137 40 35 168 43.1 2.288 33 1" 127 | ] 128 | }, 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "# notice that all features are continuous\n", 136 | "pima.head()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 3, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "# create X and y\n", 148 | "X = pima.drop('label', axis=1)\n", 149 | "y = pima.label" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 4, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# split into training and testing sets\n", 161 | "from sklearn.cross_validation import train_test_split\n", 162 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 5, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "# import both Multinomial and Gaussian Naive Bayes\n", 174 | "from sklearn.naive_bayes import MultinomialNB, GaussianNB\n", 175 | "from sklearn import metrics" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "0.541666666667\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# testing accuracy of Multinomial Naive Bayes\n", 195 | "mnb = MultinomialNB()\n", 196 | "mnb.fit(X_train, y_train)\n", 197 | "y_pred_class = mnb.predict(X_test)\n", 198 | "print metrics.accuracy_score(y_test, y_pred_class)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "0.791666666667\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# testing accuracy of Gaussian Naive Bayes\n", 218 | "gnb = GaussianNB()\n", 219 | "gnb.fit(X_train, y_train)\n", 220 | "y_pred_class = gnb.predict(X_test)\n", 221 | "print metrics.accuracy_score(y_test, y_pred_class)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "**Conclusion:** When applying Naive Bayes classification to a dataset with **continuous features**, it is better to use Gaussian Naive Bayes than Multinomial Naive Bayes. The latter is suitable for datasets containing **discrete features** (e.g., word counts).\n", 229 | "\n", 230 | "Wikipedia has a short [description](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes) of Gaussian Naive Bayes, as well as an excellent [example](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification) of its usage." 
231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 2", 237 | "language": "python", 238 | "name": "python2" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 2 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython2", 250 | "version": "2.7.6" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 0 255 | } 256 | -------------------------------------------------------------------------------- /notebooks/images/bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/bias_variance.png -------------------------------------------------------------------------------- /notebooks/images/cross_validation_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/cross_validation_diagram.png -------------------------------------------------------------------------------- /notebooks/images/crowdflower_ensembling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/crowdflower_ensembling.jpg -------------------------------------------------------------------------------- /notebooks/images/driver_ensembling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/driver_ensembling.png -------------------------------------------------------------------------------- /notebooks/images/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/estimating_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/iris_01nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_01nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_05nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_05nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_15nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_15nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_50nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_50nn_map.png 
-------------------------------------------------------------------------------- /notebooks/images/lasso_ridge_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/lasso_ridge_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/lasso_ridge_path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/lasso_ridge_path.png -------------------------------------------------------------------------------- /notebooks/images/logistic_betas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/logistic_betas.png -------------------------------------------------------------------------------- /notebooks/images/obama_clinton_tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/obama_clinton_tree.jpg -------------------------------------------------------------------------------- /notebooks/images/polynomial_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/polynomial_overfitting.png -------------------------------------------------------------------------------- /notebooks/images/salary_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_color.png -------------------------------------------------------------------------------- /notebooks/images/salary_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_regions.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree_annotated.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree_deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree_deep.png -------------------------------------------------------------------------------- /notebooks/images/supervised_learning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/supervised_learning.png -------------------------------------------------------------------------------- /notebooks/images/train_test_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/train_test_split.png -------------------------------------------------------------------------------- /notebooks/images/training_testing_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/training_testing_error.png -------------------------------------------------------------------------------- /notebooks/images/tree_bikeshare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_bikeshare.png -------------------------------------------------------------------------------- /notebooks/images/tree_titanic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_titanic.png -------------------------------------------------------------------------------- /notebooks/images/tree_vehicles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_vehicles.png -------------------------------------------------------------------------------- /notebooks/images/tree_vs_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_vs_linear.png -------------------------------------------------------------------------------- /other/02_exercise_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/other/02_exercise_output.png -------------------------------------------------------------------------------- /other/02_file_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/other/02_file_tree.png -------------------------------------------------------------------------------- /other/advice.md: -------------------------------------------------------------------------------- 1 | ## What's Next? 2 | 3 | Here is my best advice for **getting better at data science**: Find "the thing" that motivates you to practice what you learned and to learn more, and then do that thing. That could be personal data science projects, Kaggle competitions, online courses, reading books, reading blogs, attending meetups or conferences, or something else. 4 | 5 | If you create your own **data science projects**, I'd encourage you to share them on GitHub and include writeups. 
That will help to show others that you know how to do [proper data science](http://simplystatistics.org/2015/03/17/data-science-done-well-looks-easy-and-that-is-a-big-problem-for-data-scientists/). 6 | 7 | **Kaggle competitions** are a great way to practice data science without coming up with the problem yourself. Don't worry about how high you place, just focus on learning something new with every competition. Spend as much time as possible reading the forums, because you'll learn a lot, but don't spend time in the forums at the expense of working on the competition yourself. Also, keep in mind that you won't be practicing important parts of the data science workflow, namely generating questions, gathering data, and communicating results. 8 | 9 | There are many **online courses** to consider, and new ones being created all the time: 10 | 11 | * Coursera's [Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1) is 9 courses, plus a capstone project. There is a lot of overlap with General Assembly's course, and course quality varies, but you would definitely learn a lot of R. 12 | * Coursera's [Machine Learning](https://www.coursera.org/learn/machine-learning/) is Andrew Ng's highly regarded course. It goes deeper into many topics we covered, and covers many topics we didn't. Keep in mind that it focuses only on machine learning (not the entire data science workflow), the programming assignments use MATLAB/Octave, and it requires some understanding of linear algebra. Browse these [lecture notes](http://www.holehouse.org/mlclass/) (compiled by a student) for a preview of the course. 13 | * Stanford's [Statistical Learning](http://online.stanford.edu/course/statistical-learning) also covers some topics that we did not. It focuses on teaching machine learning at a conceptual (rather than mathematical) level, when possible. The course may be offered again in 2016, but the real gem from the course is the book and videos (linked below). 14 | * Caltech's [Learning from Data](http://work.caltech.edu/telecourse.html) teaches machine learning at a theoretical and conceptual level. The lectures and slides are excellent. The homework assignments are not interactive, and the course does not use a specific programming language. 15 | * Udacity's [Data Analyst Nanodegree](https://www.udacity.com/course/data-analyst-nanodegree--nd002) looks promising, but I don't know anyone who has done it. 16 | * Thinkful's [Data Science in Python](https://www.thinkful.com/courses/learn-data-science-online/) course or SlideRule's [Data Science Intensive](https://www.mysliderule.com/workshops/data-science-intensive) may be a good way to practice our course material with guidance from an expert mentor. 17 | * [Dataquest](https://www.dataquest.io) is an online platform rather than a traditional course, and allows you to learn and practice data science through interactive exercises. Not all of the lessons are free, but new lessons are frequently being added. 18 | * edX's [Introduction to Computer Science and Programming Using Python](https://www.edx.org/course/introduction-computer-science-mitx-6-00-1x7) is apparently an excellent course if you want to get better at programming in Python. 19 | * Coursera recently added many other data science-related [specializations and courses](https://www.coursera.org/browse/data-science?languages=en), most of which I am not familiar with. However, [CourseTalk](https://www.coursetalk.com/) is useful for reading reviews of online courses. 
20 | * Some additional courses are listed in the [Additional Resources](../README.md#additional-resources-1) section of the main README. 21 | * I will also be teaching [my own online courses](http://www.dataschool.io/learn/), which will range in level from beginner to advanced. (Subscribe to my [email newsletter](http://www.dataschool.io/subscribe/) to be notified when courses are announced.) 22 | 23 | Here is just a tiny selection of **books**: 24 | * [An Introduction to Statistical Learning with Applications in R](http://www-bcf.usc.edu/~gareth/ISL/) is my favorite book on machine learning because of the thoughtful way in which the material is presented. The Statistical Learning course linked above uses it as the course textbook, and the [related videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/) are available on YouTube. 25 | * [Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/) is by the same authors. It covers a wider variety of topics, and in greater mathematical depth. 26 | * [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do) was written by the creator of Pandas, and is especially useful if you want to go deeper into Pandas and NumPy. 27 | * [Python Machine Learning](https://github.com/rasbt/python-machine-learning-book) came out in September 2015. The author, [Sebastian Raschka](http://sebastianraschka.com/articles.html), is an excellent writer and has a deep understanding of both machine learning and scikit-learn, so I suspect it is worth reading. 28 | 29 | There are an overwhelming number of data science **blogs and newsletters**. If you want to read just one site, [DataTau](http://www.datatau.com/news) is the best aggregator. [Data Elixir](http://dataelixir.com/) is the best newsletter, though the [O'Reilly Data Newsletter](http://www.oreilly.com/data/newsletter.html) and [Python Weekly](http://www.pythonweekly.com/) are also good. Other notable blogs include: [no free hunch](http://blog.kaggle.com/) (Kaggle's blog), [The Yhat blog](http://blog.yhathq.com/) (lots of Python and R content), [Practical Business Python](http://pbpython.com/) (accessible Python content), [Simply Statistics](http://simplystatistics.org/) (a bit more academic), [FastML](http://fastml.com/) (machine learning content), [Win-Vector blog](http://www.win-vector.com/blog/) (great data science advice), [FiveThirtyEight](http://fivethirtyeight.com/) (data journalism), and [Data School](http://www.dataschool.io/) (my blog). 30 | 31 | If you prefer **podcasts**, I don't have any personal recommendations, though this [list](https://blog.growth.supply/the-7-best-data-science-and-machine-learning-podcasts-e8f0d5a4a419) gives a nice summary of seven data science podcasts that the author recommends. 32 | 33 | There are tons of data-related **meetups in DC**, and most of them are organized by Data Community DC. Check out the [calendar](http://www.datacommunitydc.org/calendar/) or just subscribe to their [weekly newsletter](http://www.datacommunitydc.org/newsletter/). [District Data Labs](http://www.districtdatalabs.com/) also offers data science workshops and project opportunities in DC. 34 | 35 | Some notable data science **conferences** are [KDD](http://www.kdd.org/), [Strata](http://strataconf.com/), [PyCon](https://us.pycon.org), [PyData](http://pydata.org/), and [SciPy](http://scipy.org/). 
36 | 37 | If you want to go **full-time** with your data science education, read this [guide to data science bootcamps](http://yet-another-data-blog.blogspot.com/2014/04/data-science-bootcamp-landscape-full.html), and this [other guide](http://www.skilledup.com/articles/list-data-science-bootcamps) which also includes part-time and online programs. Or, check out this massive list of [colleges and universities](http://datascience.community/colleges) with data science-related degrees. 38 | 39 | Finally, Dataquest's blog post on [How to actually learn data science](https://www.dataquest.io/blog/how-to-actually-learn-data-science/) has some additional advice that may be useful to you. 40 | -------------------------------------------------------------------------------- /other/model_comparison.md: -------------------------------------------------------------------------------- 1 | # Comparison of Machine Learning Models ([table](http://www.dataschool.io/comparing-supervised-learning-algorithms/)) 2 | 3 | ## K-nearest neighbors (KNN) 4 | 5 | **Advantages:** 6 | 7 | - Simple to understand and explain 8 | - Model training is fast 9 | - Can be used for classification and regression 10 | 11 | **Disadvantages:** 12 | 13 | - Must store all of the training data 14 | - Prediction phase can be slow when n is large 15 | - Sensitive to irrelevant features 16 | - Sensitive to the scale of the data 17 | - Accuracy is (generally) not competitive with the best supervised learning methods 18 | 19 | ## Linear Regression 20 | 21 | **Advantages:** 22 | 23 | - Simple to explain 24 | - Highly interpretable 25 | - Model training and prediction are fast 26 | - No tuning is required (excluding regularization) 27 | - Features don't need scaling 28 | - Can perform well with a small number of observations 29 | - Well-understood 30 | 31 | **Disadvantages:** 32 | 33 | - Presumes a linear relationship between the features and the response 34 | - Performance is (generally) not competitive with the best supervised learning methods due to high bias 35 | - Can't automatically learn feature interactions 36 | 37 | ## Logistic Regression 38 | 39 | **Advantages:** 40 | 41 | - Highly interpretable (if you remember how) 42 | - Model training and prediction are fast 43 | - No tuning is required (excluding regularization) 44 | - Features don't need scaling 45 | - Can perform well with a small number of observations 46 | - Outputs well-calibrated predicted probabilities 47 | 48 | **Disadvantages:** 49 | 50 | - Presumes a linear relationship between the features and the log-odds of the response 51 | - Performance is (generally) not competitive with the best supervised learning methods 52 | - Can't automatically learn feature interactions 53 | 54 | ## Naive Bayes 55 | 56 | **Advantages:** 57 | 58 | - Model training and prediction are very fast 59 | - Somewhat interpretable 60 | - No tuning is required 61 | - Features don't need scaling 62 | - Insensitive to irrelevant features (with enough observations) 63 | - Performs better than logistic regression when the training set is very small 64 | 65 | **Disadvantages:** 66 | 67 | - Predicted probabilities are not well-calibrated 68 | - Correlated features can be problematic (due to the independence assumption) 69 | - Can't handle negative features (with Multinomial Naive Bayes) 70 | - Has a higher "asymptotic error" than logistic regression 71 | 72 | ## Decision Trees 73 | 74 | **Advantages:** 75 | 76 | - Can be used for regression or classification 77 | - Can be displayed graphically 78 | - 
Highly interpretable 79 | - Can be specified as a series of rules, and more closely approximate human decision-making than other models 80 | - Prediction is fast 81 | - Features don't need scaling 82 | - Automatically learns feature interactions 83 | - Tends to ignore irrelevant features 84 | - Non-parametric (will outperform linear models if relationship between features and response is highly non-linear) 85 | 86 | **Disadvantages:** 87 | 88 | - Performance is (generally) not competitive with the best supervised learning methods 89 | - Can easily overfit the training data (tuning is required) 90 | - Small variations in the data can result in a completely different tree (high variance) 91 | - Recursive binary splitting makes "locally optimal" decisions that may not result in a globally optimal tree 92 | - Doesn't tend to work well if the classes are highly unbalanced 93 | - Doesn't tend to work well with very small datasets 94 | 95 | ## Random Forests 96 | 97 | **Advantages (compared to decision trees):** 98 | 99 | - Performance is competitive with the best supervised learning methods 100 | - Provides a more reliable estimate of feature importance 101 | - Allows you to estimate out-of-sample error without using train/test split or cross-validation 102 | 103 | **Disadvantages (compared to decision trees):** 104 | 105 | - Less interpretable 106 | - Slower to train 107 | - Slower to predict 108 | 109 | ## Regularized Linear Models 110 | 111 | **Advantages (compared to unregularized linear models):** 112 | 113 | - Better performance 114 | - L1 regularization performs automatic feature selection 115 | - Useful for high-dimensional problems (p > n) 116 | 117 | **Disadvantages (compared to unregularized linear models):** 118 | 119 | - Tuning is required 120 | - Feature scaling is recommended 121 | - Less interpretable (due to feature scaling) 122 | -------------------------------------------------------------------------------- /other/model_evaluation_comparison.md: -------------------------------------------------------------------------------- 1 | ## Comparing Model Evaluation Procedures 2 | 3 | **Training and testing on the same data** 4 | 5 | - Goal is to estimate likely performance of a model on out-of-sample data 6 | - But, maximizing training performance rewards overly complex models that won't necessarily generalize 7 | - Unnecessarily complex models overfit the training data: 8 | - Will do well when tested using the in-sample data 9 | - May do poorly on out-of-sample data 10 | - Learns the "noise" in the data rather than the "signal" 11 | 12 | **Train/test split** 13 | 14 | - Split the dataset into two pieces, so that the model can be trained and tested on different data 15 | - Testing performance is a better estimate of out-of-sample performance (compared to training performance) 16 | - But, it provides a high variance estimate since changing which observations happen to be in the testing set can significantly change testing performance 17 | - Allows you to easily inspect your testing results (via confusion matrix or ROC curve) 18 | 19 | **K-fold cross-validation** 20 | 21 | - Systematically create "K" train/test splits and average the results together 22 | - Cross-validated performance is a more reliable estimate of out-of-sample performance (compared to testing performance) 23 | - Runs "K" times slower than train/test split 24 | 25 | ## Comparing Evaluation Metrics for Classification Problems 26 | 27 | **Classification accuracy/error** 28 | 29 | - Classification accuracy is the 
percentage of correct predictions (higher is better) 30 | - Classification error is the percentage of incorrect predictions (lower is better) 31 | - Easiest classification metric to understand 32 | 33 | **Confusion matrix** 34 | 35 | - Confusion matrix gives you a better understanding of how your classifier is performing 36 | - Allows you to calculate sensitivity, specificity, and many other metrics that might match your business objective better than accuracy 37 | 38 | **ROC curves and Area Under the Curve (AUC)** 39 | 40 | - Allows you to visualize the performance of your classifier across all possible classification thresholds, thus helping you to choose a threshold that appropriately balances sensitivity and specificity 41 | - Still useful when there is high class imbalance (unlike classification accuracy/error) 42 | - Harder to use when there are more than two response classes 43 | 44 | **Log loss** 45 | 46 | - Most useful when well-calibrated predicted probabilities are important to your business objective 47 | 48 | ## Comparing Evaluation Metrics for Regression Problems 49 | 50 | **Mean Absolute Error (MAE)** 51 | 52 | - Mean of the absolute value of the errors 53 | - Easiest regression metric to understand 54 | 55 | **Mean Squared Error (MSE)** 56 | 57 | - Mean of the squared errors 58 | - More popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world 59 | 60 | **Root Mean Squared Error (RMSE)** 61 | 62 | - Square root of the mean of the squared errors 63 | - Even more popular than MSE, because RMSE is interpretable in the "y" units 64 | -------------------------------------------------------------------------------- /other/python_packages.md: -------------------------------------------------------------------------------- 1 | ## List of Python packages used in the course 2 | 3 | Note: Some of these packages have dependencies that will also need to be installed. 4 | 5 | ### Included with Anaconda ([complete list](http://docs.continuum.io/anaconda/pkg-docs.html)) 6 | * beautiful-soup 7 | * ipython 8 | * ipython-notebook 9 | * matplotlib 10 | * nltk 11 | * numpy 12 | * pandas 13 | * pip 14 | * requests 15 | * scikit-learn 16 | * scipy 17 | 18 | ### Available for installation via `conda` 19 | * seaborn 20 | 21 | ### Available for installation via `pip` 22 | * textblob 23 | -------------------------------------------------------------------------------- /other/setup_checklist.md: -------------------------------------------------------------------------------- 1 | ## Setup checklist 2 | 3 | This is a checklist to confirm that your laptop is set up properly for DAT8. If at any point you get an error message, please note the error message and we will help you to fix it! If you don't get any error messages, you are properly set up. 4 | 5 | ### GitHub 6 | * Log into your GitHub account, and "star" the DAT8 repository (the one you are looking at right now) by clicking the Star button in the upper right corner of the screen. 7 | 8 | ### Git 9 | * Open a command line application: 10 | * For Windows, we recommend [Git Bash](http://git-scm.com/download/win) instead of Git Shell (which uses Powershell). 11 | * For Mac, you will probably be using Terminal, or another command line tool of your choice. 
12 | * Type `git config --global user.name "YourFirstName YourLastName"` (including the quotes) 13 | * Type `git config --global user.email "youremail@domain.com"` (use the email address associated with your GitHub account) 14 | 15 | ### Python 16 | * While still at the command line: 17 | * Type `conda list` (if you choose not to use Anaconda, this will generate an error) 18 | * Type `pip install textblob` 19 | * Type `python` to open the Python interpreter 20 | * While in the Python interpreter: 21 | * Look at the Python version number. It should start with 2.7. If your version number starts with 3, that's fine as long as you are aware of the differences between Python 2 and 3. 22 | * Type `import pandas` 23 | * Type `import textblob` 24 | * Type `exit()` to exit the interpreter. You can now close the command line application. 25 | * Open Spyder (if you can't find Spyder, look for the Anaconda Launcher application) 26 | * In the console (probably on the right side of the screen), type `import pandas` 27 | * Type `import textblob` 28 | * If this worked from the interpreter but not in Spyder, and you're using a Mac, try opening the PYTHONPATH manager (in Spyder) and adding a path to where textblob was installed (such as `/Users/yourname/anaconda/lib/python2.7/site-packages/`). Then, restart Spyder. 29 | -------------------------------------------------------------------------------- /project/peer_review.md: -------------------------------------------------------------------------------- 1 | ## Peer Review Guidelines 2 | 3 | You will be assigned to review the project drafts of two of your peers, and will provide them with feedback via a private group in Slack that we will set up for you. 4 | 5 | Expectations: 6 | * Read everything they wrote! 7 | * If they provided their data, review it and try to understand it. 8 | * Read their code and try to understand their thought process. 9 | * If their code can be run, try running it. 10 | * Spend at least one hour reviewing their project (including the time it takes to write the feedback). 11 | 12 | Your feedback would ideally consist of: 13 | * Strengths of their project (things you particularly like about it) 14 | * Comments about things you think could be improved 15 | * Questions about things you don't understand 16 | * Comments about their code 17 | * Links to resources or code snippets that might be useful to them 18 | * Suggestions for next steps 19 | * Guiding principle: Give feedback that would be helpful to you if it was your project! 20 | 21 | You should take a quick glance through their project as soon as possible, to make sure you understand what they have given you and what files you should be reviewing. If you're unclear, ask them about it! 
22 | -------------------------------------------------------------------------------- /project/public_data.md: -------------------------------------------------------------------------------- 1 | ## Public Data Sources 2 | 3 | * Open data catalogs from various governments and NGOs: 4 | * [NYC Open Data](https://nycopendata.socrata.com/) 5 | * [DC Open Data Catalog](http://data.dc.gov/) / [OpenDataDC](http://www.opendatadc.org/) 6 | * [DataLA](https://data.lacity.org/) 7 | * [data.gov](https://www.data.gov/) (see also: [Project Open Data Dashboard](http://data.civicagency.org/)) 8 | * [data.gov.uk](http://data.gov.uk/) 9 | * [US Census Bureau](http://www.census.gov/) 10 | * [World Bank Open Data](http://data.worldbank.org/) 11 | * [Humanitarian Data Exchange](http://docs.hdx.rwlabs.org/) 12 | * [Sunlight Foundation](http://sunlightfoundation.com/api/): government-focused data 13 | * [ProPublica Data Store](https://projects.propublica.org/data-store/) 14 | * Datasets hosted by academic institutions: 15 | * [UC Irvine Machine Learning Repository](http://archive.ics.uci.edu/ml/): datasets specifically designed for machine learning 16 | * [Stanford Large Network Dataset Collection](http://snap.stanford.edu/data/): graph data 17 | * [Inter-university Consortium for Political and Social Research](http://www.icpsr.umich.edu/) 18 | * [Pittsburgh Science of Learning Center's DataShop](http://www.learnlab.org/technologies/datashop/) 19 | * [Academic Torrents](http://academictorrents.com/): distributed network for sharing large research datasets 20 | * [Dataverse Project](http://dataverse.org/): searchable archive of research data 21 | * Datasets hosted by private companies: 22 | * [Quandl](https://www.quandl.com/): over 10 million financial, economic, and social datasets 23 | * [Amazon Web Services Public Data Sets](http://aws.amazon.com/datasets/) 24 | * [Kaggle](http://www.kaggle.com/) provides datasets with their challenges, but each competition has its own rules as to whether the data can be used outside of the scope of the competition. 
25 | * Big lists of datasets: 26 | * [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets): Well-organized and frequently updated 27 | * [Rdatasets](http://vincentarelbundock.github.io/Rdatasets/): collection of 700+ datasets originally distributed with R packages 28 | * [RDataMining.com](http://www.rdatamining.com/resources/data) 29 | * [KDnuggets](http://www.kdnuggets.com/datasets/index.html) 30 | * [inside-R](http://www.inside-r.org/howto/finding-data-internet) 31 | * [100+ Interesting Data Sets for Statistics](http://rs.io/2014/05/29/list-of-data-sets.html) 32 | * [20 Free Big Data Sources](http://smartdatacollective.com/bernardmarr/235366/big-data-20-free-big-data-sources-everyone-should-know) 33 | * [Sebastian Raschka](https://github.com/rasbt/pattern_classification/blob/master/resources/dataset_collections.md): datasets categorized by format and topic 34 | * APIs: 35 | * [Apigee](https://apigee.com/providers): explore dozens of popular APIs 36 | * [Mashape](https://www.mashape.com/): explore hundreds of APIs 37 | * [Python APIs](http://www.pythonforbeginners.com/api/list-of-python-apis): Python wrappers for many APIs 38 | * Other interesting datasets: 39 | * [FiveThirtyEight](https://github.com/fivethirtyeight/data): data and code related to their articles 40 | * [The Upshot](https://github.com/TheUpshot/): data related to their articles 41 | * [Yelp Dataset Challenge](http://www.yelp.com/dataset_challenge): Yelp reviews, business attributes, users, and more from 10 cities 42 | * [Donors Choose](http://data.donorschoose.org/open-data/overview/): data related to their projects 43 | * [200,000+ Jeopardy questions](http://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/) 44 | * [CrowdFlower](http://www.crowdflower.com/data-for-everyone): interesting datasets created or enhanced by their contributors 45 | * [UFO reports](https://github.com/planetsig/ufo-reports): geolocated and time-standardized UFO reports for close to a century 46 | * [Reddit Top 2.5 Million](https://github.com/umbrae/reddit-top-2.5-million): all-time top 1,000 posts from each of the top 2,500 subreddits 47 | * Other resources: 48 | * [Datasets subreddit](http://www.reddit.com/r/datasets/): ask for help finding a specific data set, or post your own 49 | * [Center for Data Innovation](http://www.datainnovation.org/category/publications/data-set-blog/): blog posts about interesting, recently-released data sets. 
50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | textblob 3 | -------------------------------------------------------------------------------- /slides/01_course_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_course_overview.pdf -------------------------------------------------------------------------------- /slides/01_course_overview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_course_overview.pptx -------------------------------------------------------------------------------- /slides/01_intro_to_data_science.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_intro_to_data_science.pdf -------------------------------------------------------------------------------- /slides/01_intro_to_data_science.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_intro_to_data_science.pptx -------------------------------------------------------------------------------- /slides/01_types_of_data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_types_of_data.pdf -------------------------------------------------------------------------------- /slides/01_types_of_data.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_types_of_data.pptx -------------------------------------------------------------------------------- /slides/02_git_github.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/02_git_github.pdf -------------------------------------------------------------------------------- /slides/02_git_github.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/02_git_github.pptx -------------------------------------------------------------------------------- /slides/06_machine_learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/06_machine_learning.pdf -------------------------------------------------------------------------------- /slides/06_machine_learning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/06_machine_learning.pptx -------------------------------------------------------------------------------- /slides/12_confusion_matrix.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/12_confusion_matrix.pdf -------------------------------------------------------------------------------- /slides/12_confusion_matrix.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/12_confusion_matrix.pptx -------------------------------------------------------------------------------- /slides/13_drawing_roc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/13_drawing_roc.pdf -------------------------------------------------------------------------------- /slides/13_drawing_roc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/13_drawing_roc.pptx -------------------------------------------------------------------------------- /slides/14_bayes_theorem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_bayes_theorem.pdf -------------------------------------------------------------------------------- /slides/14_bayes_theorem.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_bayes_theorem.pptx -------------------------------------------------------------------------------- /slides/14_naive_bayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_naive_bayes.pdf -------------------------------------------------------------------------------- /slides/14_naive_bayes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_naive_bayes.pptx -------------------------------------------------------------------------------- /slides/16_kaggle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/16_kaggle.pdf -------------------------------------------------------------------------------- /slides/16_kaggle.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/16_kaggle.pptx -------------------------------------------------------------------------------- /slides/19_clustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/19_clustering.pdf -------------------------------------------------------------------------------- /slides/19_clustering.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/19_clustering.pptx --------------------------------------------------------------------------------