├── .gitignore ├── README.md ├── code ├── 00_python_beginner_workshop.py ├── 00_python_intermediate_workshop.py ├── 02_command_line.md ├── 03_file_reading.py ├── 03_python_homework_chipotle.py ├── 03_python_homework_chipotle_explained_nb.py ├── 04_pandas.py ├── 05_pandas_homework_imdb.py ├── 05_pandas_merge_nb.py ├── 05_pandas_visualization_nb.py ├── 06_human_learning_iris_nb.py ├── 07_api.py ├── 07_web_scraping.py ├── 08_bias_variance_nb.py ├── 08_knn_sklearn_nb.py ├── 08_nba_knn_nb.py ├── 08_pandas_review_nb.py ├── 09_model_evaluation_nb.py ├── 10_linear_regression_nb.py ├── 10_yelp_votes_homework_nb.py ├── 12_e_log_examples_nb.py ├── 12_logistic_regression_nb.py ├── 12_titanic_confusion_nb.py ├── 13_advanced_model_evaluation_nb.py ├── 13_bank_exercise_nb.py ├── 13_cross_validation_nb.py ├── 14_bayes_theorem_iris_nb.py ├── 14_text_data_sklearn_nb.py ├── 14_types_of_naive_bayes_nb.py ├── 15_natural_language_processing_nb.py ├── 16_kaggle.py ├── 16_kaggle_minimal.py ├── 17_bikeshare_exercise_nb.py ├── 17_decision_trees_nb.py ├── 18_ensembling_nb.py ├── 19_advanced_sklearn_nb.py ├── 19_clustering_nb.py ├── 20_regex_exercise.py ├── 20_regex_reference.py └── 20_regularization_nb.py ├── data ├── airlines.csv ├── bank-additional.csv ├── beer.txt ├── bikeshare.csv ├── chipotle.tsv ├── drinks.csv ├── example.html ├── hitters.csv ├── homicides.txt ├── imdb_1000.csv ├── imdb_ids.txt ├── sms.tsv ├── titanic.csv ├── u.data ├── u.item ├── u.user ├── u.user_original ├── ufo.csv ├── vehicles_test.csv ├── vehicles_train.csv ├── yelp.csv └── yelp.json ├── homework ├── 02_command_line_chipotle.md ├── 09_bias_variance.md ├── 10_yelp_votes.md ├── 13_cross_validation.md ├── 13_roc_auc.md ├── 14_spam_filtering.md └── 14_yelp_review_text.md ├── notebooks ├── 03_python_homework_chipotle_explained.ipynb ├── 05_pandas_merge.ipynb ├── 05_pandas_visualization.ipynb ├── 06_human_learning_iris.ipynb ├── 08_bias_variance.ipynb ├── 08_knn_sklearn.ipynb ├── 08_nba_knn.ipynb ├── 08_pandas_review.ipynb ├── 09_model_evaluation.ipynb ├── 10_linear_regression.ipynb ├── 10_yelp_votes_homework.ipynb ├── 12_e_log_examples.ipynb ├── 12_logistic_regression.ipynb ├── 12_titanic_confusion.ipynb ├── 13_advanced_model_evaluation.ipynb ├── 13_bank_exercise.ipynb ├── 13_cross_validation.ipynb ├── 14_bayes_theorem_iris.ipynb ├── 14_naive_bayes_spam.ipynb ├── 14_text_data_sklearn.ipynb ├── 14_types_of_naive_bayes.ipynb ├── 14_yelp_review_text_homework.ipynb ├── 15_natural_language_processing.ipynb ├── 17_bikeshare_exercise.ipynb ├── 17_decision_trees.ipynb ├── 18_ensembling.ipynb ├── 19_advanced_sklearn.ipynb ├── 19_clustering.ipynb ├── 20_regularization.ipynb └── images │ ├── bias_variance.png │ ├── cross_validation_diagram.png │ ├── crowdflower_ensembling.jpg │ ├── driver_ensembling.png │ ├── estimating_coefficients.png │ ├── iris_01nn_map.png │ ├── iris_05nn_map.png │ ├── iris_15nn_map.png │ ├── iris_50nn_map.png │ ├── lasso_ridge_coefficients.png │ ├── lasso_ridge_path.png │ ├── logistic_betas.png │ ├── obama_clinton_tree.jpg │ ├── polynomial_overfitting.png │ ├── salary_color.png │ ├── salary_regions.png │ ├── salary_tree.png │ ├── salary_tree_annotated.png │ ├── salary_tree_deep.png │ ├── supervised_learning.png │ ├── train_test_split.png │ ├── training_testing_error.png │ ├── tree_bikeshare.png │ ├── tree_titanic.png │ ├── tree_vehicles.png │ └── tree_vs_linear.png ├── other ├── 02_exercise_output.png ├── 02_file_tree.png ├── advice.md ├── model_comparison.md ├── model_evaluation_comparison.md ├── python_packages.md └── 
setup_checklist.md ├── project ├── README.md ├── peer_review.md └── public_data.md ├── requirements.txt └── slides ├── 01_course_overview.pdf ├── 01_course_overview.pptx ├── 01_intro_to_data_science.pdf ├── 01_intro_to_data_science.pptx ├── 01_types_of_data.pdf ├── 01_types_of_data.pptx ├── 02_git_github.pdf ├── 02_git_github.pptx ├── 06_machine_learning.pdf ├── 06_machine_learning.pptx ├── 12_confusion_matrix.pdf ├── 12_confusion_matrix.pptx ├── 13_drawing_roc.pdf ├── 13_drawing_roc.pptx ├── 14_bayes_theorem.pdf ├── 14_bayes_theorem.pptx ├── 14_naive_bayes.pdf ├── 14_naive_bayes.pptx ├── 16_kaggle.pdf ├── 16_kaggle.pptx ├── 19_clustering.pdf └── 19_clustering.pptx /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .DS_Store 3 | *.pyc 4 | extras/ 5 | -------------------------------------------------------------------------------- /code/00_python_beginner_workshop.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Beginner Workshop 3 | ''' 4 | 5 | ''' 6 | Multi-line comments go between 3 quotation marks. 7 | You can use single or double quotes. 8 | ''' 9 | 10 | # One-line comments are preceded by the pound symbol 11 | 12 | 13 | # BASIC DATA TYPES 14 | 15 | x = 5 # creates an object 16 | print type(x) # check the type: int (not declared explicitly) 17 | type(x) # automatically prints 18 | type(5) # assigning it to a variable is not required 19 | 20 | type(5.0) # float 21 | type('five') # str 22 | type(True) # bool 23 | 24 | 25 | # LISTS 26 | 27 | nums = [5, 5.0, 'five'] # multiple data types 28 | nums # print the list 29 | type(nums) # check the type: list 30 | len(nums) # check the length: 3 31 | nums[0] # print first element 32 | nums[0] = 6 # replace a list element 33 | 34 | nums.append(7) # list 'method' that modifies the list 35 | help(nums.append) # help on this method 36 | help(nums) # help on a list object 37 | nums.remove('five') # another list method 38 | 39 | sorted(nums) # 'function' that does not modify the list 40 | nums # it was not affected 41 | nums = sorted(nums) # overwrite the original list 42 | sorted(nums, reverse=True) # optional argument 43 | 44 | 45 | # FUNCTIONS 46 | 47 | def give_me_five(): # function definition ends with colon 48 | return 5 # indentation required for function body 49 | 50 | give_me_five() # prints the return value (5) 51 | num = give_me_five() # assigns return value to a variable, doesn't print it 52 | 53 | def calc(x, y, op): # three parameters (without any defaults) 54 | if op == 'add': # conditional statement 55 | return x + y 56 | elif op == 'subtract': 57 | return x - y 58 | else: 59 | print 'Valid operations: add, subtract' 60 | 61 | calc(5, 3, 'add') 62 | calc(5, 3, 'subtract') 63 | calc(5, 3, 'multiply') 64 | calc(5, 3) 65 | 66 | 67 | # EXERCISE: Write a function that takes two parameters (hours and rate), and 68 | # returns the total pay. 
69 | 70 | def compute_pay(hours, rate): 71 | return hours * rate 72 | 73 | compute_pay(40, 10.50) 74 | 75 | 76 | # FOR LOOPS 77 | 78 | # print each list element in uppercase 79 | fruits = ['apple', 'banana', 'cherry'] 80 | for fruit in fruits: 81 | print fruit.upper() 82 | -------------------------------------------------------------------------------- /code/00_python_intermediate_workshop.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Intermediate Workshop 3 | ''' 4 | 5 | ''' 6 | LISTS 7 | ''' 8 | 9 | # creating 10 | a = [1, 2, 3, 4, 5] # create lists using brackets 11 | 12 | # slicing 13 | a[0] # returns 1 (Python is zero indexed) 14 | a[1:3] # returns [2, 3] (inclusive of first index but exclusive of second) 15 | a[-1] # returns 5 (last element) 16 | 17 | # appending 18 | a[5] = 6 # error because you can't assign outside the existing range 19 | a.append(6) # list method that appends 6 to the end 20 | a = a + [0] # use plus sign to combine lists 21 | 22 | # checking length 23 | len(a) # returns 7 24 | 25 | # checking type 26 | type(a) # returns list 27 | type(a[0]) # returns int 28 | 29 | # sorting 30 | sorted(a) # sorts the list 31 | sorted(a, reverse=True) # reverse=True is an 'optional argument' 32 | sorted(a, True) # error because optional arguments must be named 33 | 34 | 35 | ''' 36 | STRINGS 37 | ''' 38 | 39 | # creating 40 | a = 'hello' # can use single or double quotes 41 | 42 | # slicing 43 | a[0] # returns 'h' (works like list slicing) 44 | a[1:3] # returns 'el' 45 | a[-1] # returns 'o' 46 | 47 | # concatenating 48 | a + ' there' # use plus sign to combine strings 49 | 5 + ' there' # error because they are different types 50 | str(5) + ' there' # cast 5 to a string in order for this to work 51 | 52 | # uppercasing 53 | a[0] = 'H' # error because strings are immutable (can't overwrite characters) 54 | a.upper() # string method (this method doesn't exist for lists) 55 | 56 | # checking length 57 | len(a) # returns 5 (number of characters) 58 | 59 | 60 | ''' 61 | EXERCISE: 62 | 1. Create a list of the first names of your family members. 63 | 2. Print the name of the last person in the list. 64 | 3. Print the length of the name of the first person in the list. 65 | 4. Change one of the names from their real name to their nickname. 66 | 5. Append a new person to the list. 67 | 6. Change the name of the new person to lowercase using the string method 'lower'. 68 | 7. Sort the list in reverse alphabetical order. 69 | Bonus: Sort the list by the length of the names (shortest to longest). 
70 | ''' 71 | 72 | names = ['Wesley', 'Larry', 'Wan'] # list of names 73 | names[-1] # last element 74 | len(names[0]) # length of first string 75 | names[0] = 'Wes' # overwrite existing element 76 | names.append('Gabriel') # append new element 77 | names[-1] = names[-1].lower() # change last string to be lowercase 78 | sorted(names, reverse=True) # sort the list in reverse order 79 | sorted(names, key=len) # sort the list by length 80 | 81 | 82 | ''' 83 | FOR LOOPS AND LIST COMPREHENSIONS 84 | ''' 85 | 86 | # for loop to print 1 through 5 87 | nums = range(1, 6) # create a list of 1 through 5 88 | for num in nums: # num 'becomes' each list element for one loop 89 | print num 90 | 91 | # for loop to print 1, 3, 5 92 | other = [1, 3, 5] # create a different list 93 | for x in other: # name 'x' does not matter, not defined in advance 94 | print x # this loop only executes 3 times (not 5) 95 | 96 | # for loop to create a list of 2, 4, 6, 8, 10 97 | doubled = [] # create empty list to store results 98 | for num in nums: # loop through nums (will execute 5 times) 99 | doubled.append(num*2) # append the double of the current value of num 100 | 101 | # equivalent list comprehension 102 | doubled = [num*2 for num in nums] # expression (num*2) goes first, brackets 103 | # indicate we are storing results in a list 104 | 105 | 106 | ''' 107 | EXERCISE 1: 108 | Given that: letters = ['a', 'b', 'c'] 109 | Write a list comprehension that returns: ['A', 'B', 'C'] 110 | 111 | EXERCISE 2 (BONUS): 112 | Given that: word = 'abc' 113 | Write a list comprehension that returns: ['A', 'B', 'C'] 114 | 115 | EXERCISE 3 (BONUS): 116 | Given that: fruits = ['Apple', 'Banana', 'Cherry'] 117 | Write a list comprehension that returns: ['A', 'B', 'C'] 118 | ''' 119 | 120 | letters = ['a', 'b', 'c'] 121 | [letter.upper() for letter in letters] # iterate through a list of strings, 122 | # and each string has an 'upper' method 123 | word = 'abc' 124 | [letter.upper() for letter in word] # iterate through each character 125 | 126 | fruits = ['Apple', 'Banana', 'Cherry'] 127 | [fruit[0] for fruit in fruits] # slice the first character from each string 128 | 129 | 130 | ''' 131 | DICTIONARIES 132 | ''' 133 | 134 | # dictionaries are made of key-value pairs (like a real dictionary) 135 | family = {'dad':'Homer', 'mom':'Marge', 'size':2} 136 | 137 | # check the length 138 | len(family) # returns 3 (number of key-value pairs) 139 | 140 | # use the key to look up a value (fast operation regardless of dictionary size) 141 | family['dad'] # returns 'Homer' 142 | 143 | # can't use a value to look up a key 144 | family['Homer'] # error 145 | 146 | # dictionaries are unordered 147 | family[0] # error 148 | 149 | # add a new entry 150 | family['cat'] = 'snowball' 151 | 152 | # keys must be unique, so this edits an existing entry 153 | family['cat'] = 'snowball ii' 154 | 155 | # delete an entry 156 | del family['cat'] 157 | 158 | # keys can be strings or numbers or tuples, values can be any type 159 | family['kids'] = ['bart', 'lisa'] # value can be a list 160 | 161 | # accessing a list element within a dictionary 162 | family['kids'][0] # returns 'bart' 163 | 164 | # useful methods 165 | family.keys() # returns list: ['dad', 'kids', 'mom', 'size'] 166 | family.values() # returns list: ['Homer', ['bart', 'lisa'], 'Marge', 2] 167 | family.items() # returns list of tuples: 168 | # [('dad', 'Homer'), ('kids', ['bart', 'lisa']), ('mom', 'Marge'), ('size', 2)] 169 | 170 | 171 | ''' 172 | EXERCISE: 173 | 1. Print the name of the mom. 174 | 2. 
Change the size to 5. 175 | 3. Add 'Maggie' to the list of kids. 176 | 4. Fix 'bart' and 'lisa' so that the first letter is capitalized. 177 | Bonus: Do this last step using a list comprehension. 178 | ''' 179 | 180 | family['mom'] # returns 'Marge' 181 | family['size'] = 5 # replaces existing value for 'size' 182 | family['kids'].append('Maggie') # access a list, then append 'Maggie' to it 183 | family['kids'][0] = 'Bart' # capitalize names by overwriting them 184 | family['kids'][1] = 'Lisa' 185 | 186 | # or, capitalize using a list comprehension and the 'capitalize' string method 187 | family['kids'] = [kid.capitalize() for kid in family['kids']] 188 | 189 | # or, slice the string, uppercase the first letter, and concatenate with other letters 190 | family['kids'] = [kid[0].upper() + kid[1:] for kid in family['kids']] 191 | 192 | 193 | ''' 194 | REQUESTS 195 | ''' 196 | 197 | # import module (make its functions available) 198 | import requests 199 | 200 | # use requests to talk to the web 201 | r = requests.get('http://www.google.com') 202 | type(r) # special 'response' object 203 | r.text # HTML of web page stored as string 204 | type(r.text) # string is encoded as unicode 205 | r.text[0] # string can be sliced like any string 206 | 207 | 208 | ''' 209 | APIs 210 | 211 | What is an API? 212 | - Application Programming Interface 213 | - Structured way to expose specific functionality and data access to users 214 | - Web APIs usually follow the "REST" standard 215 | 216 | How to interact with a REST API: 217 | - Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response" 218 | - Most relevant request method for us is GET (other methods: POST, PUT, DELETE) 219 | - Response is often JSON format 220 | - Web console is sometimes available (allows you to explore an API) 221 | 222 | API Providers: https://apigee.com/providers 223 | Echo Nest API Console: https://apigee.com/console/echonest 224 | API key: http://bit.ly/myechonest 225 | ''' 226 | 227 | # request data from the Echo Nest API 228 | r = requests.get('http://developer.echonest.com/api/v4/artist/top_hottt?api_key=KBGUPZPJZS9PHWNIN&format=json') 229 | r.text # looks like a dictionary 230 | type(r.text) # actually stored as a string 231 | r.json() # decodes JSON 232 | type(r.json()) # JSON can be represented as a dictionary 233 | top = r.json() # store that dictionary 234 | 235 | # store the artist data 236 | artists = top['response']['artists'] # list of 15 dictionaries 237 | 238 | # create a list of artist names only 239 | names = [artist['name'] for artist in artists] # can iterate through list to access dictionaries 240 | -------------------------------------------------------------------------------- /code/02_command_line.md: -------------------------------------------------------------------------------- 1 | ## Introduction to the Command Line 2 | 3 | This document outlines basic usage of the command line. For Linux and Mac users, these commands should work in **Terminal**. For Windows users, these should work in **Git Bash**. 4 | 5 | ### What is the command line? 6 | 7 | The Command Line Interface (CLI) is a way of interacting with your computer using text-based commands. This is different from the way most people interact with their computers, using their mouse and a Graphical User Interface (GUI). 8 | 9 | ### Why should I use it? 10 | 11 | Once you become comfortable with the basics, it can be a more powerful way to use your computer. You're able to do many things more quickly and programatically. 
12 | 
13 | ### General format for commands
14 | 
15 | `<command> -<options> <arguments>`
16 | * `<command>` is the action we want the computer to take
17 | * `<options>` (or "flags") modify the behavior of the command
18 | * `<arguments>` are the things we want the command to act on
19 | 
20 | For Linux and Mac users, you can view the **man**ual for a command by typing `man <command>`. For Windows users, you can view the help page by typing `<command> --help`.
21 | 
22 | ### Tips
23 | 
24 | * If there are spaces in file or directory names, use a "\" to "escape" the space characters, or just put the entire file path in quotes.
25 | * After typing the first few letters of a file or directory name, you can hit Tab to auto-complete the name. (This often auto-escapes spaces for you.)
26 | * Use the up and down arrow keys to navigate previously entered commands.
27 | 
28 | ### File paths
29 | 
30 | A **relative file path** specifies the path to a file, taking into account your current working directory. For example, if you were to give someone "relative" directions to your house, you would give them directions from their current location (the relative path from where they are to where you are).
31 | 
32 | An **absolute file path** specifies the complete path to a file, ignoring your current working directory. For example, if you were to give someone "absolute" directions to your house, you would start by telling them to be on earth, then go to your continent, then go to your country, then go to your region, etc.
33 | 
34 | 
35 | ### Basic commands
36 | 
37 | ##### `pwd`
38 | * **p**rints **w**orking **d**irectory (the directory you are currently in)
39 | 
40 | ##### `ls`
41 | * **l**i**s**ts files and subdirectories in your working directory
42 | * `ls -a` lists **a**ll files, including hidden files
43 | * `ls -l` lists the files in a **l**ong format with extra information (permissions, size, last modified date, etc.)
44 | * `ls *` also lists the contents of subdirectories (one level deep) in your working directory
45 | * `ls <path>` lists files in a specific directory (without changing your working directory)
46 | 
47 | ##### `clear`
48 | * **clear**s all output from your console
49 | 
50 | ##### `cd`
51 | * `cd <path>` **c**hanges **d**irectory to the path you specify, which can be a relative path or an absolute path
52 | * `cd ..` moves you "up" one directory (to the parent directory)
53 | * `cd` moves you to your "home" directory
54 | 
55 | ##### `mkdir`
56 | * `mkdir <dirname>` **m**a**k**es a new **dir**ectory called `<dirname>`
57 | 
58 | ##### `touch`
59 | * `touch <filename>` creates an empty file called `<filename>`
60 | * This is useful for creating empty files to be edited at a later time.
61 | * You can create multiple empty files with a single command: `touch <file1> <file2> <file3> ...`
62 | 
63 | ##### `rm -i`
64 | * `rm <filename>` **r**e**m**oves (deletes) a file permanently
65 | * `rm -i <filename>` removes files in **i**nteractive mode, in which you are prompted to confirm that you really want to delete the file. It's best to always use `rm -i`.
66 | * `rm -ir <directoryname>` removes a directory and **r**ecursively deletes all of its contents
67 | 
68 | ##### `mv`
69 | * `mv <filename> <new location>` **m**o**v**es a file from its current location to `<new location>`
70 | * `mv <filename> <new filename>` renames a file without changing its location
71 | 
72 | ##### `cp`
73 | * `cp <filename> <new location>` **c**o**p**ies a file from its current location to `<new location>`, leaving the original file unchanged
74 | * `cp <filename> <new filename>` copies a file without changing its location
75 | 
76 | 
77 | ### Pre-class exercise
78 | * Open your command line interface.
79 | * Navigate to your Desktop, and confirm you are there:
80 | * Print your working directory (it should end with `Desktop`).
81 | * List your files and subdirectories (they should match what you see on your Desktop).
82 | * Create a directory called `project`.
83 | * Navigate to the `project` directory, and create the following files in it: `draft_paper.md`, `plot1.png`, `plot2.png`.
84 | * Create two subdirectories in the `project` directory: `code`, `data`
85 | * Navigate to the `code` subdirectory, and create the following files in it: `processing.py`, `exploration.py`.
86 | * Navigate to the `data` subdirectory, and create the following files in it: `original.csv`, `clean.csv`, `other.csv`.
87 | * Make a copy of `draft_paper.md` called `final_paper.md`.
88 | * Rename `plot1.png` as `scatterplot.png`, and rename `plot2.png` as `histogram.png`.
89 | * Create a subdirectory called `viz`, and then move `scatterplot.png` and `histogram.png` to `viz`.
90 | * Delete `other.csv` from the `data` subdirectory.
91 | * Navigate back to `project`, and then print out (with a single command) all of its files, subdirectories, and the contents of those subdirectories. The output should look similar to [this image](../other/02_exercise_output.png).
92 | * Viewing this [collapsible tree diagram](../other/02_file_tree.png) may help you to visualize the directory structure that we have created.
93 | 
94 | 
95 | ### Intermediate commands
96 | 
97 | ##### `head`
98 | * `head <filename>` prints the **head** (the first 10 lines) of the file
99 | * `head -n20 <filename>` prints the first 20 lines of the file
100 | * This is useful for previewing the contents of a large file without opening it.
101 | 
102 | ##### `tail`
103 | * `tail <filename>` prints the **tail** (the last 10 lines) of the file
104 | 
105 | ##### `cat`
106 | * `cat <filename>` prints the entire file
107 | 
108 | ##### `less`
109 | * `less <filename>` allows you to page or scroll through the file
110 | * Hit the spacebar to go down a page, use the arrow keys to scroll up and down, and hit `q` to exit.
111 | 
112 | ##### `wc`
113 | * `wc <filename>` returns the **c**ount of lines, **w**ords, and characters in a file
114 | * `wc -l <filename>` only counts lines, `wc -w <filename>` only counts words, and `wc -c <filename>` only counts characters
115 | * A "word" is defined as any set of characters delimited by a space.
116 | 
117 | ##### `find`
118 | * `find <path> -name <name>` will recursively search the specified path (and its subdirectories) and **find** files and directories with a given `<name>`
119 | * Use `.` for the `<path>` to refer to the working directory.
120 | * For the `<name>`, you can search for an exact match, or use wildcard characters to search for a partial match:
121 | * `*` specifies any number of any characters, such as `find . -name *.py` or `find . -name *data*.*`
122 | * `?` specifies one character, such as `find . -name ??_*.*`
123 | 
124 | ##### `grep`
125 | * `grep <pattern> <filename>` searches a file for a **r**egular **e**xpression **p**attern and prints the matching lines
126 | * The pattern should be in quotation marks to allow for multiple words.
127 | * The pattern is case-sensitive by default, but you can use the `-i` option to **i**gnore case.
128 | * You can use wildcards in the filename to search multiple files, but it only searches the working directory (not subdirectories).
129 | * `grep -r <pattern> <path>` does a **r**ecursive search of the path (checks subdirectories) for matches within files
130 | * Use `.` for the `<path>` to refer to the working directory.
131 | * `grep -r <pattern> /` does a **g**lobal search (of your entire computer) for matches
132 | * Hit `Ctrl + c` if you want to cancel the search.
133 | * Much more complex string-matching patterns can be used.
134 | 
135 | ##### `|`
136 | * `<command 1> | <command 2>` pipes the results from `<command 1>` into `<command 2>`, and then the results of `<command 2>` are printed to the console
137 | 
138 | ##### `>`
139 | * `<command> > <filename>` takes the output of `<command>` and saves it in `<filename>`
140 | * This will overwrite the file if it already exists.
141 | 
142 | ##### `>>`
143 | * `<command> >> <filename>` takes the output of `<command>` and appends it to `<filename>`
144 | * This will create the file if it does not yet exist.
145 | 
146 | 
147 | ### Advanced commands
148 | 
149 | ##### `cut`
150 | * `cut -f1,2 <filename>` **cut**s a tab-delimited file into columns and returns the first two **f**ields
151 | * `cut -f1,2 -d, <filename>` indicates that the file is **d**elimited by commas
152 | 
153 | ##### `sort`
154 | * `sort <filename>` **sort**s the lines of a file (alphabetically by default)
155 | 
156 | ##### `uniq`
157 | * `uniq <filename>` discards all but one of the successive identical lines (thus it only keeps **uniq**ue lines)
158 | * `uniq -c <filename>` also records the **c**ount of the number of occurrences
159 | * Because lines must be successive to be counted as identical, you will usually use `sort` before `uniq`.
160 | 
--------------------------------------------------------------------------------
/code/03_file_reading.py:
--------------------------------------------------------------------------------
1 | '''
2 | Lesson on file reading using Airline Safety Data
3 | https://github.com/fivethirtyeight/data/tree/master/airline-safety
4 | '''
5 | 
6 | # read the whole file at once, return a single string (including newlines)
7 | # 'rU' mode (read universal) converts different line endings into '\n'
8 | f = open('airlines.csv', mode='rU')
9 | file_string = f.read()
10 | f.close()
11 | 
12 | # use a context manager to automatically close your file
13 | with open('airlines.csv', mode='rU') as f:
14 |     file_string = f.read()
15 | 
16 | # read the file into a list (each list element is one row)
17 | with open('airlines.csv', mode='rU') as f:
18 |     file_list = []
19 |     for row in f:
20 |         file_list.append(row)
21 | 
22 | # do the same thing using a list comprehension
23 | with open('airlines.csv', mode='rU') as f:
24 |     file_list = [row for row in f]
25 | 
26 | # side note: splitting strings
27 | 'hello DAT students'.split()
28 | 'hello DAT students'.split('e')
29 | 
30 | # split each string (at the commas) into a list
31 | with open('airlines.csv', mode='rU') as f:
32 |     file_nested_list = [row.split(',') for row in f]
33 | 
34 | # do the same thing using the csv module
35 | import csv
36 | with open('airlines.csv', mode='rU') as f:
37 |     file_nested_list = [row for row in csv.reader(f)]
38 | 
39 | # separate the header and data
40 | header = file_nested_list[0]
41 | data = file_nested_list[1:]
42 | 
43 | '''
44 | EXERCISES:
45 | 
46 | 1. Create a list containing the average number of incidents per year for each airline.
47 | Example for Aer Lingus: (2 + 0)/30 = 0.07
48 | Expected output: [0.07, 2.73, 0.23, ...]
49 | 
50 | 2. Create a list of airline names (without the star).
51 | Expected output: ['Aer Lingus', 'Aeroflot', 'Aerolineas Argentinas', ...]
52 | 
53 | 3. Create a list (of the same length) that contains 1 if there's a star and 0 if not.
54 | Expected output: [0, 1, 0, ...]
55 | 
56 | 4. BONUS: Create a dictionary in which the key is the airline name (without the star)
57 | and the value is the average number of incidents.
58 | Expected output: {'Aer Lingus': 0.07, 'Aeroflot': 2.73, ...} 59 | ''' 60 | 61 | # Part 1 62 | incidents = [round((int(row[2]) + int(row[5])) / float(30), 2) for row in data] 63 | 64 | # Parts 2 and 3 65 | airlines = [] 66 | starred = [] 67 | for row in data: 68 | if row[0][-1] == '*': 69 | starred.append(1) 70 | airlines.append(row[0][:-1]) 71 | else: 72 | starred.append(0) 73 | airlines.append(row[0]) 74 | 75 | # Part 4 76 | airline_incidents = dict(zip(airlines, incidents)) 77 | 78 | ''' 79 | A few extra things that will help you with the homework 80 | ''' 81 | 82 | # 'set' data structure is useful for gathering unique elements 83 | my_list = [1, 2, 1] 84 | set(my_list) # returns a set of 1, 2 85 | len(set(my_list)) # count of unique elements 86 | 87 | # 'in' statement is useful for lists 88 | 1 in my_list # True 89 | 3 in my_list # False 90 | 91 | # 'in' is useful for strings (checks for substrings) 92 | my_string = 'hello there' 93 | 'the' in my_string # True 94 | 'then' in my_string # False 95 | 96 | # 'in' is useful for dictionaries (checks keys but not values) 97 | my_dict = {'name':'Kevin', 'title':'instructor'} 98 | 'name' in my_dict # True 99 | 'Kevin' in my_dict # False 100 | 101 | # 'count' method for strings counts how many times a character appears 102 | my_string.count('e') # 3 103 | -------------------------------------------------------------------------------- /code/03_python_homework_chipotle.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python Homework with Chipotle data 3 | https://github.com/TheUpshot/chipotle 4 | ''' 5 | 6 | ''' 7 | BASIC LEVEL 8 | PART 1: Read in the file with csv.reader() and store it in an object called 'file_nested_list'. 9 | Hint: This is a TSV file, and csv.reader() needs to be told how to handle it. 10 | https://docs.python.org/2/library/csv.html 11 | ''' 12 | 13 | import csv 14 | 15 | # specify that the delimiter is a tab character 16 | with open('chipotle.tsv', mode='rU') as f: 17 | file_nested_list = [row for row in csv.reader(f, delimiter='\t')] 18 | 19 | 20 | ''' 21 | BASIC LEVEL 22 | PART 2: Separate 'file_nested_list' into the 'header' and the 'data'. 23 | ''' 24 | 25 | header = file_nested_list[0] 26 | data = file_nested_list[1:] 27 | 28 | 29 | ''' 30 | INTERMEDIATE LEVEL 31 | PART 3: Calculate the average price of an order. 32 | Hint: Examine the data to see if the 'quantity' column is relevant to this calculation. 33 | Hint: Think carefully about the simplest way to do this! 34 | ''' 35 | 36 | # count the number of unique order_id's 37 | # note: you could assume this is 1834 since that's the maximum order_id, but it's best to check 38 | num_orders = len(set([row[0] for row in data])) # 1834 39 | 40 | # create a list of prices 41 | # note: ignore the 'quantity' column because the 'item_price' takes quantity into account 42 | prices = [float(row[4][1:-1]) for row in data] # strip the dollar sign and trailing space 43 | 44 | # calculate the average price of an order and round to 2 digits 45 | round(sum(prices) / num_orders, 2) # $18.81 46 | 47 | 48 | ''' 49 | INTERMEDIATE LEVEL 50 | PART 4: Create a list (or set) of all unique sodas and soft drinks that they sell. 51 | Note: Just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze'. 
52 | ''' 53 | 54 | # if 'item_name' includes 'Canned', append 'choice_description' to 'sodas' list 55 | sodas = [] 56 | for row in data: 57 | if 'Canned' in row[2]: 58 | sodas.append(row[3][1:-1]) # strip the brackets 59 | 60 | # equivalent list comprehension (using an 'if' condition) 61 | sodas = [row[3][1:-1] for row in data if 'Canned' in row[2]] 62 | 63 | # create a set of unique sodas 64 | unique_sodas = set(sodas) 65 | 66 | 67 | ''' 68 | ADVANCED LEVEL 69 | PART 5: Calculate the average number of toppings per burrito. 70 | Note: Let's ignore the 'quantity' column to simplify this task. 71 | Hint: Think carefully about the easiest way to count the number of toppings! 72 | ''' 73 | 74 | # keep a running total of burritos and toppings 75 | burrito_count = 0 76 | topping_count = 0 77 | 78 | # calculate number of toppings by counting the commas and adding 1 79 | # note: x += 1 is equivalent to x = x + 1 80 | for row in data: 81 | if 'Burrito' in row[2]: 82 | burrito_count += 1 83 | topping_count += (row[3].count(',') + 1) 84 | 85 | # calculate the average topping count and round to 2 digits 86 | round(topping_count / float(burrito_count), 2) # 5.40 87 | 88 | 89 | ''' 90 | ADVANCED LEVEL 91 | PART 6: Create a dictionary in which the keys represent chip orders and 92 | the values represent the total number of orders. 93 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... } 94 | Note: Please take the 'quantity' column into account! 95 | Optional: Learn how to use 'defaultdict' to simplify your code. 96 | ''' 97 | 98 | # start with an empty dictionary 99 | chips = {} 100 | 101 | # if chip order is not in dictionary, then add a new key/value pair 102 | # if chip order is already in dictionary, then update the value for that key 103 | for row in data: 104 | if 'Chips' in row[2]: 105 | if row[2] not in chips: 106 | chips[row[2]] = int(row[1]) # this is a new key, so create key/value pair 107 | else: 108 | chips[row[2]] += int(row[1]) # this is an existing key, so add to the value 109 | 110 | # defaultdict saves you the trouble of checking whether a key already exists 111 | from collections import defaultdict 112 | dchips = defaultdict(int) 113 | for row in data: 114 | if 'Chips' in row[2]: 115 | dchips[row[2]] += int(row[1]) 116 | 117 | 118 | ''' 119 | BONUS: Think of a question about this data that interests you, and then answer it! 
120 | ''' 121 | -------------------------------------------------------------------------------- /code/05_pandas_homework_imdb.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pandas Homework with IMDb data 3 | ''' 4 | 5 | ''' 6 | BASIC LEVEL 7 | ''' 8 | 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # read in 'imdb_1000.csv' and store it in a DataFrame named movies 13 | movies = pd.read_csv('imdb_1000.csv') 14 | 15 | # check the number of rows and columns 16 | movies.shape 17 | 18 | # check the data type of each column 19 | movies.dtypes 20 | 21 | # calculate the average movie duration 22 | movies.duration.mean() 23 | 24 | # sort the DataFrame by duration to find the shortest and longest movies 25 | movies.sort('duration').head(1) 26 | movies.sort('duration').tail(1) 27 | 28 | # create a histogram of duration, choosing an "appropriate" number of bins 29 | movies.duration.plot(kind='hist', bins=20) 30 | 31 | # use a box plot to display that same data 32 | movies.duration.plot(kind='box') 33 | 34 | ''' 35 | INTERMEDIATE LEVEL 36 | ''' 37 | 38 | # count how many movies have each of the content ratings 39 | movies.content_rating.value_counts() 40 | 41 | # use a visualization to display that same data, including a title and x and y labels 42 | movies.content_rating.value_counts().plot(kind='bar', title='Top 1000 Movies by Content Rating') 43 | plt.xlabel('Content Rating') 44 | plt.ylabel('Number of Movies') 45 | 46 | # convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP 47 | movies.content_rating.replace(['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED', inplace=True) 48 | 49 | # convert the following content ratings to "NC-17": X, TV-MA 50 | movies.content_rating.replace(['X', 'TV-MA'], 'NC-17', inplace=True) 51 | 52 | # count the number of missing values in each column 53 | movies.isnull().sum() 54 | 55 | # if there are missing values: examine them, then fill them in with "reasonable" values 56 | movies[movies.content_rating.isnull()] 57 | movies.content_rating.fillna('UNRATED', inplace=True) 58 | 59 | # calculate the average star rating for movies 2 hours or longer, 60 | # and compare that with the average star rating for movies shorter than 2 hours 61 | movies[movies.duration >= 120].star_rating.mean() 62 | movies[movies.duration < 120].star_rating.mean() 63 | 64 | # use a visualization to detect whether there is a relationship between duration and star rating 65 | movies.plot(kind='scatter', x='duration', y='star_rating', alpha=0.2) 66 | 67 | # calculate the average duration for each genre 68 | movies.groupby('genre').duration.mean() 69 | 70 | ''' 71 | ADVANCED LEVEL 72 | ''' 73 | 74 | # visualize the relationship between content rating and duration 75 | movies.boxplot(column='duration', by='content_rating') 76 | movies.hist(column='duration', by='content_rating', sharex=True) 77 | 78 | # determine the top rated movie (by star rating) for each genre 79 | movies.sort('star_rating', ascending=False).groupby('genre').title.first() 80 | movies.groupby('genre').title.first() # equivalent, since DataFrame is already sorted by star rating 81 | 82 | # check if there are multiple movies with the same title, and if so, determine if they are actually duplicates 83 | dupe_titles = movies[movies.title.duplicated()].title 84 | movies[movies.title.isin(dupe_titles)] 85 | 86 | # calculate the average star rating for each genre, but only include genres with at least 10 movies 87 | 88 | # option 1: 
manually create a list of relevant genres, then filter using that list 89 | movies.genre.value_counts() 90 | top_genres = ['Drama', 'Comedy', 'Action', 'Crime', 'Biography', 'Adventure', 'Animation', 'Horror', 'Mystery'] 91 | movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean() 92 | 93 | # option 2: automatically create a list of relevant genres by saving the value_counts and then filtering 94 | genre_counts = movies.genre.value_counts() 95 | top_genres = genre_counts[genre_counts >= 10].index 96 | movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean() 97 | 98 | # option 3: calculate the average star rating for all genres, then filter using a boolean Series 99 | movies.groupby('genre').star_rating.mean()[movies.genre.value_counts() >= 10] 100 | 101 | # option 4: aggregate by count and mean, then filter using the count 102 | genre_ratings = movies.groupby('genre').star_rating.agg(['count', 'mean']) 103 | genre_ratings[genre_ratings['count'] >= 10] 104 | 105 | ''' 106 | BONUS 107 | ''' 108 | 109 | # Figure out something "interesting" using the actors data! 110 | -------------------------------------------------------------------------------- /code/05_pandas_merge_nb.py: -------------------------------------------------------------------------------- 1 | # # Joining (Merging) DataFrames 2 | 3 | # Using the [MovieLens 100k data](http://grouplens.org/datasets/movielens/), let's create two DataFrames: 4 | # 5 | # - **movies**: shows information about movies, namely a unique **movie_id** and its **title** 6 | # - **ratings**: shows the **rating** that a particular **user_id** gave to a particular **movie_id** at a particular **timestamp** 7 | 8 | # ### Movies 9 | 10 | import pandas as pd 11 | movie_url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item' 12 | movie_cols = ['movie_id', 'title'] 13 | movies = pd.read_table(movie_url, sep='|', header=None, names=movie_cols, usecols=[0, 1]) 14 | movies.head() 15 | 16 | 17 | # ### Ratings 18 | 19 | rating_url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.data' 20 | rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp'] 21 | ratings = pd.read_table(rating_url, sep='\t', header=None, names=rating_cols) 22 | ratings.head() 23 | 24 | 25 | # Let's pretend that you want to examine the ratings DataFrame, but you want to know the **title** of each movie rather than its **movie_id**. The best way to accomplish this objective is by "joining" (or "merging") the DataFrames using the Pandas `merge` function: 26 | 27 | movie_ratings = pd.merge(movies, ratings) 28 | movie_ratings.head() 29 | 30 | 31 | # Here's what just happened: 32 | # 33 | # - Pandas noticed that movies and ratings had one column in common, namely **movie_id**. This is the "key" on which the DataFrames will be joined. 34 | # - The first **movie_id** in movies is 1. Thus, Pandas looked through every row in the ratings DataFrame, searching for a movie_id of 1. Every time it found such a row, it recorded the **user_id**, **rating**, and **timestamp** listed in that row. In this case, it found 452 matching rows. 35 | # - The second **movie_id** in movies is 2. Again, Pandas did a search of ratings and found 131 matching rows. 36 | # - This process was repeated for all of the remaining rows in movies. 
37 | # 38 | # At the end of the process, the movie_ratings DataFrame is created, which contains the two columns from movies (**movie_id** and **title**) and the three other colums from ratings (**user_id**, **rating**, and **timestamp**). 39 | # 40 | # - **movie_id** 1 and its **title** are listed 452 times, next to the **user_id**, **rating**, and **timestamp** for each of the 452 matching ratings. 41 | # - **movie_id** 2 and its **title** are listed 131 times, next to the **user_id**, **rating**, and **timestamp** for each of the 131 matching ratings. 42 | # - And so on, for every movie in the dataset. 43 | 44 | print movies.shape 45 | print ratings.shape 46 | print movie_ratings.shape 47 | 48 | 49 | # Notice the shapes of the three DataFrames: 50 | # 51 | # - There are 1682 rows in the movies DataFrame. 52 | # - There are 100000 rows in the ratings DataFrame. 53 | # - The `merge` function resulted in a movie_ratings DataFrame with 100000 rows, because every row from ratings matched a row from movies. 54 | # - The movie_ratings DataFrame has 5 columns, namely the 2 columns from movies, plus the 4 columns from ratings, minus the 1 column in common. 55 | # 56 | # By default, the `merge` function joins the DataFrames using all column names that are in common (**movie_id**, in this case). The [documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) explains how you can override this behavior. 57 | 58 | # ## Four Types of Joins 59 | 60 | # There are actually four types of joins supported by the Pandas `merge` function. Here's how they are described by the documentation: 61 | # 62 | # - **inner:** use intersection of keys from both frames (SQL: inner join) 63 | # - **outer:** use union of keys from both frames (SQL: full outer join) 64 | # - **left:** use only keys from left frame (SQL: left outer join) 65 | # - **right:** use only keys from right frame (SQL: right outer join) 66 | # 67 | # The default is the "inner join", which was used when creating the movie_ratings DataFrame. 
68 | # 69 | # It's easiest to understand the different types by looking at some simple examples: 70 | 71 | # ### Example DataFrames A and B 72 | 73 | A = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num':[1, 2, 3]}) 74 | A 75 | 76 | 77 | B = pd.DataFrame({'color': ['green', 'yellow', 'pink'], 'size':['S', 'M', 'L']}) 78 | B 79 | 80 | 81 | # ### Inner join 82 | # 83 | # Only include observations found in both A and B: 84 | 85 | pd.merge(A, B, how='inner') 86 | 87 | 88 | # ### Outer join 89 | # 90 | # Include observations found in either A or B: 91 | 92 | pd.merge(A, B, how='outer') 93 | 94 | 95 | # ### Left join 96 | # 97 | # Include all observations found in A: 98 | 99 | pd.merge(A, B, how='left') 100 | 101 | 102 | # ### Right join 103 | # 104 | # Include all observations found in B: 105 | 106 | pd.merge(A, B, how='right') 107 | -------------------------------------------------------------------------------- /code/05_pandas_visualization_nb.py: -------------------------------------------------------------------------------- 1 | # # Visualization with Pandas (and Matplotlib) 2 | 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | 6 | # display plots in the notebook 7 | 8 | # increase default figure and font sizes for easier viewing 9 | plt.rcParams['figure.figsize'] = (8, 6) 10 | plt.rcParams['font.size'] = 14 11 | 12 | 13 | # read in the drinks data 14 | drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent'] 15 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv' 16 | drinks = pd.read_csv(url, header=0, names=drink_cols, na_filter=False) 17 | 18 | 19 | # ## Histogram: show the distribution of a numerical variable 20 | 21 | # sort the beer column and mentally split it into 3 groups 22 | drinks.beer.order().values 23 | 24 | 25 | # compare with histogram 26 | drinks.beer.plot(kind='hist', bins=3) 27 | 28 | 29 | # try more bins 30 | drinks.beer.plot(kind='hist', bins=20) 31 | 32 | 33 | # add title and labels 34 | drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings') 35 | plt.xlabel('Beer Servings') 36 | plt.ylabel('Frequency') 37 | 38 | 39 | # compare with density plot (smooth version of a histogram) 40 | drinks.beer.plot(kind='density', xlim=(0, 500)) 41 | 42 | 43 | # ## Scatter Plot: show the relationship between two numerical variables 44 | 45 | # select the beer and wine columns and sort by beer 46 | drinks[['beer', 'wine']].sort('beer').values 47 | 48 | 49 | # compare with scatter plot 50 | drinks.plot(kind='scatter', x='beer', y='wine') 51 | 52 | 53 | # add transparency 54 | drinks.plot(kind='scatter', x='beer', y='wine', alpha=0.3) 55 | 56 | 57 | # vary point color by spirit servings 58 | drinks.plot(kind='scatter', x='beer', y='wine', c='spirit', colormap='Blues') 59 | 60 | 61 | # scatter matrix of three numerical columns 62 | pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']]) 63 | 64 | 65 | # increase figure size 66 | pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8)) 67 | 68 | 69 | # ## Bar Plot: show a numerical comparison across different categories 70 | 71 | # count the number of countries in each continent 72 | drinks.continent.value_counts() 73 | 74 | 75 | # compare with bar plot 76 | drinks.continent.value_counts().plot(kind='bar') 77 | 78 | 79 | # calculate the mean alcohol amounts for each continent 80 | drinks.groupby('continent').mean() 81 | 82 | 83 | # side-by-side bar plots 84 | drinks.groupby('continent').mean().plot(kind='bar') 85 | 86 | 87 | # drop the 
liters column 88 | drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar') 89 | 90 | 91 | # stacked bar plots 92 | drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar', stacked=True) 93 | 94 | 95 | # ## Box Plot: show quartiles (and outliers) for one or more numerical variables 96 | # 97 | # **Five-number summary:** 98 | # 99 | # - min = minimum value 100 | # - 25% = first quartile (Q1) = median of the lower half of the data 101 | # - 50% = second quartile (Q2) = median of the data 102 | # - 75% = third quartile (Q3) = median of the upper half of the data 103 | # - max = maximum value 104 | # 105 | # (More useful than mean and standard deviation for describing skewed distributions) 106 | # 107 | # **Interquartile Range (IQR)** = Q3 - Q1 108 | # 109 | # **Outliers:** 110 | # 111 | # - below Q1 - 1.5 * IQR 112 | # - above Q3 + 1.5 * IQR 113 | 114 | # sort the spirit column 115 | drinks.spirit.order().values 116 | 117 | 118 | # show "five-number summary" for spirit 119 | drinks.spirit.describe() 120 | 121 | 122 | # compare with box plot 123 | drinks.spirit.plot(kind='box') 124 | 125 | 126 | # include multiple variables 127 | drinks.drop('liters', axis=1).plot(kind='box') 128 | 129 | 130 | # ## Line Plot: show the trend of a numerical variable over time 131 | 132 | # read in the ufo data 133 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/ufo.csv' 134 | ufo = pd.read_csv(url) 135 | ufo['Time'] = pd.to_datetime(ufo.Time) 136 | ufo['Year'] = ufo.Time.dt.year 137 | 138 | 139 | # count the number of ufo reports each year (and sort by year) 140 | ufo.Year.value_counts().sort_index() 141 | 142 | 143 | # compare with line plot 144 | ufo.Year.value_counts().sort_index().plot() 145 | 146 | 147 | # don't use a line plot when there is no logical ordering 148 | drinks.continent.value_counts().plot() 149 | 150 | 151 | # ## Grouped Box Plots: show one box plot for each group 152 | 153 | # reminder: box plot of beer servings 154 | drinks.beer.plot(kind='box') 155 | 156 | 157 | # box plot of beer servings grouped by continent 158 | drinks.boxplot(column='beer', by='continent') 159 | 160 | 161 | # box plot of all numeric columns grouped by continent 162 | drinks.boxplot(by='continent') 163 | 164 | 165 | # ## Grouped Histograms: show one histogram for each group 166 | 167 | # reminder: histogram of beer servings 168 | drinks.beer.plot(kind='hist') 169 | 170 | 171 | # histogram of beer servings grouped by continent 172 | drinks.hist(column='beer', by='continent') 173 | 174 | 175 | # share the x axes 176 | drinks.hist(column='beer', by='continent', sharex=True) 177 | 178 | 179 | # share the x and y axes 180 | drinks.hist(column='beer', by='continent', sharex=True, sharey=True) 181 | 182 | 183 | # change the layout 184 | drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3)) 185 | 186 | 187 | # ## Assorted Functionality 188 | 189 | # saving a plot to a file 190 | drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings') 191 | plt.xlabel('Beer Servings') 192 | plt.ylabel('Frequency') 193 | plt.savefig('beer_histogram.png') 194 | 195 | 196 | # list available plot styles 197 | plt.style.available 198 | 199 | 200 | # change to a different style 201 | plt.style.use('ggplot') 202 | -------------------------------------------------------------------------------- /code/06_human_learning_iris_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise: "Human learning" with iris data 
2 | # 3 | # **Question:** Can you predict the species of an iris using petal and sepal measurements? 4 | # 5 | # 1. Read the iris data into a Pandas DataFrame, including column names. 6 | # 2. Gather some basic information about the data. 7 | # 3. Use sorting, split-apply-combine, and/or visualization to look for differences between species. 8 | # 4. Write down a set of rules that could be used to predict species based on iris measurements. 9 | # 10 | # **BONUS:** Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions. 11 | 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | 15 | # display plots in the notebook 16 | 17 | # increase default figure and font sizes for easier viewing 18 | plt.rcParams['figure.figsize'] = (8, 6) 19 | plt.rcParams['font.size'] = 14 20 | 21 | 22 | # ## Task 1 23 | # 24 | # Read the iris data into a pandas DataFrame, including column names. 25 | 26 | # define a list of column names (as strings) 27 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 28 | 29 | # define the URL from which to retrieve the data (as a string) 30 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 31 | 32 | # retrieve the CSV file and add the column names 33 | iris = pd.read_csv(url, header=None, names=col_names) 34 | 35 | 36 | # ## Task 2 37 | # 38 | # Gather some basic information about the data. 39 | 40 | iris.shape 41 | 42 | 43 | iris.head() 44 | 45 | 46 | iris.dtypes 47 | 48 | 49 | iris.describe() 50 | 51 | 52 | iris.species.value_counts() 53 | 54 | 55 | iris.isnull().sum() 56 | 57 | 58 | # ## Task 3 59 | # 60 | # Use sorting, split-apply-combine, and/or visualization to look for differences between species. 61 | 62 | # ### sorting 63 | 64 | # sort the DataFrame by petal_width and display the NumPy array 65 | print iris.sort('petal_width').values 66 | 67 | 68 | # ### split-apply-combine 69 | 70 | # mean of sepal_length grouped by species 71 | iris.groupby('species').sepal_length.mean() 72 | 73 | 74 | # mean of all numeric columns grouped by species 75 | iris.groupby('species').mean() 76 | 77 | 78 | # description of all numeric columns grouped by species 79 | iris.groupby('species').describe() 80 | 81 | 82 | # ### visualization 83 | 84 | # histogram of petal_width grouped by species 85 | iris.hist(column='petal_width', by='species', sharex=True) 86 | 87 | 88 | # box plot of petal_width grouped by species 89 | iris.boxplot(column='petal_width', by='species') 90 | 91 | 92 | # box plot of all numeric columns grouped by species 93 | iris.boxplot(by='species') 94 | 95 | 96 | # map species to a numeric value so that plots can be colored by species 97 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}) 98 | 99 | # alternative method 100 | iris['species_num'] = iris.species.factorize()[0] 101 | 102 | 103 | # scatter plot of petal_length vs petal_width colored by species 104 | iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap='brg') 105 | 106 | 107 | # scatter matrix of all features colored by species 108 | pd.scatter_matrix(iris.drop('species_num', axis=1), c=iris.species_num, figsize=(12, 10)) 109 | 110 | 111 | # ## Task 4 112 | # 113 | # Write down a set of rules that could be used to predict species based on iris measurements. 
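# Before writing the rules down, a quick sanity check can help: compare a candidate threshold against the actual species labels. The snippet below is a minimal sketch (an addition, not part of the original exercise) using an example threshold on petal_length.

# count how many irises of each species have a petal_length below 2 (an example threshold)
(iris.petal_length < 2).groupby(iris.species).sum()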
114 | 115 | # define a new feature that represents petal area ("feature engineering") 116 | iris['petal_area'] = iris.petal_length * iris.petal_width 117 | 118 | 119 | # description of petal_area grouped by species 120 | iris.groupby('species').petal_area.describe().unstack() 121 | 122 | 123 | # box plot of petal_area grouped by species 124 | iris.boxplot(column='petal_area', by='species') 125 | 126 | 127 | # only show irises with a petal_area between 7 and 9 128 | iris[(iris.petal_area > 7) & (iris.petal_area < 9)].sort('petal_area') 129 | 130 | 131 | # My set of rules for predicting species: 132 | # 133 | # - If petal_area is less than 2, predict **setosa**. 134 | # - Else if petal_area is less than 7.4, predict **versicolor**. 135 | # - Otherwise, predict **virginica**. 136 | 137 | # ## Bonus 138 | # 139 | # Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions. 140 | 141 | # given a row of data, return a predicted species_num (0/1/2) 142 | def classify_iris(row): 143 | 144 | # calculate the petal_area 145 | petal_area = row[2] * row[3] 146 | 147 | # predict the species based on the rules above 148 | if petal_area < 2: 149 | prediction = 'setosa' 150 | elif petal_area < 7.4: 151 | prediction = 'versicolor' 152 | else: 153 | prediction = 'virginica' 154 | 155 | # map the species name to a numeric value 156 | species_to_num = {'setosa':0, 'versicolor':1, 'virginica':2} 157 | 158 | # return that value 159 | return species_to_num[prediction] 160 | 161 | 162 | # print the first row 163 | iris.iloc[0, :] 164 | 165 | 166 | # print the last row 167 | iris.iloc[149, :] 168 | 169 | 170 | # test the function on the first and last rows 171 | print classify_iris(iris.iloc[0, :]) 172 | print classify_iris(iris.iloc[149, :]) 173 | 174 | 175 | # make predictions for all rows and store them in the DataFrame 176 | iris['prediction'] = [classify_iris(row) for index, row in iris.iterrows()] 177 | 178 | 179 | # calculate the percentage of correct predictions 180 | sum(iris.species_num == iris.prediction) / 150. 181 | -------------------------------------------------------------------------------- /code/07_api.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Getting Data from APIs 3 | 4 | What is an API? 5 | - Application Programming Interface 6 | - Structured way to expose specific functionality and data access to users 7 | - Web APIs usually follow the "REST" standard 8 | 9 | How to interact with a REST API: 10 | - Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response" 11 | - Most relevant request method for us is GET (other methods: POST, PUT, DELETE) 12 | - Response is often JSON format 13 | - Web console is sometimes available (allows you to explore an API) 14 | ''' 15 | 16 | # read IMDb data into a DataFrame: we want a year column! 
17 | import pandas as pd 18 | movies = pd.read_csv('imdb_1000.csv') 19 | movies.head() 20 | 21 | # use requests library to interact with a URL 22 | import requests 23 | r = requests.get('http://www.omdbapi.com/?t=the shawshank redemption&r=json&type=movie') 24 | 25 | # check the status: 200 means success, 4xx means error 26 | r.status_code 27 | 28 | # view the raw response text 29 | r.text 30 | 31 | # decode the JSON response body into a dictionary 32 | r.json() 33 | 34 | # extracting the year from the dictionary 35 | r.json()['Year'] 36 | 37 | # what happens if the movie name is not recognized? 38 | r = requests.get('http://www.omdbapi.com/?t=blahblahblah&r=json&type=movie') 39 | r.status_code 40 | r.json() 41 | 42 | # define a function to return the year 43 | def get_movie_year(title): 44 | r = requests.get('http://www.omdbapi.com/?t=' + title + '&r=json&type=movie') 45 | info = r.json() 46 | if info['Response'] == 'True': 47 | return int(info['Year']) 48 | else: 49 | return None 50 | 51 | # test the function 52 | get_movie_year('The Shawshank Redemption') 53 | get_movie_year('blahblahblah') 54 | 55 | # create a smaller DataFrame for testing 56 | top_movies = movies.head().copy() 57 | 58 | # write a for loop to build a list of years 59 | from time import sleep 60 | years = [] 61 | for title in top_movies.title: 62 | years.append(get_movie_year(title)) 63 | sleep(1) 64 | 65 | # check that the DataFrame and the list of years are the same length 66 | assert(len(top_movies) == len(years)) 67 | 68 | # save that list as a new column 69 | top_movies['year'] = years 70 | 71 | ''' 72 | Bonus content: Updating the DataFrame as part of a loop 73 | ''' 74 | 75 | # enumerate allows you to access the item location while iterating 76 | letters = ['a', 'b', 'c'] 77 | for index, letter in enumerate(letters): 78 | print index, letter 79 | 80 | # iterrows method for DataFrames is similar 81 | for index, row in top_movies.iterrows(): 82 | print index, row.title 83 | 84 | # create a new column and set a default value 85 | movies['year'] = -1 86 | 87 | # loc method allows you to access a DataFrame element by 'label' 88 | movies.loc[0, 'year'] = 1994 89 | 90 | # write a for loop to update the year for the first three movies 91 | for index, row in movies.iterrows(): 92 | if index < 3: 93 | movies.loc[index, 'year'] = get_movie_year(row.title) 94 | sleep(1) 95 | else: 96 | break 97 | 98 | ''' 99 | Other considerations when accessing APIs: 100 | - Most APIs require you to have an access key (which you should store outside your code) 101 | - Most APIs limit the number of API calls you can make (per day, hour, minute, etc.) 
102 | - Not all APIs are free 103 | - Not all APIs are well-documented 104 | - Pay attention to the API version 105 | 106 | Python wrapper is another option for accessing an API: 107 | - Set of functions that "wrap" the API code for ease of use 108 | - Potentially simplifies your code 109 | - But, wrapper could have bugs or be out-of-date or poorly documented 110 | ''' 111 | -------------------------------------------------------------------------------- /code/08_bias_variance_nb.py: -------------------------------------------------------------------------------- 1 | # # Exploring the Bias-Variance Tradeoff 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import seaborn as sns 6 | 7 | # allow plots to appear in the notebook 8 | 9 | 10 | # ## Brain and body weight 11 | 12 | # This is a [dataset](http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt) of the average weight of the body and the brain for 62 mammal species. Let's read it into pandas and take a quick look: 13 | 14 | url = 'http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt' 15 | col_names = ['id', 'brain', 'body'] 16 | mammals = pd.read_table(url, sep='\s+', skiprows=33, names=col_names, index_col='id') 17 | mammals.head() 18 | 19 | 20 | mammals.describe() 21 | 22 | 23 | # We're going to focus on a smaller subset in which the body weight is less than 200: 24 | 25 | # only keep rows in which the body weight is less than 200 26 | mammals = mammals[mammals.body < 200] 27 | mammals.shape 28 | 29 | 30 | # We're now going to pretend that there are only 51 mammal species in existence. In other words, we are pretending that this is the entire dataset of brain and body weights for **every known mammal species**. 31 | # 32 | # Let's create a scatterplot (using [Seaborn](http://stanford.edu/~mwaskom/software/seaborn/)) to visualize the relationship between brain and body weight: 33 | 34 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, fit_reg=False) 35 | sns.plt.xlim(-10, 200) 36 | sns.plt.ylim(-10, 250) 37 | 38 | 39 | # There appears to be a relationship between brain and body weight for mammals. 40 | 41 | # ## Making a prediction 42 | 43 | # Now let's pretend that a **new mammal species** is discovered. We measure the body weight of every member of this species that we can find, and calculate an **average body weight of 100**. We want to **predict the average brain weight** of this species (rather than measuring it directly). How might we do this? 44 | 45 | sns.lmplot(x='body', y='brain', data=mammals, ci=None) 46 | sns.plt.xlim(-10, 200) 47 | sns.plt.ylim(-10, 250) 48 | 49 | 50 | # We drew a straight line that appears to best capture the relationship between brain and body weight. So, we might predict that our new species has a brain weight of about 45, since that's the approximate y value when x=100. 51 | # 52 | # This is known as a "linear model" or a "linear regression model", which we will study in a future class. 53 | 54 | # ## Making a prediction from a sample 55 | 56 | # Earlier, I said that this dataset contained every known mammal species. That's very convenient, but **in the real world, all you ever have is a sample of data**. A more realistic situation would be to only have brain and body weights for (let's say) half of the 51 known mammals. 
57 | # 58 | # When that new mammal species (with a body weight of 100) is discovered, we still want to make an accurate prediction for the brain weight, but this task might be more difficult since we don't have all of the data that we would ideally like to have. 59 | # 60 | # Let's simulate this situation by assigning each of the 51 observations to **either universe 1 or universe 2**: 61 | 62 | # set a random seed for reproducibility 63 | np.random.seed(12345) 64 | 65 | # randomly assign every observation to either universe 1 or universe 2 66 | mammals['universe'] = np.random.randint(1, 3, len(mammals)) 67 | mammals.head() 68 | 69 | 70 | # **Important:** We only live in one of the two universes. Both universes have 51 known mammal species, but each universe knows the brain and body weight for different species. 71 | # 72 | # We can now tell Seaborn to create two plots, in which the left plot only uses the data from **universe 1** and the right plot only uses the data from **universe 2**: 73 | 74 | # col='universe' subsets the data by universe and creates two separate plots 75 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe') 76 | sns.plt.xlim(-10, 200) 77 | sns.plt.ylim(-10, 250) 78 | 79 | 80 | # The line looks pretty similar between the two plots, despite the fact that they used separate samples of data. In both cases, we would predict a brain weight of about 45. 81 | # 82 | # It's easier to see the degree of similarity by placing them on the same plot: 83 | 84 | # hue='universe' subsets the data by universe and creates a single plot 85 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, hue='universe') 86 | sns.plt.xlim(-10, 200) 87 | sns.plt.ylim(-10, 250) 88 | 89 | 90 | # What was the point of this exercise? This was a visual demonstration of a high bias, low variance model: 91 | # 92 | # - It's **high bias** because it doesn't fit the data particularly well. 93 | # - It's **low variance** because it doesn't change much depending on which observations happen to be available in that universe. 94 | 95 | # ## Let's try something completely different 96 | 97 | # What would a **low bias, high variance** model look like? Let's try polynomial regression, with an eighth order polynomial: 98 | 99 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe', order=8) 100 | sns.plt.xlim(-10, 200) 101 | sns.plt.ylim(-10, 250) 102 | 103 | 104 | # - It's **low bias** because the models match the data quite well! 105 | # - It's **high variance** because the models are widely different depending on which observations happen to be available in that universe. (For a body weight of 100, the brain weight prediction would be 40 in one universe and 0 in the other universe!) 106 | 107 | # ## Can we find a middle ground? 108 | 109 | # Perhaps we can create a model that has **less bias than the linear model**, and **less variance than the eighth order polynomial**? 110 | # 111 | # Let's try a second order polynomial instead: 112 | 113 | sns.lmplot(x='body', y='brain', data=mammals, ci=None, col='universe', order=2) 114 | sns.plt.xlim(-10, 200) 115 | sns.plt.ylim(-10, 250) 116 | 117 | 118 | # This seems better. In both the left and right plots, **it fits the data pretty well, but not too well**. 119 | # 120 | # This is the essence of the **bias-variance tradeoff**: You are seeking a model that appropriately balances bias and variance, and thus will generalize to new data (known as "out-of-sample" data). 
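# [Added sketch, not part of the original notebook] A quick numeric companion to the plots above:
# fit polynomials of order 1, 2, and 8 to each universe separately with np.polyfit, and compare
# the predicted brain weight for a body weight of 100. The exact numbers depend on the random
# seed used earlier, so treat this as illustrative code rather than expected output.
for order in [1, 2, 8]:
    predictions = []
    for universe in [1, 2]:
        subset = mammals[mammals.universe == universe]
        coefs = np.polyfit(subset.body, subset.brain, deg=order)
        predictions.append(round(np.polyval(coefs, 100), 1))
    print 'order', order, 'predictions by universe:', predictions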
121 | -------------------------------------------------------------------------------- /code/08_nba_knn_nb.py: -------------------------------------------------------------------------------- 1 | # # KNN exercise with NBA player data 2 | 3 | # ## Introduction 4 | # 5 | # - NBA player statistics from 2014-2015 (partial season): [data](https://github.com/justmarkham/DAT4-students/blob/master/kerry/Final/NBA_players_2015.csv), [data dictionary](https://github.com/justmarkham/DAT-project-examples/blob/master/pdf/nba_paper.pdf) 6 | # - **Goal:** Predict player position using assists, steals, blocks, turnovers, and personal fouls 7 | 8 | # ## Step 1: Read the data into Pandas 9 | 10 | # read the data into a DataFrame 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT4-students/master/kerry/Final/NBA_players_2015.csv' 13 | nba = pd.read_csv(url, index_col=0) 14 | 15 | 16 | # examine the columns 17 | nba.columns 18 | 19 | 20 | # examine the positions 21 | nba.pos.value_counts() 22 | 23 | 24 | # ## Step 2: Create X and y 25 | # 26 | # Use the following features: assists, steals, blocks, turnovers, personal fouls 27 | 28 | # map positions to numbers 29 | nba['pos_num'] = nba.pos.map({'C':0, 'F':1, 'G':2}) 30 | 31 | 32 | # create feature matrix (X) 33 | feature_cols = ['ast', 'stl', 'blk', 'tov', 'pf'] 34 | X = nba[feature_cols] 35 | 36 | 37 | # alternative way to create X 38 | X = nba.loc[:, 'ast':'pf'] 39 | 40 | 41 | # create response vector (y) 42 | y = nba.pos_num 43 | 44 | 45 | # ## Step 3: Train a KNN model (K=5) 46 | 47 | # import class 48 | from sklearn.neighbors import KNeighborsClassifier 49 | 50 | 51 | # instantiate with K=5 52 | knn = KNeighborsClassifier(n_neighbors=5) 53 | 54 | 55 | # fit with data 56 | knn.fit(X, y) 57 | 58 | 59 | # ## Step 4: Predict player position and calculate predicted probability of each position 60 | # 61 | # Predict for a player with these statistics: 1 assist, 1 steal, 0 blocks, 1 turnover, 2 personal fouls 62 | 63 | # create a list to represent a player 64 | player = [1, 1, 0, 1, 2] 65 | 66 | 67 | # make a prediction 68 | knn.predict(player) 69 | 70 | 71 | # calculate predicted probabilities 72 | knn.predict_proba(player) 73 | 74 | 75 | # ## Step 5: Repeat steps 3 and 4 using K=50 76 | 77 | # repeat for K=50 78 | knn = KNeighborsClassifier(n_neighbors=50) 79 | knn.fit(X, y) 80 | knn.predict(player) 81 | 82 | 83 | # calculate predicted probabilities 84 | knn.predict_proba(player) 85 | 86 | 87 | # ## Bonus: Explore the features to decide which ones are predictive 88 | 89 | # allow plots to appear in the notebook 90 | import matplotlib.pyplot as plt 91 | 92 | # increase default figure and font sizes for easier viewing 93 | plt.rcParams['figure.figsize'] = (6, 4) 94 | plt.rcParams['font.size'] = 14 95 | 96 | 97 | # description of assists grouped by position 98 | nba.groupby('pos').ast.describe().unstack() 99 | 100 | 101 | # box plot of assists grouped by position 102 | nba.boxplot(column='ast', by='pos') 103 | 104 | 105 | # histogram of assists grouped by position 106 | nba.hist(column='ast', by='pos', sharex=True) 107 | -------------------------------------------------------------------------------- /code/08_pandas_review_nb.py: -------------------------------------------------------------------------------- 1 | # # Pandas Review 2 | 3 | import pandas as pd 4 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv' 5 | df = pd.read_csv(url).head(5).copy() 6 | df 7 | 8 | 9 | # For each of the 
following lines of code: 10 | # 11 | # - What the **data type** of the object that is returned? 12 | # - What is the **shape** of the object that is returned? 13 | # 14 | # 15 | # 1. `df` 16 | # 2. `df.continent` 17 | # 3. `df['continent']` 18 | # 4. `df[['country', 'continent']]` 19 | # 5. `df[[False, True, False, True, False]]` 20 | 21 | # ## Question 1 22 | 23 | df 24 | 25 | 26 | print type(df) 27 | print df.shape 28 | 29 | 30 | # ## Question 2 31 | 32 | df.continent 33 | 34 | 35 | print type(df.continent) 36 | print df.continent.shape 37 | 38 | 39 | # ## Question 3 40 | 41 | df['continent'] 42 | 43 | 44 | print type(df['continent']) 45 | print df['continent'].shape 46 | 47 | 48 | # ## Question 4 49 | 50 | df[['country', 'continent']] 51 | 52 | 53 | print type(df[['country', 'continent']]) 54 | print df[['country', 'continent']].shape 55 | 56 | 57 | # equivalent 58 | cols = ['country', 'continent'] 59 | df[cols] 60 | 61 | 62 | # ## Question 5 63 | 64 | df[[False, True, False, True, False]] 65 | 66 | 67 | print type(df[[False, True, False, True, False]]) 68 | print df[[False, True, False, True, False]].shape 69 | 70 | 71 | # equivalent 72 | df[df.continent=='EU'] 73 | -------------------------------------------------------------------------------- /code/10_yelp_votes_homework_nb.py: -------------------------------------------------------------------------------- 1 | # # Linear regression homework with Yelp votes 2 | 3 | # ## Introduction 4 | # 5 | # This assignment uses a small subset of the data from Kaggle's [Yelp Business Rating Prediction](https://www.kaggle.com/c/yelp-recsys-2013) competition. 6 | # 7 | # **Description of the data:** 8 | # 9 | # - `yelp.json` is the original format of the file. `yelp.csv` contains the same data, in a more convenient format. Both of the files are in this repo, so there is no need to download the data from the Kaggle website. 10 | # - Each observation in this dataset is a review of a particular business by a particular user. 11 | # - The "stars" column is the number of stars (1 through 5) assigned by the reviewer to the business. (Higher stars is better.) In other words, it is the rating of the business by the person who wrote the review. 12 | # - The "cool" column is the number of "cool" votes this review received from other Yelp users. All reviews start with 0 "cool" votes, and there is no limit to how many "cool" votes a review can receive. In other words, it is a rating of the review itself, not a rating of the business. 13 | # - The "useful" and "funny" columns are similar to the "cool" column. 14 | 15 | # ## Task 1 16 | # 17 | # Read `yelp.csv` into a DataFrame. 18 | 19 | # access yelp.csv using a relative path 20 | import pandas as pd 21 | yelp = pd.read_csv('../data/yelp.csv') 22 | yelp.head(1) 23 | 24 | 25 | # ## Task 1 (Bonus) 26 | # 27 | # Ignore the `yelp.csv` file, and construct this DataFrame yourself from `yelp.json`. This involves reading the data into Python, decoding the JSON, converting it to a DataFrame, and adding individual columns for each of the vote types. 
28 | 29 | # read the data from yelp.json into a list of rows 30 | # each row is decoded into a dictionary using using json.loads() 31 | import json 32 | with open('../data/yelp.json', 'rU') as f: 33 | data = [json.loads(row) for row in f] 34 | 35 | 36 | # show the first review 37 | data[0] 38 | 39 | 40 | # convert the list of dictionaries to a DataFrame 41 | yelp = pd.DataFrame(data) 42 | yelp.head(1) 43 | 44 | 45 | # add DataFrame columns for cool, useful, and funny 46 | yelp['cool'] = [row['votes']['cool'] for row in data] 47 | yelp['useful'] = [row['votes']['useful'] for row in data] 48 | yelp['funny'] = [row['votes']['funny'] for row in data] 49 | 50 | 51 | # drop the votes column 52 | yelp.drop('votes', axis=1, inplace=True) 53 | yelp.head(1) 54 | 55 | 56 | # ## Task 2 57 | # 58 | # Explore the relationship between each of the vote types (cool/useful/funny) and the number of stars. 59 | 60 | # treat stars as a categorical variable and look for differences between groups 61 | yelp.groupby('stars').mean() 62 | 63 | 64 | # correlation matrix 65 | import seaborn as sns 66 | sns.heatmap(yelp.corr()) 67 | 68 | 69 | # multiple scatter plots 70 | sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'], y_vars='stars', size=6, aspect=0.7, kind='reg') 71 | 72 | 73 | # ## Task 3 74 | # 75 | # Define cool/useful/funny as the features, and stars as the response. 76 | 77 | feature_cols = ['cool', 'useful', 'funny'] 78 | X = yelp[feature_cols] 79 | y = yelp.stars 80 | 81 | 82 | # ## Task 4 83 | # 84 | # Fit a linear regression model and interpret the coefficients. Do the coefficients make intuitive sense to you? Explore the Yelp website to see if you detect similar trends. 85 | 86 | from sklearn.linear_model import LinearRegression 87 | linreg = LinearRegression() 88 | linreg.fit(X, y) 89 | zip(feature_cols, linreg.coef_) 90 | 91 | 92 | # ## Task 5 93 | # 94 | # Evaluate the model by splitting it into training and testing sets and computing the RMSE. Does the RMSE make intuitive sense to you? 95 | 96 | from sklearn.cross_validation import train_test_split 97 | from sklearn import metrics 98 | import numpy as np 99 | 100 | 101 | # define a function that accepts a list of features and returns testing RMSE 102 | def train_test_rmse(feature_cols): 103 | X = yelp[feature_cols] 104 | y = yelp.stars 105 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 106 | linreg = LinearRegression() 107 | linreg.fit(X_train, y_train) 108 | y_pred = linreg.predict(X_test) 109 | return np.sqrt(metrics.mean_squared_error(y_test, y_pred)) 110 | 111 | 112 | # calculate RMSE with all three features 113 | train_test_rmse(['cool', 'useful', 'funny']) 114 | 115 | 116 | # ## Task 6 117 | # 118 | # Try removing some of the features and see if the RMSE improves. 119 | 120 | print train_test_rmse(['cool', 'useful']) 121 | print train_test_rmse(['cool', 'funny']) 122 | print train_test_rmse(['useful', 'funny']) 123 | 124 | 125 | # ## Task 7 (Bonus) 126 | # 127 | # Think of some new features you could create from the existing data that might be predictive of the response. Figure out how to create those features in Pandas, add them to your model, and see if the RMSE improves. 
128 | 129 | # new feature: review length (number of characters) 130 | yelp['length'] = yelp.text.apply(len) 131 | 132 | 133 | # new features: whether or not the review contains 'love' or 'hate' 134 | yelp['love'] = yelp.text.str.contains('love', case=False).astype(int) 135 | yelp['hate'] = yelp.text.str.contains('hate', case=False).astype(int) 136 | 137 | 138 | # add new features to the model and calculate RMSE 139 | train_test_rmse(['cool', 'useful', 'funny', 'length', 'love', 'hate']) 140 | 141 | 142 | # ## Task 8 (Bonus) 143 | # 144 | # Compare your best RMSE on the testing set with the RMSE for the "null model", which is the model that ignores all features and simply predicts the mean response value in the testing set. 145 | 146 | # split the data (outside of the function) 147 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 148 | 149 | 150 | # create a NumPy array with the same shape as y_test 151 | y_null = np.zeros_like(y_test, dtype=float) 152 | 153 | 154 | # fill the array with the mean of y_test 155 | y_null.fill(y_test.mean()) 156 | 157 | 158 | # calculate null RMSE 159 | print np.sqrt(metrics.mean_squared_error(y_test, y_null)) 160 | 161 | 162 | # ## Task 9 (Bonus) 163 | # 164 | # Instead of treating this as a regression problem, treat it as a classification problem and see what testing accuracy you can achieve with KNN. 165 | 166 | # import and instantiate KNN 167 | from sklearn.neighbors import KNeighborsClassifier 168 | knn = KNeighborsClassifier(n_neighbors=50) 169 | 170 | 171 | # classification models will automatically treat the response value (1/2/3/4/5) as unordered categories 172 | knn.fit(X_train, y_train) 173 | y_pred_class = knn.predict(X_test) 174 | print metrics.accuracy_score(y_test, y_pred_class) 175 | 176 | 177 | # ## Task 10 (Bonus) 178 | # 179 | # Figure out how to use linear regression for classification, and compare its classification accuracy with KNN's accuracy. 180 | 181 | # use linear regression to make continuous predictions 182 | linreg = LinearRegression() 183 | linreg.fit(X_train, y_train) 184 | y_pred = linreg.predict(X_test) 185 | 186 | 187 | # round its predictions to the nearest integer 188 | y_pred_class = y_pred.round() 189 | 190 | 191 | # calculate classification accuracy of the rounded predictions 192 | print metrics.accuracy_score(y_test, y_pred_class) 193 | -------------------------------------------------------------------------------- /code/12_e_log_examples_nb.py: -------------------------------------------------------------------------------- 1 | # # Exponential functions and logarithms 2 | 3 | import math 4 | import numpy as np 5 | 6 | 7 | # ## Exponential functions 8 | 9 | # What is **e**? It is simply a number (known as Euler's number): 10 | 11 | math.e 12 | 13 | 14 | # **e** is a significant number, because it is the base rate of growth shared by all continually growing processes. 15 | # 16 | # For example, if I have **10 dollars**, and it grows 100% in 1 year (compounding continuously), I end up with **10\*e^1 dollars**: 17 | 18 | # 100% growth for 1 year 19 | 10 * np.exp(1) 20 | 21 | 22 | # 100% growth for 2 years 23 | 10 * np.exp(2) 24 | 25 | 26 | # Side note: When e is raised to a power, it is known as **the exponential function**. Technically, any number can be the base, and it would still be known as **an exponential function** (such as 2^5). But in our context, the base of the exponential function is assumed to be e. 27 | # 28 | # Anyway, what if I only have 20% growth instead of 100% growth? 
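# [Added aside, not part of the original notebook] Where does np.exp come from? e**r is the
# limit of compounding a growth rate r more and more frequently: (1 + r/n)**n approaches e**r
# as n gets large. A quick numeric check for r = 0.20 (compare with the cell below):
10 * (1 + 0.20 / 100000) ** 100000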
29 | 30 | # 20% growth for 1 year 31 | 10 * np.exp(0.20) 32 | 33 | 34 | # 20% growth for 2 years 35 | 10 * np.exp(0.20 * 2) 36 | 37 | 38 | # ## Logarithms 39 | 40 | # What is the **(natural) logarithm**? It gives you the time needed to reach a certain level of growth. For example, if I want growth by a factor of 2.718, it will take me 1 unit of time (assuming a 100% growth rate): 41 | 42 | # time needed to grow 1 unit to 2.718 units 43 | np.log(2.718) 44 | 45 | 46 | # If I want growth by a factor of 7.389, it will take me 2 units of time: 47 | 48 | # time needed to grow 1 unit to 7.389 units 49 | np.log(7.389) 50 | 51 | 52 | # If I want growth by a factor of 1, it will take me 0 units of time: 53 | 54 | # time needed to grow 1 unit to 1 unit 55 | np.log(1) 56 | 57 | 58 | # If I want growth by a factor of 0.5, it will take me -0.693 units of time (which is like looking back in time): 59 | 60 | # time needed to grow 1 unit to 0.5 units 61 | np.log(0.5) 62 | 63 | 64 | # ## Connecting the concepts 65 | 66 | # As you can see, the exponential function and the natural logarithm are **inverses** of one another: 67 | 68 | np.log(np.exp(5)) 69 | 70 | 71 | np.exp(np.log(5)) 72 | -------------------------------------------------------------------------------- /code/12_titanic_confusion_nb.py: -------------------------------------------------------------------------------- 1 | # # Logistic regression exercise with Titanic data 2 | 3 | # ## Introduction 4 | # 5 | # - Data from Kaggle's Titanic competition: [data](https://github.com/justmarkham/DAT8/blob/master/data/titanic.csv), [data dictionary](https://www.kaggle.com/c/titanic/data) 6 | # - **Goal**: Predict survival based on passenger characteristics 7 | # - `titanic.csv` is already in our repo, so there is no need to download the data from the Kaggle website 8 | 9 | # ## Step 1: Read the data into Pandas 10 | 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv' 13 | titanic = pd.read_csv(url, index_col='PassengerId') 14 | titanic.head() 15 | 16 | 17 | # ## Step 2: Create X and y 18 | # 19 | # Define **Pclass** and **Parch** as the features, and **Survived** as the response. 20 | 21 | feature_cols = ['Pclass', 'Parch'] 22 | X = titanic[feature_cols] 23 | y = titanic.Survived 24 | 25 | 26 | # ## Step 3: Split the data into training and testing sets 27 | 28 | from sklearn.cross_validation import train_test_split 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 30 | 31 | 32 | # ## Step 4: Fit a logistic regression model and examine the coefficients 33 | # 34 | # Confirm that the coefficients make intuitive sense. 
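# [Added note, not part of the original exercise] One way to check whether a coefficient "makes
# intuitive sense": each coefficient is the change in the log-odds of survival for a one-unit
# increase in that feature (holding the other feature constant), so exponentiating it with
# np.exp() gives the multiplicative change in the odds. For example, a hypothetical coefficient
# of -0.7 would mean that each additional unit multiplies the odds of survival by about 0.50.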
35 | 36 | from sklearn.linear_model import LogisticRegression 37 | logreg = LogisticRegression(C=1e9) 38 | logreg.fit(X_train, y_train) 39 | zip(feature_cols, logreg.coef_[0]) 40 | 41 | 42 | # ## Step 5: Make predictions on the testing set and calculate the accuracy 43 | 44 | # class predictions (not predicted probabilities) 45 | y_pred_class = logreg.predict(X_test) 46 | 47 | 48 | # calculate classification accuracy 49 | from sklearn import metrics 50 | print metrics.accuracy_score(y_test, y_pred_class) 51 | 52 | 53 | # ## Step 6: Compare your testing accuracy to the null accuracy 54 | 55 | # this works regardless of the number of classes 56 | y_test.value_counts().head(1) / len(y_test) 57 | 58 | 59 | # this only works for binary classification problems coded as 0/1 60 | max(y_test.mean(), 1 - y_test.mean()) 61 | 62 | 63 | # # Confusion matrix of Titanic predictions 64 | 65 | # print confusion matrix 66 | print metrics.confusion_matrix(y_test, y_pred_class) 67 | 68 | 69 | # save confusion matrix and slice into four pieces 70 | confusion = metrics.confusion_matrix(y_test, y_pred_class) 71 | TP = confusion[1][1] 72 | TN = confusion[0][0] 73 | FP = confusion[0][1] 74 | FN = confusion[1][0] 75 | 76 | 77 | print 'True Positives:', TP 78 | print 'True Negatives:', TN 79 | print 'False Positives:', FP 80 | print 'False Negatives:', FN 81 | 82 | 83 | # calculate the sensitivity 84 | print TP / float(TP + FN) 85 | print 44 / float(44 + 51) 86 | 87 | 88 | # calculate the specificity 89 | print TN / float(TN + FP) 90 | print 105 / float(105 + 23) 91 | 92 | 93 | # store the predicted probabilities 94 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 95 | 96 | 97 | # histogram of predicted probabilities 98 | import matplotlib.pyplot as plt 99 | plt.hist(y_pred_prob) 100 | plt.xlim(0, 1) 101 | plt.xlabel('Predicted probability of survival') 102 | plt.ylabel('Frequency') 103 | 104 | 105 | # increase sensitivity by lowering the threshold for predicting survival 106 | import numpy as np 107 | y_pred_class = np.where(y_pred_prob > 0.3, 1, 0) 108 | 109 | 110 | # old confusion matrix 111 | print confusion 112 | 113 | 114 | # new confusion matrix 115 | print metrics.confusion_matrix(y_test, y_pred_class) 116 | 117 | 118 | # new sensitivity (higher than before) 119 | print 63 / float(63 + 32) 120 | 121 | 122 | # new specificity (lower than before) 123 | print 72 / float(72 + 56) 124 | -------------------------------------------------------------------------------- /code/13_advanced_model_evaluation_nb.py: -------------------------------------------------------------------------------- 1 | # # Data Preparation and Advanced Model Evaluation 2 | 3 | # ## Agenda 4 | # 5 | # **Data preparation** 6 | # 7 | # - Handling missing values 8 | # - Handling categorical features (review) 9 | # 10 | # **Advanced model evaluation** 11 | # 12 | # - ROC curves and AUC 13 | # - Bonus: ROC curve is only sensitive to rank order of predicted probabilities 14 | # - Cross-validation 15 | 16 | # ## Part 1: Handling missing values 17 | 18 | # scikit-learn models expect that all values are **numeric** and **hold meaning**. Thus, missing values are not allowed by scikit-learn. 
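# [Added aside, not part of the original lesson] Besides the pandas strategies shown below
# (dropna and fillna), scikit-learn itself provides an Imputer transformer that fills in missing
# values and can be used inside a modeling pipeline. A minimal sketch with made-up data:
import numpy as np
from sklearn.preprocessing import Imputer
imp = Imputer(strategy='median')
imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]])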
19 | 20 | # read the Titanic data 21 | import pandas as pd 22 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv' 23 | titanic = pd.read_csv(url, index_col='PassengerId') 24 | titanic.shape 25 | 26 | 27 | # check for missing values 28 | titanic.isnull().sum() 29 | 30 | 31 | # One possible strategy is to **drop missing values**: 32 | 33 | # drop rows with any missing values 34 | titanic.dropna().shape 35 | 36 | 37 | # drop rows where Age is missing 38 | titanic[titanic.Age.notnull()].shape 39 | 40 | 41 | # Sometimes a better strategy is to **impute missing values**: 42 | 43 | # mean Age 44 | titanic.Age.mean() 45 | 46 | 47 | # median Age 48 | titanic.Age.median() 49 | 50 | 51 | # most frequent Age 52 | titanic.Age.mode() 53 | 54 | 55 | # fill missing values for Age with the median age 56 | titanic.Age.fillna(titanic.Age.median(), inplace=True) 57 | 58 | 59 | # Another strategy would be to build a **KNN model** just to impute missing values. How would we do that? 60 | # 61 | # If values are missing from a categorical feature, we could treat the missing values as **another category**. Why might that make sense? 62 | # 63 | # How do we **choose** between all of these strategies? 64 | 65 | # ## Part 2: Handling categorical features (Review) 66 | 67 | # How do we include a categorical feature in our model? 68 | # 69 | # - **Ordered categories:** transform them to sensible numeric values (example: small=1, medium=2, large=3) 70 | # - **Unordered categories:** use dummy encoding (0/1) 71 | 72 | titanic.head(10) 73 | 74 | 75 | # encode Sex_Female feature 76 | titanic['Sex_Female'] = titanic.Sex.map({'male':0, 'female':1}) 77 | 78 | 79 | # create a DataFrame of dummy variables for Embarked 80 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked') 81 | embarked_dummies.drop(embarked_dummies.columns[0], axis=1, inplace=True) 82 | 83 | # concatenate the original DataFrame and the dummy DataFrame 84 | titanic = pd.concat([titanic, embarked_dummies], axis=1) 85 | 86 | 87 | titanic.head(1) 88 | 89 | 90 | # - How do we **interpret** the encoding for Embarked? 91 | # - Why didn't we just encode Embarked using a **single feature** (C=0, Q=1, S=2)? 92 | # - Does it matter which category we choose to define as the **baseline**? 93 | # - Why do we only need **two dummy variables** for Embarked? 
94 | 95 | # define X and y 96 | feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S'] 97 | X = titanic[feature_cols] 98 | y = titanic.Survived 99 | 100 | # train/test split 101 | from sklearn.cross_validation import train_test_split 102 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 103 | 104 | # train a logistic regression model 105 | from sklearn.linear_model import LogisticRegression 106 | logreg = LogisticRegression(C=1e9) 107 | logreg.fit(X_train, y_train) 108 | 109 | # make predictions for testing set 110 | y_pred_class = logreg.predict(X_test) 111 | 112 | # calculate testing accuracy 113 | from sklearn import metrics 114 | print metrics.accuracy_score(y_test, y_pred_class) 115 | 116 | 117 | # ## Part 3: ROC curves and AUC 118 | 119 | # predict probability of survival 120 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 121 | 122 | 123 | import matplotlib.pyplot as plt 124 | plt.rcParams['figure.figsize'] = (8, 6) 125 | plt.rcParams['font.size'] = 14 126 | 127 | 128 | # plot ROC curve 129 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob) 130 | plt.plot(fpr, tpr) 131 | plt.xlim([0.0, 1.0]) 132 | plt.ylim([0.0, 1.0]) 133 | plt.xlabel('False Positive Rate (1 - Specificity)') 134 | plt.ylabel('True Positive Rate (Sensitivity)') 135 | 136 | 137 | # calculate AUC 138 | print metrics.roc_auc_score(y_test, y_pred_prob) 139 | 140 | 141 | # Besides allowing you to calculate AUC, seeing the ROC curve can help you to choose a threshold that **balances sensitivity and specificity** in a way that makes sense for the particular context. 142 | 143 | # histogram of predicted probabilities grouped by actual response value 144 | df = pd.DataFrame({'probability':y_pred_prob, 'actual':y_test}) 145 | df.hist(column='probability', by='actual', sharex=True, sharey=True) 146 | 147 | 148 | # What would have happened if you had used **y_pred_class** instead of **y_pred_prob** when drawing the ROC curve or calculating AUC? 149 | 150 | # ROC curve using y_pred_class - WRONG! 151 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_class) 152 | plt.plot(fpr, tpr) 153 | 154 | 155 | # AUC using y_pred_class - WRONG! 156 | print metrics.roc_auc_score(y_test, y_pred_class) 157 | 158 | 159 | # If you use **y_pred_class**, it will interpret the zeros and ones as predicted probabilities of 0% and 100%. 
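# [Added check, not part of the original lesson] With class predictions, the ROC "curve" has
# only a single operating point, so its AUC collapses to the average of sensitivity and
# specificity. A quick verification (using y_test and y_pred_class from above):
sensitivity = metrics.recall_score(y_test, y_pred_class)
specificity = metrics.recall_score(1 - y_test, 1 - y_pred_class)
print (sensitivity + specificity) / 2
print metrics.roc_auc_score(y_test, y_pred_class)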
160 | 161 | # ## Bonus: ROC curve is only sensitive to rank order of predicted probabilities 162 | 163 | # print the first 10 predicted probabilities 164 | y_pred_prob[:10] 165 | 166 | 167 | # take the square root of predicted probabilities (to make them all bigger) 168 | import numpy as np 169 | y_pred_prob_new = np.sqrt(y_pred_prob) 170 | 171 | # print the modified predicted probabilities 172 | y_pred_prob_new[:10] 173 | 174 | 175 | # histogram of predicted probabilities has changed 176 | df = pd.DataFrame({'probability':y_pred_prob_new, 'actual':y_test}) 177 | df.hist(column='probability', by='actual', sharex=True, sharey=True) 178 | 179 | 180 | # ROC curve did not change 181 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob_new) 182 | plt.plot(fpr, tpr) 183 | 184 | 185 | # AUC did not change 186 | print metrics.roc_auc_score(y_test, y_pred_prob_new) 187 | 188 | 189 | # ## Part 4: Cross-validation 190 | 191 | # calculate cross-validated AUC 192 | from sklearn.cross_validation import cross_val_score 193 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 194 | 195 | 196 | # add Fare to the model 197 | feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S', 'Fare'] 198 | X = titanic[feature_cols] 199 | 200 | # recalculate AUC 201 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 202 | -------------------------------------------------------------------------------- /code/13_bank_exercise_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise with bank marketing data 2 | 3 | # ## Introduction 4 | # 5 | # - Data from the UCI Machine Learning Repository: [data](https://github.com/justmarkham/DAT8/blob/master/data/bank-additional.csv), [data dictionary](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) 6 | # - **Goal:** Predict whether a customer will purchase a bank product marketed over the phone 7 | # - `bank-additional.csv` is already in our repo, so there is no need to download the data from the UCI website 8 | 9 | # ## Step 1: Read the data into Pandas 10 | 11 | import pandas as pd 12 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bank-additional.csv' 13 | bank = pd.read_csv(url, sep=';') 14 | bank.head() 15 | 16 | 17 | # ## Step 2: Prepare at least three features 18 | # 19 | # - Include both numeric and categorical features 20 | # - Choose features that you think might be related to the response (based on intuition or exploration) 21 | # - Think about how to handle missing values (encoded as "unknown") 22 | 23 | # list all columns (for reference) 24 | bank.columns 25 | 26 | 27 | # ### y (response) 28 | 29 | # convert the response to numeric values and store as a new column 30 | bank['outcome'] = bank.y.map({'no':0, 'yes':1}) 31 | 32 | 33 | # ### age 34 | 35 | # probably not a great feature 36 | bank.boxplot(column='age', by='outcome') 37 | 38 | 39 | # ### job 40 | 41 | # looks like a useful feature 42 | bank.groupby('job').outcome.mean() 43 | 44 | 45 | # create job_dummies (we will add it to the bank DataFrame later) 46 | job_dummies = pd.get_dummies(bank.job, prefix='job') 47 | job_dummies.drop(job_dummies.columns[0], axis=1, inplace=True) 48 | 49 | 50 | # ### default 51 | 52 | # looks like a useful feature 53 | bank.groupby('default').outcome.mean() 54 | 55 | 56 | # but only one person in the dataset has a status of yes 57 | bank.default.value_counts() 58 | 59 | 60 | # so, let's treat this as a 2-class feature rather than a 3-class 
feature 61 | bank['default'] = bank.default.map({'no':0, 'unknown':1, 'yes':1}) 62 | 63 | 64 | # ### contact 65 | 66 | # looks like a useful feature 67 | bank.groupby('contact').outcome.mean() 68 | 69 | 70 | # convert the feature to numeric values 71 | bank['contact'] = bank.contact.map({'cellular':0, 'telephone':1}) 72 | 73 | 74 | # ### month 75 | 76 | # looks like a useful feature at first glance 77 | bank.groupby('month').outcome.mean() 78 | 79 | 80 | # but, it looks like their success rate is actually just correlated with number of calls 81 | # thus, the month feature is unlikely to generalize 82 | bank.groupby('month').outcome.agg(['count', 'mean']).sort('count') 83 | 84 | 85 | # ### duration 86 | 87 | # looks like an excellent feature, but you can't know the duration of a call beforehand, thus it can't be used in your model 88 | bank.boxplot(column='duration', by='outcome') 89 | 90 | 91 | # ### previous 92 | 93 | # looks like a useful feature 94 | bank.groupby('previous').outcome.mean() 95 | 96 | 97 | # ### poutcome 98 | 99 | # looks like a useful feature 100 | bank.groupby('poutcome').outcome.mean() 101 | 102 | 103 | # create poutcome_dummies 104 | poutcome_dummies = pd.get_dummies(bank.poutcome, prefix='poutcome') 105 | poutcome_dummies.drop(poutcome_dummies.columns[0], axis=1, inplace=True) 106 | 107 | 108 | # concatenate bank DataFrame with job_dummies and poutcome_dummies 109 | bank = pd.concat([bank, job_dummies, poutcome_dummies], axis=1) 110 | 111 | 112 | # ### euribor3m 113 | 114 | # looks like an excellent feature 115 | bank.boxplot(column='euribor3m', by='outcome') 116 | 117 | 118 | # ## Step 3: Model building 119 | # 120 | # - Use cross-validation to evaluate the AUC of a logistic regression model with your chosen features 121 | # - Try to increase the AUC by selecting different sets of features 122 | 123 | # new list of columns (including dummy columns) 124 | bank.columns 125 | 126 | 127 | # create X (including 13 dummy columns) 128 | feature_cols = ['default', 'contact', 'previous', 'euribor3m'] + list(bank.columns[-13:]) 129 | X = bank[feature_cols] 130 | 131 | 132 | # create y 133 | y = bank.outcome 134 | 135 | 136 | # calculate cross-validated AUC 137 | from sklearn.linear_model import LogisticRegression 138 | from sklearn.cross_validation import cross_val_score 139 | logreg = LogisticRegression(C=1e9) 140 | cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean() 141 | -------------------------------------------------------------------------------- /code/14_bayes_theorem_iris_nb.py: -------------------------------------------------------------------------------- 1 | # # Applying Bayes' theorem to iris classification 2 | # 3 | # Can **Bayes' theorem** help us to solve a **classification problem**, namely predicting the species of an iris? 
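# [Added for reference, not part of the original notebook] Recall the general form of Bayes' theorem:
#
# $$P(A \ | \ B) = \frac {P(B \ | \ A) \times P(A)} {P(B)}$$
#
# Below, A will be a species and B will be a particular set of (rounded) measurements.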
4 | 5 | # ## Preparing the data 6 | # 7 | # We'll read the iris data into a DataFrame, and **round up** all of the measurements to the next integer: 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | # read the iris data into a DataFrame 14 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 15 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 16 | iris = pd.read_csv(url, header=None, names=col_names) 17 | iris.head() 18 | 19 | 20 | # apply the ceiling function to the numeric columns 21 | iris.loc[:, 'sepal_length':'petal_width'] = iris.loc[:, 'sepal_length':'petal_width'].apply(np.ceil) 22 | iris.head() 23 | 24 | 25 | # ## Deciding how to make a prediction 26 | # 27 | # Let's say that I have an **out-of-sample iris** with the following measurements: **7, 3, 5, 2**. How might I predict the species? 28 | 29 | # show all observations with features: 7, 3, 5, 2 30 | iris[(iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)] 31 | 32 | 33 | # count the species for these observations 34 | iris[(iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)].species.value_counts() 35 | 36 | 37 | # count the species for all observations 38 | iris.species.value_counts() 39 | 40 | 41 | # Let's frame this as a **conditional probability problem**: What is the probability of some particular species, given the measurements 7, 3, 5, and 2? 42 | # 43 | # $$P(species \ | \ 7352)$$ 44 | # 45 | # We could calculate the conditional probability for **each of the three species**, and then predict the species with the **highest probability**: 46 | # 47 | # $$P(setosa \ | \ 7352)$$ 48 | # $$P(versicolor \ | \ 7352)$$ 49 | # $$P(virginica \ | \ 7352)$$ 50 | 51 | # ## Calculating the probability of each species 52 | # 53 | # **Bayes' theorem** gives us a way to calculate these conditional probabilities. 54 | # 55 | # Let's start with **versicolor**: 56 | # 57 | # $$P(versicolor \ | \ 7352) = \frac {P(7352 \ | \ versicolor) \times P(versicolor)} {P(7352)}$$ 58 | # 59 | # We can calculate each of the terms on the right side of the equation: 60 | # 61 | # $$P(7352 \ | \ versicolor) = \frac {13} {50} = 0.26$$ 62 | # 63 | # $$P(versicolor) = \frac {50} {150} = 0.33$$ 64 | # 65 | # $$P(7352) = \frac {17} {150} = 0.11$$ 66 | # 67 | # Therefore, Bayes' theorem says the **probability of versicolor given these measurements** is: 68 | # 69 | # $$P(versicolor \ | \ 7352) = \frac {0.26 \times 0.33} {0.11} = 0.76$$ 70 | # 71 | # Let's repeat this process for **virginica** and **setosa**: 72 | # 73 | # $$P(virginica \ | \ 7352) = \frac {0.08 \times 0.33} {0.11} = 0.24$$ 74 | # 75 | # $$P(setosa \ | \ 7352) = \frac {0 \times 0.33} {0.11} = 0$$ 76 | # 77 | # We predict that the iris is a versicolor, since that species had the **highest conditional probability**. 78 | 79 | # ## Summary 80 | # 81 | # 1. We framed a **classification problem** as three conditional probability problems. 82 | # 2. We used **Bayes' theorem** to calculate those conditional probabilities. 83 | # 3. We made a **prediction** by choosing the species with the highest conditional probability. 
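# [Added sketch, not part of the original notebook] The same three conditional probabilities can
# be computed directly with pandas, using the rounded `iris` DataFrame from above:
obs = (iris.sepal_length==7) & (iris.sepal_width==3) & (iris.petal_length==5) & (iris.petal_width==2)
for species in iris.species.unique():
    p_obs_given_species = obs[iris.species==species].mean()   # P(7352 | species)
    p_species = (iris.species==species).mean()                # P(species)
    p_obs = obs.mean()                                        # P(7352)
    print species, p_obs_given_species * p_species / p_obs    # P(species | 7352)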
84 | 85 | # ## Bonus: The intuition behind Bayes' theorem 86 | # 87 | # Let's make some hypothetical adjustments to the data, to demonstrate how Bayes' theorem makes intuitive sense: 88 | # 89 | # Pretend that **more of the existing versicolors had measurements of 7352:** 90 | # 91 | # - $P(7352 \ | \ versicolor)$ would increase, thus increasing the numerator. 92 | # - It would make sense that given an iris with measurements of 7352, the probability of it being a versicolor would also increase. 93 | # 94 | # Pretend that **most of the existing irises were versicolor:** 95 | # 96 | # - $P(versicolor)$ would increase, thus increasing the numerator. 97 | # - It would make sense that the probability of any iris being a versicolor (regardless of measurements) would also increase. 98 | # 99 | # Pretend that **17 of the setosas had measurements of 7352:** 100 | # 101 | # - $P(7352)$ would double, thus doubling the denominator. 102 | # - It would make sense that given an iris with measurements of 7352, the probability of it being a versicolor would be cut in half. 103 | -------------------------------------------------------------------------------- /code/14_text_data_sklearn_nb.py: -------------------------------------------------------------------------------- 1 | # # Working with Text Data and Naive Bayes in scikit-learn 2 | 3 | # ## Agenda 4 | # 5 | # **Working with text data** 6 | # 7 | # - Representing text as data 8 | # - Reading SMS data 9 | # - Vectorizing SMS data 10 | # - Examining the tokens and their counts 11 | # - Bonus: Calculating the "spamminess" of each token 12 | # 13 | # **Naive Bayes classification** 14 | # 15 | # - Building a Naive Bayes model 16 | # - Comparing Naive Bayes with logistic regression 17 | 18 | # ## Part 1: Representing text as data 19 | # 20 | # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction): 21 | # 22 | # > Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**. 23 | # 24 | # We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts": 25 | 26 | from sklearn.feature_extraction.text import CountVectorizer 27 | 28 | 29 | # start with a simple example 30 | simple_train = ['call you tonight', 'Call me a cab', 'please call me... 
PLEASE!'] 31 | 32 | 33 | # learn the 'vocabulary' of the training data 34 | vect = CountVectorizer() 35 | vect.fit(simple_train) 36 | vect.get_feature_names() 37 | 38 | 39 | # transform training data into a 'document-term matrix' 40 | simple_train_dtm = vect.transform(simple_train) 41 | simple_train_dtm 42 | 43 | 44 | # print the sparse matrix 45 | print simple_train_dtm 46 | 47 | 48 | # convert sparse matrix to a dense matrix 49 | simple_train_dtm.toarray() 50 | 51 | 52 | # examine the vocabulary and document-term matrix together 53 | import pandas as pd 54 | pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names()) 55 | 56 | 57 | # From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction): 58 | # 59 | # > In this scheme, features and samples are defined as follows: 60 | # 61 | # > - Each individual token occurrence frequency (normalized or not) is treated as a **feature**. 62 | # > - The vector of all the token frequencies for a given document is considered a multivariate **sample**. 63 | # 64 | # > A **corpus of documents** can thus be represented by a matrix with **one row per document** and **one column per token** (e.g. word) occurring in the corpus. 65 | # 66 | # > We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. 67 | 68 | # transform testing data into a document-term matrix (using existing vocabulary) 69 | simple_test = ["please don't call me"] 70 | simple_test_dtm = vect.transform(simple_test) 71 | simple_test_dtm.toarray() 72 | 73 | 74 | # examine the vocabulary and document-term matrix together 75 | pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names()) 76 | 77 | 78 | # **Summary:** 79 | # 80 | # - `vect.fit(train)` learns the vocabulary of the training data 81 | # - `vect.transform(train)` uses the fitted vocabulary to build a document-term matrix from the training data 82 | # - `vect.transform(test)` uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before) 83 | 84 | # ## Part 2: Reading SMS data 85 | 86 | # read tab-separated file 87 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv' 88 | col_names = ['label', 'message'] 89 | sms = pd.read_table(url, sep='\t', header=None, names=col_names) 90 | print sms.shape 91 | 92 | 93 | sms.head(20) 94 | 95 | 96 | sms.label.value_counts() 97 | 98 | 99 | # convert label to a numeric variable 100 | sms['label'] = sms.label.map({'ham':0, 'spam':1}) 101 | 102 | 103 | # define X and y 104 | X = sms.message 105 | y = sms.label 106 | 107 | 108 | # split into training and testing sets 109 | from sklearn.cross_validation import train_test_split 110 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 111 | print X_train.shape 112 | print X_test.shape 113 | 114 | 115 | # ## Part 3: Vectorizing SMS data 116 | 117 | # instantiate the vectorizer 118 | vect = CountVectorizer() 119 | 120 | 121 | # learn training data vocabulary, then create document-term matrix 122 | vect.fit(X_train) 123 | X_train_dtm = vect.transform(X_train) 124 | X_train_dtm 125 | 126 | 127 | # alternative: combine fit and 
transform into a single step 128 | X_train_dtm = vect.fit_transform(X_train) 129 | X_train_dtm 130 | 131 | 132 | # transform testing data (using fitted vocabulary) into a document-term matrix 133 | X_test_dtm = vect.transform(X_test) 134 | X_test_dtm 135 | 136 | 137 | # ## Part 4: Examining the tokens and their counts 138 | 139 | # store token names 140 | X_train_tokens = vect.get_feature_names() 141 | 142 | 143 | # first 50 tokens 144 | print X_train_tokens[:50] 145 | 146 | 147 | # last 50 tokens 148 | print X_train_tokens[-50:] 149 | 150 | 151 | # view X_train_dtm as a dense matrix 152 | X_train_dtm.toarray() 153 | 154 | 155 | # count how many times EACH token appears across ALL messages in X_train_dtm 156 | import numpy as np 157 | X_train_counts = np.sum(X_train_dtm.toarray(), axis=0) 158 | X_train_counts 159 | 160 | 161 | X_train_counts.shape 162 | 163 | 164 | # create a DataFrame of tokens with their counts 165 | pd.DataFrame({'token':X_train_tokens, 'count':X_train_counts}).sort('count') 166 | 167 | 168 | # ## Bonus: Calculating the "spamminess" of each token 169 | 170 | # create separate DataFrames for ham and spam 171 | sms_ham = sms[sms.label==0] 172 | sms_spam = sms[sms.label==1] 173 | 174 | 175 | # learn the vocabulary of ALL messages and save it 176 | vect.fit(sms.message) 177 | all_tokens = vect.get_feature_names() 178 | 179 | 180 | # create document-term matrices for ham and spam 181 | ham_dtm = vect.transform(sms_ham.message) 182 | spam_dtm = vect.transform(sms_spam.message) 183 | 184 | 185 | # count how many times EACH token appears across ALL ham messages 186 | ham_counts = np.sum(ham_dtm.toarray(), axis=0) 187 | 188 | 189 | # count how many times EACH token appears across ALL spam messages 190 | spam_counts = np.sum(spam_dtm.toarray(), axis=0) 191 | 192 | 193 | # create a DataFrame of tokens with their separate ham and spam counts 194 | token_counts = pd.DataFrame({'token':all_tokens, 'ham':ham_counts, 'spam':spam_counts}) 195 | 196 | 197 | # add one to ham and spam counts to avoid dividing by zero (in the step that follows) 198 | token_counts['ham'] = token_counts.ham + 1 199 | token_counts['spam'] = token_counts.spam + 1 200 | 201 | 202 | # calculate ratio of spam-to-ham for each token 203 | token_counts['spam_ratio'] = token_counts.spam / token_counts.ham 204 | token_counts.sort('spam_ratio') 205 | 206 | 207 | # ## Part 5: Building a Naive Bayes model 208 | # 209 | # We will use [Multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html): 210 | # 211 | # > The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work. 
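# [Added for intuition, not part of the original lesson] Under the "naive" conditional
# independence assumption, multinomial Naive Bayes scores each class with the product of
# per-token likelihoods, weighted by the class prior:
#
# $$P(spam \ | \ message) \propto P(spam) \times \prod_i P(token_i \ | \ spam)^{count_i}$$
#
# which is why the per-token spam-to-ham ratios computed in the bonus section above are closely
# related to what the classifier learns.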
212 | 213 | # train a Naive Bayes model using X_train_dtm 214 | from sklearn.naive_bayes import MultinomialNB 215 | nb = MultinomialNB() 216 | nb.fit(X_train_dtm, y_train) 217 | 218 | 219 | # make class predictions for X_test_dtm 220 | y_pred_class = nb.predict(X_test_dtm) 221 | 222 | 223 | # calculate accuracy of class predictions 224 | from sklearn import metrics 225 | print metrics.accuracy_score(y_test, y_pred_class) 226 | 227 | 228 | # confusion matrix 229 | print metrics.confusion_matrix(y_test, y_pred_class) 230 | 231 | 232 | # predict (poorly calibrated) probabilities 233 | y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1] 234 | y_pred_prob 235 | 236 | 237 | # calculate AUC 238 | print metrics.roc_auc_score(y_test, y_pred_prob) 239 | 240 | 241 | # print message text for the false positives 242 | X_test[y_test < y_pred_class] 243 | 244 | 245 | # print message text for the false negatives 246 | X_test[y_test > y_pred_class] 247 | 248 | 249 | # what do you notice about the false negatives? 250 | X_test[3132] 251 | 252 | 253 | # ## Part 6: Comparing Naive Bayes with logistic regression 254 | 255 | # import/instantiate/fit 256 | from sklearn.linear_model import LogisticRegression 257 | logreg = LogisticRegression(C=1e9) 258 | logreg.fit(X_train_dtm, y_train) 259 | 260 | 261 | # class predictions and predicted probabilities 262 | y_pred_class = logreg.predict(X_test_dtm) 263 | y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1] 264 | 265 | 266 | # calculate accuracy and AUC 267 | print metrics.accuracy_score(y_test, y_pred_class) 268 | print metrics.roc_auc_score(y_test, y_pred_prob) 269 | -------------------------------------------------------------------------------- /code/14_types_of_naive_bayes_nb.py: -------------------------------------------------------------------------------- 1 | # # Comparing Multinomial and Gaussian Naive Bayes 2 | # 3 | # scikit-learn documentation: [MultinomialNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html) and [GaussianNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html) 4 | # 5 | # Dataset: [Pima Indians Diabetes](https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes) from the UCI Machine Learning Repository 6 | 7 | # read the data 8 | import pandas as pd 9 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data' 10 | col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label'] 11 | pima = pd.read_csv(url, header=None, names=col_names) 12 | 13 | 14 | # notice that all features are continuous 15 | pima.head() 16 | 17 | 18 | # create X and y 19 | X = pima.drop('label', axis=1) 20 | y = pima.label 21 | 22 | 23 | # split into training and testing sets 24 | from sklearn.cross_validation import train_test_split 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 26 | 27 | 28 | # import both Multinomial and Gaussian Naive Bayes 29 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 30 | from sklearn import metrics 31 | 32 | 33 | # testing accuracy of Multinomial Naive Bayes 34 | mnb = MultinomialNB() 35 | mnb.fit(X_train, y_train) 36 | y_pred_class = mnb.predict(X_test) 37 | print metrics.accuracy_score(y_test, y_pred_class) 38 | 39 | 40 | # testing accuracy of Gaussian Naive Bayes 41 | gnb = GaussianNB() 42 | gnb.fit(X_train, y_train) 43 | y_pred_class = gnb.predict(X_test) 44 | print metrics.accuracy_score(y_test, y_pred_class) 45 | 46 | 47 | # 
**Conclusion:** When applying Naive Bayes classification to a dataset with **continuous features**, it is better to use Gaussian Naive Bayes than Multinomial Naive Bayes. The latter is suitable for datasets containing **discrete features** (e.g., word counts). 48 | # 49 | # Wikipedia has a short [description](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes) of Gaussian Naive Bayes, as well as an excellent [example](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification) of its usage. 50 | -------------------------------------------------------------------------------- /code/16_kaggle_minimal.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Kaggle Stack Overflow competition (minimal code file) 3 | ''' 4 | 5 | import pandas as pd 6 | 7 | # define a function that takes a CSV file and returns a DataFrame (with new or modified features) 8 | def make_features(filename): 9 | df = pd.read_csv(filename, index_col=0) 10 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 11 | df['TitleLength'] = df.Title.apply(len) 12 | return df 13 | 14 | # apply function to both training and testing files 15 | train = make_features('train.csv') 16 | test = make_features('test.csv') 17 | 18 | 19 | ''' 20 | Create a model with three features 21 | ''' 22 | 23 | # define X and y 24 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength'] 25 | X = train[feature_cols] 26 | y = train.OpenStatus 27 | 28 | # fit a logistic regression model 29 | from sklearn.linear_model import LogisticRegression 30 | logreg = LogisticRegression(C=1e9) 31 | logreg.fit(X, y) 32 | 33 | # predict class probabilities for the actual testing data 34 | X_oos = test[feature_cols] 35 | oos_pred_prob = logreg.predict_proba(X_oos)[:, 1] 36 | 37 | 38 | ''' 39 | Create a submission file 40 | ''' 41 | 42 | # create a DataFrame that has 'id' as the index, then export to a CSV file 43 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 44 | sub.to_csv('sub1.csv') # 0.687 45 | 46 | 47 | ''' 48 | Update make_features and create another submission file 49 | ''' 50 | 51 | import numpy as np 52 | 53 | # update the function 54 | def make_features(filename): 55 | df = pd.read_csv(filename, index_col=0, parse_dates=['OwnerCreationDate', 'PostCreationDate']) 56 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 57 | df['TitleLength'] = df.Title.apply(len) 58 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 59 | df['OwnerAge'] = (df.PostCreationDate - df.OwnerCreationDate).dt.days 60 | df['OwnerAge'] = np.where(df.OwnerAge < 0, 0, df.OwnerAge) 61 | return df 62 | 63 | # apply function to both training and testing files 64 | train = make_features('train.csv') 65 | test = make_features('test.csv') 66 | 67 | # train the model on ALL data 68 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'NumTags', 'OwnerAge'] 69 | X = train[feature_cols] 70 | logreg.fit(X, y) 71 | 72 | # predict class probabilities for the actual testing data 73 | X_oos = test[feature_cols] 74 | oos_pred_prob = logreg.predict_proba(X_oos)[:, 1] 75 | 76 | # create submission file 77 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 78 | sub.to_csv('sub2.csv') # 0.650 79 | 80 | 81 | ''' 82 | Build a document-term matrix from Title using CountVectorizer 83 | ''' 84 | 85 | # build document-term matrix for the training data 
86 | from sklearn.feature_extraction.text import CountVectorizer 87 | vect = CountVectorizer(stop_words='english') 88 | dtm = vect.fit_transform(train.Title) 89 | 90 | # define X and y 91 | X = dtm 92 | y = train.OpenStatus 93 | 94 | # build document-term matrix for the actual testing data and make predictions 95 | oos_dtm = vect.transform(test.Title) 96 | from sklearn.naive_bayes import MultinomialNB 97 | nb = MultinomialNB() 98 | nb.fit(X, y) 99 | oos_pred_prob = nb.predict_proba(oos_dtm)[:, 1] 100 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 101 | sub.to_csv('sub3.csv') # 0.544 102 | 103 | 104 | ''' 105 | BONUS: Dummy encoding of Tag1 106 | ''' 107 | 108 | # convert Tag1 from strings to integers 109 | from sklearn.preprocessing import LabelEncoder 110 | le = LabelEncoder() 111 | train['Tag1_enc'] = le.fit_transform(train.Tag1) 112 | 113 | # create a dummy column for each value of Tag1_enc (returns a sparse matrix) 114 | from sklearn.preprocessing import OneHotEncoder 115 | ohe = OneHotEncoder() 116 | tag1_dummies = ohe.fit_transform(train[['Tag1_enc']]) 117 | 118 | # adjust Tag1 on testing set since LabelEncoder errors on new values during a transform 119 | test['Tag1'] = test['Tag1'].map(lambda s: '' if s not in le.classes_ else s) 120 | le.classes_ = np.append(le.classes_, '') 121 | 122 | # define X and y 123 | X = tag1_dummies 124 | y = train.OpenStatus 125 | 126 | # apply the same encoding to the actual testing data and make predictions 127 | test['Tag1_enc'] = le.transform(test.Tag1) 128 | oos_tag1_dummies = ohe.transform(test[['Tag1_enc']]) 129 | nb.fit(X, y) 130 | oos_pred_prob = nb.predict_proba(oos_tag1_dummies)[:, 1] 131 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id') 132 | sub.to_csv('sub4.csv') # 0.652 133 | -------------------------------------------------------------------------------- /code/17_bikeshare_exercise_nb.py: -------------------------------------------------------------------------------- 1 | # # Exercise with Capital Bikeshare data 2 | 3 | # ## Introduction 4 | # 5 | # - Capital Bikeshare dataset from Kaggle: [data](https://github.com/justmarkham/DAT8/blob/master/data/bikeshare.csv), [data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data) 6 | # - Each observation represents the bikeshare rentals initiated during a given hour of a given day 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.cross_validation import cross_val_score 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.tree import DecisionTreeRegressor, export_graphviz 13 | 14 | 15 | # read the data and set "datetime" as the index 16 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv' 17 | bikes = pd.read_csv(url, index_col='datetime', parse_dates=True) 18 | 19 | 20 | # "count" is a method, so it's best to rename that column 21 | bikes.rename(columns={'count':'total'}, inplace=True) 22 | 23 | 24 | # create "hour" as its own feature 25 | bikes['hour'] = bikes.index.hour 26 | 27 | 28 | bikes.head() 29 | 30 | 31 | bikes.tail() 32 | 33 | 34 | # - **hour** ranges from 0 (midnight) through 23 (11pm) 35 | # - **workingday** is either 0 (weekend or holiday) or 1 (non-holiday weekday) 36 | 37 | # ## Task 1 38 | # 39 | # Run these two `groupby` statements and figure out what they tell you about the data. 
40 | 41 | # mean rentals for each value of "workingday" 42 | bikes.groupby('workingday').total.mean() 43 | 44 | 45 | # mean rentals for each value of "hour" 46 | bikes.groupby('hour').total.mean() 47 | 48 | 49 | # ## Task 2 50 | # 51 | # Run this plotting code, and make sure you understand the output. Then, separate this plot into two separate plots conditioned on "workingday". (In other words, one plot should display the hourly trend for "workingday=0", and the other should display the hourly trend for "workingday=1".) 52 | 53 | # mean rentals for each value of "hour" 54 | bikes.groupby('hour').total.mean().plot() 55 | 56 | 57 | # hourly rental trend for "workingday=0" 58 | bikes[bikes.workingday==0].groupby('hour').total.mean().plot() 59 | 60 | 61 | # hourly rental trend for "workingday=1" 62 | bikes[bikes.workingday==1].groupby('hour').total.mean().plot() 63 | 64 | 65 | # combine the two plots 66 | bikes.groupby(['hour', 'workingday']).total.mean().unstack().plot() 67 | 68 | 69 | # ## Task 3 70 | # 71 | # Fit a linear regression model to the entire dataset, using "total" as the response and "hour" and "workingday" as the only features. Then, print the coefficients and interpret them. What are the limitations of linear regression in this instance? 72 | 73 | # create X and y 74 | feature_cols = ['hour', 'workingday'] 75 | X = bikes[feature_cols] 76 | y = bikes.total 77 | 78 | 79 | # fit a linear regression model and print coefficients 80 | linreg = LinearRegression() 81 | linreg.fit(X, y) 82 | linreg.coef_ 83 | 84 | 85 | # ## Task 4 86 | # 87 | # Use 10-fold cross-validation to calculate the RMSE for the linear regression model. 88 | 89 | # save the 10 MSE scores output by cross_val_score 90 | scores = cross_val_score(linreg, X, y, cv=10, scoring='mean_squared_error') 91 | 92 | 93 | # convert MSE to RMSE, and then calculate the mean of the 10 RMSE scores 94 | np.mean(np.sqrt(-scores)) 95 | 96 | 97 | # ## Task 5 98 | # 99 | # Use 10-fold cross-validation to evaluate a decision tree model with those same features (fit to any "max_depth" you choose). 100 | 101 | # evaluate a decision tree model with "max_depth=7" 102 | treereg = DecisionTreeRegressor(max_depth=7, random_state=1) 103 | scores = cross_val_score(treereg, X, y, cv=10, scoring='mean_squared_error') 104 | np.mean(np.sqrt(-scores)) 105 | 106 | 107 | # ## Task 6 108 | # 109 | # Fit a decision tree model to the entire dataset using "max_depth=3", and create a tree diagram using Graphviz. Then, figure out what each leaf represents. What did the decision tree learn that a linear regression model could not learn? 110 | 111 | # fit a decision tree model with "max_depth=3" 112 | treereg = DecisionTreeRegressor(max_depth=3, random_state=1) 113 | treereg.fit(X, y) 114 | 115 | 116 | # create a Graphviz file 117 | export_graphviz(treereg, out_file='tree_bikeshare.dot', feature_names=feature_cols) 118 | 119 | # At the command line, run this to convert to PNG: 120 | # dot -Tpng tree_bikeshare.dot -o tree_bikeshare.png 121 | 122 | 123 | # ![Tree for bikeshare data](images/tree_bikeshare.png) 124 | -------------------------------------------------------------------------------- /code/19_advanced_sklearn_nb.py: -------------------------------------------------------------------------------- 1 | # # Advanced scikit-learn 2 | 3 | # ## Agenda 4 | # 5 | # - StandardScaler 6 | # - Pipeline (bonus content) 7 | 8 | # ## StandardScaler 9 | # 10 | # ### What is the problem we're trying to solve? 
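# (in the fake data below, "rings" is measured on a much larger scale than "length"
# and "mass", so it dominates the distance calculations that KNN relies on)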
11 | 12 | # fake data 13 | import pandas as pd 14 | train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]}) 15 | test = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54]}) 16 | 17 | 18 | # training data 19 | train 20 | 21 | 22 | # testing data 23 | test 24 | 25 | 26 | # define X and y 27 | feature_cols = ['length', 'mass', 'rings'] 28 | X = train[feature_cols] 29 | y = train.id 30 | 31 | 32 | # KNN with K=1 33 | from sklearn.neighbors import KNeighborsClassifier 34 | knn = KNeighborsClassifier(n_neighbors=1) 35 | knn.fit(X, y) 36 | 37 | 38 | # what "should" it predict? 39 | knn.predict(test) 40 | 41 | 42 | # allow plots to appear in the notebook 43 | import matplotlib.pyplot as plt 44 | plt.rcParams['font.size'] = 14 45 | plt.rcParams['figure.figsize'] = (5, 5) 46 | 47 | 48 | # create a "colors" array for plotting 49 | import numpy as np 50 | colors = np.array(['red', 'green', 'blue']) 51 | 52 | 53 | # scatter plot of training data, colored by id (0=red, 1=green, 2=blue) 54 | plt.scatter(train.mass, train.rings, c=colors[train.id], s=50) 55 | 56 | # testing data 57 | plt.scatter(test.mass, test.rings, c='white', s=50) 58 | 59 | # add labels 60 | plt.xlabel('mass') 61 | plt.ylabel('rings') 62 | plt.title('How we interpret the data') 63 | 64 | 65 | # adjust the x-limits 66 | plt.scatter(train.mass, train.rings, c=colors[train.id], s=50) 67 | plt.scatter(test.mass, test.rings, c='white', s=50) 68 | plt.xlabel('mass') 69 | plt.ylabel('rings') 70 | plt.title('How KNN interprets the data') 71 | plt.xlim(0, 30) 72 | 73 | 74 | # ### How does StandardScaler solve the problem? 75 | # 76 | # [StandardScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) is used for the "standardization" of features, also known as "center and scale" or "z-score normalization". 
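# (standardization rescales each feature to z = (x - mean) / standard_deviation,
# so every feature ends up with mean 0 and standard deviation 1 on the training data)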
77 | 78 | # standardize the features 79 | from sklearn.preprocessing import StandardScaler 80 | scaler = StandardScaler() 81 | scaler.fit(X) 82 | X_scaled = scaler.transform(X) 83 | 84 | 85 | # original values 86 | X.values 87 | 88 | 89 | # standardized values 90 | X_scaled 91 | 92 | 93 | # figure out how it standardized 94 | print scaler.mean_ 95 | print scaler.std_ 96 | 97 | 98 | # manually standardize 99 | (X.values - scaler.mean_) / scaler.std_ 100 | 101 | 102 | # ### Applying StandardScaler to a real dataset 103 | # 104 | # - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine) 105 | # - **Goal:** Predict the origin of wine using chemical analysis 106 | 107 | # read three columns from the dataset into a DataFrame 108 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' 109 | col_names = ['label', 'color', 'proline'] 110 | wine = pd.read_csv(url, header=None, names=col_names, usecols=[0, 10, 13]) 111 | 112 | 113 | wine.head() 114 | 115 | 116 | wine.describe() 117 | 118 | 119 | # define X and y 120 | feature_cols = ['color', 'proline'] 121 | X = wine[feature_cols] 122 | y = wine.label 123 | 124 | 125 | # split into training and testing sets 126 | from sklearn.cross_validation import train_test_split 127 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 128 | 129 | 130 | # standardize X_train 131 | scaler.fit(X_train) 132 | X_train_scaled = scaler.transform(X_train) 133 | 134 | 135 | # check that it standardized properly 136 | print X_train_scaled[:, 0].mean() 137 | print X_train_scaled[:, 0].std() 138 | print X_train_scaled[:, 1].mean() 139 | print X_train_scaled[:, 1].std() 140 | 141 | 142 | # standardize X_test 143 | X_test_scaled = scaler.transform(X_test) 144 | 145 | 146 | # is this right? 147 | print X_test_scaled[:, 0].mean() 148 | print X_test_scaled[:, 0].std() 149 | print X_test_scaled[:, 1].mean() 150 | print X_test_scaled[:, 1].std() 151 | 152 | 153 | # KNN accuracy on original data 154 | knn = KNeighborsClassifier(n_neighbors=3) 155 | knn.fit(X_train, y_train) 156 | y_pred_class = knn.predict(X_test) 157 | from sklearn import metrics 158 | print metrics.accuracy_score(y_test, y_pred_class) 159 | 160 | 161 | # KNN accuracy on scaled data 162 | knn.fit(X_train_scaled, y_train) 163 | y_pred_class = knn.predict(X_test_scaled) 164 | print metrics.accuracy_score(y_test, y_pred_class) 165 | 166 | 167 | # ## Pipeline (bonus content) 168 | # 169 | # ### What is the problem we're trying to solve? 170 | 171 | # define X and y 172 | feature_cols = ['color', 'proline'] 173 | X = wine[feature_cols] 174 | y = wine.label 175 | 176 | 177 | # proper cross-validation on the original (unscaled) data 178 | knn = KNeighborsClassifier(n_neighbors=3) 179 | from sklearn.cross_validation import cross_val_score 180 | cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean() 181 | 182 | 183 | # why is this improper cross-validation on the scaled data? 184 | scaler = StandardScaler() 185 | X_scaled = scaler.fit_transform(X) 186 | cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean() 187 | 188 | 189 | # ### How does Pipeline solve the problem? 
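# (the cross-validation above was improper because the scaler was fit on the full
# dataset before splitting, leaking information about the held-out folds into the
# training folds; a Pipeline re-fits the scaler on only the training portion of each split)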
190 | # 191 | # [Pipeline](http://scikit-learn.org/stable/modules/pipeline.html) is used for chaining steps together: 192 | 193 | # fix the cross-validation process using Pipeline 194 | from sklearn.pipeline import make_pipeline 195 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3)) 196 | cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean() 197 | 198 | 199 | # Pipeline can also be used with [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) for parameter searching: 200 | 201 | # search for an optimal n_neighbors value using GridSearchCV 202 | neighbors_range = range(1, 21) 203 | param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range) 204 | from sklearn.grid_search import GridSearchCV 205 | grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy') 206 | grid.fit(X, y) 207 | print grid.best_score_ 208 | print grid.best_params_ 209 | -------------------------------------------------------------------------------- /code/19_clustering_nb.py: -------------------------------------------------------------------------------- 1 | # # Clustering 2 | 3 | # ## Agenda: 4 | # 5 | # 1. K-means clustering 6 | # 2. Clustering evaluation 7 | # 3. DBSCAN clustering 8 | 9 | # beer dataset 10 | import pandas as pd 11 | url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/beer.txt' 12 | beer = pd.read_csv(url, sep=' ') 13 | beer 14 | 15 | 16 | # How would you cluster these beers? 17 | 18 | # define X 19 | X = beer.drop('name', axis=1) 20 | 21 | 22 | # What happened to y? 23 | 24 | # ## Part 1: K-means clustering 25 | 26 | # K-means with 3 clusters 27 | from sklearn.cluster import KMeans 28 | km = KMeans(n_clusters=3, random_state=1) 29 | km.fit(X) 30 | 31 | 32 | # review the cluster labels 33 | km.labels_ 34 | 35 | 36 | # save the cluster labels and sort by cluster 37 | beer['cluster'] = km.labels_ 38 | beer.sort('cluster') 39 | 40 | 41 | # What do the clusters seem to be based on? Why? 
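# (each row of cluster_centers_ below is one cluster's centroid: the mean of every
# feature for the observations assigned to that cluster, in the original units)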
42 | 43 | # review the cluster centers 44 | km.cluster_centers_ 45 | 46 | 47 | # calculate the mean of each feature for each cluster 48 | beer.groupby('cluster').mean() 49 | 50 | 51 | # save the DataFrame of cluster centers 52 | centers = beer.groupby('cluster').mean() 53 | 54 | 55 | # allow plots to appear in the notebook 56 | import matplotlib.pyplot as plt 57 | plt.rcParams['font.size'] = 14 58 | 59 | 60 | # create a "colors" array for plotting 61 | import numpy as np 62 | colors = np.array(['red', 'green', 'blue', 'yellow']) 63 | 64 | 65 | # scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue) 66 | plt.scatter(beer.calories, beer.alcohol, c=colors[beer.cluster], s=50) 67 | 68 | # cluster centers, marked by "+" 69 | plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black') 70 | 71 | # add labels 72 | plt.xlabel('calories') 73 | plt.ylabel('alcohol') 74 | 75 | 76 | # scatter plot matrix (0=red, 1=green, 2=blue) 77 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 78 | 79 | 80 | # ### Repeat with scaled data 81 | 82 | # center and scale the data 83 | from sklearn.preprocessing import StandardScaler 84 | scaler = StandardScaler() 85 | X_scaled = scaler.fit_transform(X) 86 | 87 | 88 | # K-means with 3 clusters on scaled data 89 | km = KMeans(n_clusters=3, random_state=1) 90 | km.fit(X_scaled) 91 | 92 | 93 | # save the cluster labels and sort by cluster 94 | beer['cluster'] = km.labels_ 95 | beer.sort('cluster') 96 | 97 | 98 | # What are the "characteristics" of each cluster? 99 | 100 | # review the cluster centers 101 | beer.groupby('cluster').mean() 102 | 103 | 104 | # scatter plot matrix of new cluster assignments (0=red, 1=green, 2=blue) 105 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 106 | 107 | 108 | # Do you notice any cluster assignments that seem a bit odd? How might we explain those? 109 | 110 | # ## Part 2: Clustering evaluation 111 | # 112 | # The [Silhouette Coefficient](http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient) is a common metric for evaluating clustering "performance" in situations when the "true" cluster assignments are not known. 113 | # 114 | # A Silhouette Coefficient is calculated for **each observation**: 115 | # 116 | # $$SC = \frac{b-a} {max(a, b)}$$ 117 | # 118 | # - a = mean distance to all other points in **its cluster** 119 | # - b = mean distance to all other points in **the next nearest cluster** 120 | # 121 | # It ranges from -1 (worst) to 1 (best). A **global score** is calculated by taking the mean score for all observations. 
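# worked example: the Silhouette Coefficient for a single (made-up) observation,
# assuming a = 0.5 and b = 2.0
a = 0.5   # mean distance to the other points in its own cluster
b = 2.0   # mean distance to the points in the next nearest cluster
(b - a) / max(a, b)   # 0.75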
122 | 123 | # calculate SC for K=3 124 | from sklearn import metrics 125 | metrics.silhouette_score(X_scaled, km.labels_) 126 | 127 | 128 | # calculate SC for K=2 through K=19 129 | k_range = range(2, 20) 130 | scores = [] 131 | for k in k_range: 132 | km = KMeans(n_clusters=k, random_state=1) 133 | km.fit(X_scaled) 134 | scores.append(metrics.silhouette_score(X_scaled, km.labels_)) 135 | 136 | 137 | # plot the results 138 | plt.plot(k_range, scores) 139 | plt.xlabel('Number of clusters') 140 | plt.ylabel('Silhouette Coefficient') 141 | plt.grid(True) 142 | 143 | 144 | # K-means with 4 clusters on scaled data 145 | km = KMeans(n_clusters=4, random_state=1) 146 | km.fit(X_scaled) 147 | beer['cluster'] = km.labels_ 148 | beer.sort('cluster') 149 | 150 | 151 | # ## Part 3: DBSCAN clustering 152 | 153 | # DBSCAN with eps=1 and min_samples=3 154 | from sklearn.cluster import DBSCAN 155 | db = DBSCAN(eps=1, min_samples=3) 156 | db.fit(X_scaled) 157 | 158 | 159 | # review the cluster labels 160 | db.labels_ 161 | 162 | 163 | # save the cluster labels and sort by cluster 164 | beer['cluster'] = db.labels_ 165 | beer.sort('cluster') 166 | 167 | 168 | # review the cluster centers 169 | beer.groupby('cluster').mean() 170 | 171 | 172 | # scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow) 173 | pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100) 174 | -------------------------------------------------------------------------------- /code/20_regex_exercise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | EXERCISE: Regular Expressions 3 | ''' 4 | 5 | # open file and store each line as one list element 6 | with open('homicides.txt', mode='rU') as f: 7 | data = [row for row in f] 8 | 9 | 10 | ''' 11 | Create a list of ages 12 | ''' 13 | 14 | import re 15 | 16 | ages = [] 17 | for row in data: 18 | match = re.search(r'\d+ years? old', row) 19 | if match: 20 | ages.append(match.group()) 21 | else: 22 | ages.append('0') 23 | 24 | # split the string on spaces, only keep the first element, and convert to int 25 | ages = [int(element.split()[0]) for element in ages] 26 | 27 | # calculate average age 28 | sum(ages) / float(len(ages)) 29 | 30 | # check that 'data' and 'ages' are the same length 31 | assert(len(data)==len(ages)) 32 | 33 | 34 | ''' 35 | Create a list of ages (using match groups) 36 | ''' 37 | 38 | ages = [] 39 | for row in data: 40 | match = re.search(r'(\d+)( years? old)', row) 41 | if match: 42 | ages.append(int(match.group(1))) 43 | else: 44 | ages.append(0) 45 | 46 | 47 | ''' 48 | Create a list of causes 49 | ''' 50 | 51 | causes = [] 52 | for row in data: 53 | match = re.search(r'Cause: (.+?)<', row) 54 | if match: 55 | causes.append(match.group(1).lower()) 56 | else: 57 | causes.append('unknown') 58 | 59 | # tally the causes 60 | from collections import Counter 61 | Counter(causes) 62 | -------------------------------------------------------------------------------- /code/20_regex_reference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | REFERENCE GUIDE: Regular Expressions 3 | ''' 4 | 5 | ''' 6 | Rules for Searching: 7 | 8 | Search proceeds through string from start to end, stopping at first match 9 | All of the pattern must be matched 10 | 11 | Basic Patterns: 12 | 13 | Ordinary characters match themselves exactly 14 | . 
matches any single character except newline \n 15 | \w matches a word character (letter, digit, underscore) 16 | \W matches any non-word character 17 | \b matches boundary between word and non-word 18 | \s matches single whitespace character (space, newline, return, tab, form) 19 | \S matches single non-whitespace character 20 | \d matches single digit (0 through 9) 21 | \t matches tab 22 | \n matches newline 23 | \r matches return 24 | \ match a special character, such as period: \. 25 | 26 | Basic Python Usage: 27 | 28 | match = re.search(r'pattern', string_to_search) 29 | Returns match object 30 | If there is a match, access match using match.group() 31 | If there is no match, match is None 32 | Use 'r' in front of pattern to designate a raw string 33 | ''' 34 | 35 | import re 36 | 37 | s = 'my 1st string!!' 38 | 39 | match = re.search(r'my', s) # returns match object 40 | if match: # checks whether match was found 41 | print match.group() # if match was found, then print result 42 | 43 | re.search(r'my', s).group() # single-line version (without error handling) 44 | re.search(r'st', s).group() # 'st' 45 | re.search(r'sta', s).group() # error 46 | re.search(r'\w\w\w', s).group() # '1st' 47 | re.search(r'\W', s).group() # ' ' 48 | re.search(r'\W\W', s).group() # '!!' 49 | re.search(r'\s', s).group() # ' ' 50 | re.search(r'\s\s', s).group() # error 51 | re.search(r'..t', s).group() # '1st' 52 | re.search(r'\s\St', s).group() # ' st' 53 | re.search(r'\bst', s).group() # 'st' 54 | 55 | 56 | ''' 57 | Repetition: 58 | 59 | + 1 or more occurrences of the pattern to its left 60 | * 0 or more occurrences of the pattern to its left 61 | ? 0 or 1 occurrence of the pattern to its left 62 | 63 | + and * are 'greedy': they try to use up as much of the string as possible 64 | 65 | Add ? after + or * to make them 'lazy': +? or *? 66 | ''' 67 | 68 | s = 'sid is missing class' 69 | 70 | re.search(r'miss\w+', s).group() # 'missing' 71 | re.search(r'is\w+', s).group() # 'issing' 72 | re.search(r'is\w*', s).group() # 'is' 73 | 74 | s = '
<h1>my heading</h1>' 75 | 76 | re.search(r'<.+>', s).group() # '<h1>my heading</h1>' 77 | re.search(r'<.+?>', s).group() # '<h1>
' 78 | 79 | 80 | ''' 81 | Positions: 82 | 83 | ^ match start of a string 84 | $ match end of a string 85 | ''' 86 | 87 | s = 'sid is missing class' 88 | 89 | re.search(r'^miss', s).group() # error 90 | re.search(r'..ss', s).group() # 'miss' 91 | re.search(r'..ss$', s).group() # 'lass' 92 | 93 | 94 | ''' 95 | Brackets: 96 | 97 | [abc] match a or b or c 98 | \w, \s, etc. work inside brackets, except period just means a literal period 99 | [a-z] match any lowercase letter (dash indicates range unless it's last) 100 | [abc-] match a or b or c or - 101 | [^ab] match anything except a or b 102 | ''' 103 | 104 | s = 'my email is john-doe@gmail.com' 105 | 106 | re.search(r'\w+@\w+', s).group() # 'doe@gmail' 107 | re.search(r'[\w.-]+@[\w.-]+', s).group() # 'john-doe@gmail.com' 108 | 109 | 110 | ''' 111 | Lookarounds: 112 | 113 | Lookahead matches a pattern only if it is followed by another pattern 114 | 100(?= dollars) matches '100' only if it is followed by ' dollars' 115 | 116 | Lookbehind matches a pattern only if it is preceded by another pattern 117 | (?<=\$)100 matches '100' only if it is preceded by '$' 118 | ''' 119 | 120 | s = 'Name: Cindy, 30 years old' 121 | 122 | re.search(r'\d+(?= years? old)', s).group() # '30' 123 | re.search(r'(?<=Name: )\w+', s).group() # 'Cindy' 124 | 125 | 126 | ''' 127 | Match Groups: 128 | 129 | Parentheses create logical groups inside of match text 130 | match.group(1) corresponds to first group 131 | match.group(2) corresponds to second group 132 | match.group() corresponds to entire match text (as usual) 133 | ''' 134 | 135 | s = 'my email is john-doe@gmail.com' 136 | 137 | match = re.search(r'([\w.-]+)@([\w.-]+)', s) 138 | if match: 139 | match.group(1) # 'john-doe' 140 | match.group(2) # 'gmail.com' 141 | match.group() # 'john-doe@gmail.com' 142 | 143 | 144 | ''' 145 | Finding All Matches: 146 | 147 | re.findall() finds all matches and returns them as a list of strings 148 | list_of_strings = re.findall(r'pattern', string_to_search) 149 | 150 | If pattern includes parentheses, a list of tuples is returned 151 | ''' 152 | 153 | s = 'emails: joe@gmail.com, bob@gmail.com' 154 | 155 | re.findall(r'[\w.-]+@[\w.-]+', s) # ['joe@gmail.com', 'bob@gmail.com'] 156 | re.findall(r'([\w.-]+)@([\w.-]+)', s) # [('joe', 'gmail.com'), ('bob', 'gmail.com')] 157 | 158 | 159 | ''' 160 | Option Flags: 161 | 162 | Options flags modify the behavior of the pattern matching 163 | 164 | default: matching is case sensitive 165 | re.IGNORECASE: ignore uppercase/lowercase differences ('a' matches 'a' or 'A') 166 | 167 | default: period matches any character except newline 168 | re.DOTALL: allow period to match newline 169 | 170 | default: within a string of many lines, ^ and $ match start and end of entire string 171 | re.MULTILINE: allow ^ and $ to match start and end of each line 172 | 173 | Option flag is third argument to re.search() or re.findall(): 174 | re.search(r'pattern', string_to_search, re.IGNORECASE) 175 | re.findall(r'pattern', string_to_search, re.IGNORECASE) 176 | ''' 177 | 178 | s = 'emails: nicole@ga.co, joe@gmail.com, PAT@GA.CO' 179 | 180 | re.findall(r'\w+@ga\.co', s) # ['nicole@ga.co'] 181 | re.findall(r'\w+@ga\.co', s, re.IGNORECASE) # ['nicole@ga.co', 'PAT@GA.CO'] 182 | 183 | 184 | ''' 185 | Substitution: 186 | 187 | re.sub() finds all matches and replaces them with a specified string 188 | new_string = re.sub(r'pattern', r'replacement', string_to_search) 189 | 190 | Replacement string can refer to text from matching groups: 191 | \1 refers to group(1) 192 | \2 
refers to group(2) 193 | etc. 194 | ''' 195 | 196 | s = 'sid is missing class' 197 | 198 | re.sub(r'is ', r'was ', s) # 'sid was missing class' 199 | 200 | s = 'emails: joe@gmail.com, bob@gmail.com' 201 | 202 | re.sub(r'([\w.-]+)@([\w.-]+)', r'\1@yahoo.com', s) # 'emails: joe@yahoo.com, bob@yahoo.com' 203 | 204 | 205 | ''' 206 | Useful to know, but not covered above: 207 | 208 | re.split() splits a string by the occurrences of a pattern 209 | re.compile() compiles a pattern (for improved performance if it's used many times) 210 | A|B indicates a pattern that can match A or B 211 | ''' 212 | -------------------------------------------------------------------------------- /data/airlines.csv: -------------------------------------------------------------------------------- 1 | airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14 2 | Aer Lingus,320906734,2,0,0,0,0,0 3 | Aeroflot*,1197672318,76,14,128,6,1,88 4 | Aerolineas Argentinas,385803648,6,0,0,1,0,0 5 | Aeromexico*,596871813,3,1,64,5,0,0 6 | Air Canada,1865253802,2,0,0,2,0,0 7 | Air France,3004002661,14,4,79,6,2,337 8 | Air India*,869253552,2,1,329,4,1,158 9 | Air New Zealand*,710174817,3,0,0,5,1,7 10 | Alaska Airlines*,965346773,5,0,0,5,1,88 11 | Alitalia,698012498,7,2,50,4,0,0 12 | All Nippon Airways,1841234177,3,1,1,7,0,0 13 | American*,5228357340,21,5,101,17,3,416 14 | Austrian Airlines,358239823,1,0,0,1,0,0 15 | Avianca,396922563,5,3,323,0,0,0 16 | British Airways*,3179760952,4,0,0,6,0,0 17 | Cathay Pacific*,2582459303,0,0,0,2,0,0 18 | China Airlines,813216487,12,6,535,2,1,225 19 | Condor,417982610,2,1,16,0,0,0 20 | COPA,550491507,3,1,47,0,0,0 21 | Delta / Northwest*,6525658894,24,12,407,24,2,51 22 | Egyptair,557699891,8,3,282,4,1,14 23 | El Al,335448023,1,1,4,1,0,0 24 | Ethiopian Airlines,488560643,25,5,167,5,2,92 25 | Finnair,506464950,1,0,0,0,0,0 26 | Garuda Indonesia,613356665,10,3,260,4,2,22 27 | Gulf Air,301379762,1,0,0,3,1,143 28 | Hawaiian Airlines,493877795,0,0,0,1,0,0 29 | Iberia,1173203126,4,1,148,5,0,0 30 | Japan Airlines,1574217531,3,1,520,0,0,0 31 | Kenya Airways,277414794,2,0,0,2,2,283 32 | KLM*,1874561773,7,1,3,1,0,0 33 | Korean Air,1734522605,12,5,425,1,0,0 34 | LAN Airlines,1001965891,3,2,21,0,0,0 35 | Lufthansa*,3426529504,6,1,2,3,0,0 36 | Malaysia Airlines,1039171244,3,1,34,3,2,537 37 | Pakistan International,348563137,8,3,234,10,2,46 38 | Philippine Airlines,413007158,7,4,74,2,1,1 39 | Qantas*,1917428984,1,0,0,5,0,0 40 | Royal Air Maroc,295705339,5,3,51,3,0,0 41 | SAS*,682971852,5,0,0,6,1,110 42 | Saudi Arabian,859673901,7,2,313,11,0,0 43 | Singapore Airlines,2376857805,2,2,6,2,1,83 44 | South African,651502442,2,1,159,1,0,0 45 | Southwest Airlines,3276525770,1,0,0,8,0,0 46 | Sri Lankan / AirLanka,325582976,2,1,14,4,0,0 47 | SWISS*,792601299,2,1,229,3,0,0 48 | TACA,259373346,3,1,3,1,1,3 49 | TAM,1509195646,8,3,98,7,2,188 50 | TAP - Air Portugal,619130754,0,0,0,0,0,0 51 | Thai Airways,1702802250,8,4,308,2,1,1 52 | Turkish Airlines,1946098294,8,3,64,8,2,84 53 | United / Continental*,7139291291,19,8,319,14,2,109 54 | US Airways / America West*,2455687887,16,7,224,11,2,23 55 | Vietnam Airlines,625084918,7,3,171,1,0,0 56 | Virgin Atlantic,1005248585,1,0,0,0,0,0 57 | Xiamen Airlines,430462962,9,1,82,2,0,0 58 | -------------------------------------------------------------------------------- /data/beer.txt: -------------------------------------------------------------------------------- 1 | name calories sodium alcohol cost 2 | 
Budweiser 144 15 4.7 0.43 3 | Schlitz 151 19 4.9 0.43 4 | Lowenbrau 157 15 0.9 0.48 5 | Kronenbourg 170 7 5.2 0.73 6 | Heineken 152 11 5.0 0.77 7 | Old_Milwaukee 145 23 4.6 0.28 8 | Augsberger 175 24 5.5 0.40 9 | Srohs_Bohemian_Style 149 27 4.7 0.42 10 | Miller_Lite 99 10 4.3 0.43 11 | Budweiser_Light 113 8 3.7 0.40 12 | Coors 140 18 4.6 0.44 13 | Coors_Light 102 15 4.1 0.46 14 | Michelob_Light 135 11 4.2 0.50 15 | Becks 150 19 4.7 0.76 16 | Kirin 149 6 5.0 0.79 17 | Pabst_Extra_Light 68 15 2.3 0.38 18 | Hamms 139 19 4.4 0.43 19 | Heilemans_Old_Style 144 24 4.9 0.43 20 | Olympia_Goled_Light 72 6 2.9 0.46 21 | Schlitz_Light 97 7 4.2 0.47 22 | -------------------------------------------------------------------------------- /data/drinks.csv: -------------------------------------------------------------------------------- 1 | country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent 2 | Afghanistan,0,0,0,0.0,AS 3 | Albania,89,132,54,4.9,EU 4 | Algeria,25,0,14,0.7,AF 5 | Andorra,245,138,312,12.4,EU 6 | Angola,217,57,45,5.9,AF 7 | Antigua & Barbuda,102,128,45,4.9,NA 8 | Argentina,193,25,221,8.3,SA 9 | Armenia,21,179,11,3.8,EU 10 | Australia,261,72,212,10.4,OC 11 | Austria,279,75,191,9.7,EU 12 | Azerbaijan,21,46,5,1.3,EU 13 | Bahamas,122,176,51,6.3,NA 14 | Bahrain,42,63,7,2.0,AS 15 | Bangladesh,0,0,0,0.0,AS 16 | Barbados,143,173,36,6.3,NA 17 | Belarus,142,373,42,14.4,EU 18 | Belgium,295,84,212,10.5,EU 19 | Belize,263,114,8,6.8,NA 20 | Benin,34,4,13,1.1,AF 21 | Bhutan,23,0,0,0.4,AS 22 | Bolivia,167,41,8,3.8,SA 23 | Bosnia-Herzegovina,76,173,8,4.6,EU 24 | Botswana,173,35,35,5.4,AF 25 | Brazil,245,145,16,7.2,SA 26 | Brunei,31,2,1,0.6,AS 27 | Bulgaria,231,252,94,10.3,EU 28 | Burkina Faso,25,7,7,4.3,AF 29 | Burundi,88,0,0,6.3,AF 30 | Cote d'Ivoire,37,1,7,4.0,AF 31 | Cabo Verde,144,56,16,4.0,AF 32 | Cambodia,57,65,1,2.2,AS 33 | Cameroon,147,1,4,5.8,AF 34 | Canada,240,122,100,8.2,NA 35 | Central African Republic,17,2,1,1.8,AF 36 | Chad,15,1,1,0.4,AF 37 | Chile,130,124,172,7.6,SA 38 | China,79,192,8,5.0,AS 39 | Colombia,159,76,3,4.2,SA 40 | Comoros,1,3,1,0.1,AF 41 | Congo,76,1,9,1.7,AF 42 | Cook Islands,0,254,74,5.9,OC 43 | Costa Rica,149,87,11,4.4,NA 44 | Croatia,230,87,254,10.2,EU 45 | Cuba,93,137,5,4.2,NA 46 | Cyprus,192,154,113,8.2,EU 47 | Czech Republic,361,170,134,11.8,EU 48 | North Korea,0,0,0,0.0,AS 49 | DR Congo,32,3,1,2.3,AF 50 | Denmark,224,81,278,10.4,EU 51 | Djibouti,15,44,3,1.1,AF 52 | Dominica,52,286,26,6.6,NA 53 | Dominican Republic,193,147,9,6.2,NA 54 | Ecuador,162,74,3,4.2,SA 55 | Egypt,6,4,1,0.2,AF 56 | El Salvador,52,69,2,2.2,NA 57 | Equatorial Guinea,92,0,233,5.8,AF 58 | Eritrea,18,0,0,0.5,AF 59 | Estonia,224,194,59,9.5,EU 60 | Ethiopia,20,3,0,0.7,AF 61 | Fiji,77,35,1,2.0,OC 62 | Finland,263,133,97,10.0,EU 63 | France,127,151,370,11.8,EU 64 | Gabon,347,98,59,8.9,AF 65 | Gambia,8,0,1,2.4,AF 66 | Georgia,52,100,149,5.4,EU 67 | Germany,346,117,175,11.3,EU 68 | Ghana,31,3,10,1.8,AF 69 | Greece,133,112,218,8.3,EU 70 | Grenada,199,438,28,11.9,NA 71 | Guatemala,53,69,2,2.2,NA 72 | Guinea,9,0,2,0.2,AF 73 | Guinea-Bissau,28,31,21,2.5,AF 74 | Guyana,93,302,1,7.1,SA 75 | Haiti,1,326,1,5.9,NA 76 | Honduras,69,98,2,3.0,NA 77 | Hungary,234,215,185,11.3,EU 78 | Iceland,233,61,78,6.6,EU 79 | India,9,114,0,2.2,AS 80 | Indonesia,5,1,0,0.1,AS 81 | Iran,0,0,0,0.0,AS 82 | Iraq,9,3,0,0.2,AS 83 | Ireland,313,118,165,11.4,EU 84 | Israel,63,69,9,2.5,AS 85 | Italy,85,42,237,6.5,EU 86 | Jamaica,82,97,9,3.4,NA 87 | Japan,77,202,16,7.0,AS 88 | Jordan,6,21,1,0.5,AS 89 | 
Kazakhstan,124,246,12,6.8,AS 90 | Kenya,58,22,2,1.8,AF 91 | Kiribati,21,34,1,1.0,OC 92 | Kuwait,0,0,0,0.0,AS 93 | Kyrgyzstan,31,97,6,2.4,AS 94 | Laos,62,0,123,6.2,AS 95 | Latvia,281,216,62,10.5,EU 96 | Lebanon,20,55,31,1.9,AS 97 | Lesotho,82,29,0,2.8,AF 98 | Liberia,19,152,2,3.1,AF 99 | Libya,0,0,0,0.0,AF 100 | Lithuania,343,244,56,12.9,EU 101 | Luxembourg,236,133,271,11.4,EU 102 | Madagascar,26,15,4,0.8,AF 103 | Malawi,8,11,1,1.5,AF 104 | Malaysia,13,4,0,0.3,AS 105 | Maldives,0,0,0,0.0,AS 106 | Mali,5,1,1,0.6,AF 107 | Malta,149,100,120,6.6,EU 108 | Marshall Islands,0,0,0,0.0,OC 109 | Mauritania,0,0,0,0.0,AF 110 | Mauritius,98,31,18,2.6,AF 111 | Mexico,238,68,5,5.5,NA 112 | Micronesia,62,50,18,2.3,OC 113 | Monaco,0,0,0,0.0,EU 114 | Mongolia,77,189,8,4.9,AS 115 | Montenegro,31,114,128,4.9,EU 116 | Morocco,12,6,10,0.5,AF 117 | Mozambique,47,18,5,1.3,AF 118 | Myanmar,5,1,0,0.1,AS 119 | Namibia,376,3,1,6.8,AF 120 | Nauru,49,0,8,1.0,OC 121 | Nepal,5,6,0,0.2,AS 122 | Netherlands,251,88,190,9.4,EU 123 | New Zealand,203,79,175,9.3,OC 124 | Nicaragua,78,118,1,3.5,NA 125 | Niger,3,2,1,0.1,AF 126 | Nigeria,42,5,2,9.1,AF 127 | Niue,188,200,7,7.0,OC 128 | Norway,169,71,129,6.7,EU 129 | Oman,22,16,1,0.7,AS 130 | Pakistan,0,0,0,0.0,AS 131 | Palau,306,63,23,6.9,OC 132 | Panama,285,104,18,7.2,NA 133 | Papua New Guinea,44,39,1,1.5,OC 134 | Paraguay,213,117,74,7.3,SA 135 | Peru,163,160,21,6.1,SA 136 | Philippines,71,186,1,4.6,AS 137 | Poland,343,215,56,10.9,EU 138 | Portugal,194,67,339,11.0,EU 139 | Qatar,1,42,7,0.9,AS 140 | South Korea,140,16,9,9.8,AS 141 | Moldova,109,226,18,6.3,EU 142 | Romania,297,122,167,10.4,EU 143 | Russian Federation,247,326,73,11.5,AS 144 | Rwanda,43,2,0,6.8,AF 145 | St. Kitts & Nevis,194,205,32,7.7,NA 146 | St. Lucia,171,315,71,10.1,NA 147 | St. 
Vincent & the Grenadines,120,221,11,6.3,NA 148 | Samoa,105,18,24,2.6,OC 149 | San Marino,0,0,0,0.0,EU 150 | Sao Tome & Principe,56,38,140,4.2,AF 151 | Saudi Arabia,0,5,0,0.1,AS 152 | Senegal,9,1,7,0.3,AF 153 | Serbia,283,131,127,9.6,EU 154 | Seychelles,157,25,51,4.1,AF 155 | Sierra Leone,25,3,2,6.7,AF 156 | Singapore,60,12,11,1.5,AS 157 | Slovakia,196,293,116,11.4,EU 158 | Slovenia,270,51,276,10.6,EU 159 | Solomon Islands,56,11,1,1.2,OC 160 | Somalia,0,0,0,0.0,AF 161 | South Africa,225,76,81,8.2,AF 162 | Spain,284,157,112,10.0,EU 163 | Sri Lanka,16,104,0,2.2,AS 164 | Sudan,8,13,0,1.7,AF 165 | Suriname,128,178,7,5.6,SA 166 | Swaziland,90,2,2,4.7,AF 167 | Sweden,152,60,186,7.2,EU 168 | Switzerland,185,100,280,10.2,EU 169 | Syria,5,35,16,1.0,AS 170 | Tajikistan,2,15,0,0.3,AS 171 | Thailand,99,258,1,6.4,AS 172 | Macedonia,106,27,86,3.9,EU 173 | Timor-Leste,1,1,4,0.1,AS 174 | Togo,36,2,19,1.3,AF 175 | Tonga,36,21,5,1.1,OC 176 | Trinidad & Tobago,197,156,7,6.4,NA 177 | Tunisia,51,3,20,1.3,AF 178 | Turkey,51,22,7,1.4,AS 179 | Turkmenistan,19,71,32,2.2,AS 180 | Tuvalu,6,41,9,1.0,OC 181 | Uganda,45,9,0,8.3,AF 182 | Ukraine,206,237,45,8.9,EU 183 | United Arab Emirates,16,135,5,2.8,AS 184 | United Kingdom,219,126,195,10.4,EU 185 | Tanzania,36,6,1,5.7,AF 186 | USA,249,158,84,8.7,NA 187 | Uruguay,115,35,220,6.6,SA 188 | Uzbekistan,25,101,8,2.4,AS 189 | Vanuatu,21,18,11,0.9,OC 190 | Venezuela,333,100,3,7.7,SA 191 | Vietnam,111,2,1,2.0,AS 192 | Yemen,6,0,0,0.1,AS 193 | Zambia,32,19,4,2.5,AF 194 | Zimbabwe,64,18,4,4.7,AF 195 | -------------------------------------------------------------------------------- /data/example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example Web Page 6 | 7 | 8 | 9 | 10 |

<html>
<head>
<title>Example Web Page</title>
</head>
<body>

<h1>DAT8 Class 7</h1>

<p>First, we are covering APIs, which are useful for getting data.</p>
<p>Then, we are covering web scraping, which is a more flexible way to get data.</p>
<p>Finally, I will ask you to fill out yet another feedback form!</p>

<h2>Resource List</h2>

<p>Here are some helpful API resources:</p>

<ul>
<li>API resource 1</li>
<li>API resource 2</li>
</ul>

<p>Here are some helpful web scraping resources:</p>

<ul>
<li>Web scraping resource 1</li>
<li>Web scraping resource 2</li>
</ul>

</body>
</html>
-------------------------------------------------------------------------------- /data/imdb_ids.txt: -------------------------------------------------------------------------------- 1 | tt0111161 2 | tt1856010 3 | tt0096694 4 | tt0088763 5 | tt1375666 6 | -------------------------------------------------------------------------------- /data/u.item: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/data/u.item -------------------------------------------------------------------------------- /data/vehicles_test.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,vtype 2 | 3000,2003,130000,4,truck 3 | 6000,2005,82500,4,car 4 | 12000,2010,60000,2,car 5 | -------------------------------------------------------------------------------- /data/vehicles_train.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,vtype 2 | 22000,2012,13000,2,car 3 | 14000,2010,30000,2,car 4 | 13000,2010,73500,4,car 5 | 9500,2009,78000,4,car 6 | 9000,2007,47000,4,car 7 | 4000,2006,124000,2,car 8 | 3000,2004,177000,4,car 9 | 2000,2004,209000,4,truck 10 | 3000,2003,138000,2,car 11 | 1900,2003,160000,4,car 12 | 2500,2003,190000,2,truck 13 | 5000,2001,62000,4,car 14 | 1800,1999,163000,2,truck 15 | 1300,1997,138000,4,car 16 | -------------------------------------------------------------------------------- /homework/02_command_line_chipotle.md: -------------------------------------------------------------------------------- 1 | ## Class 2 Homework: Command Line Chipotle 2 | 3 | #### Submitting Your Homework 4 | 5 | * Create a Markdown file that includes your answers **and** the code you used to arrive at those answers. 6 | * Add this Markdown file to a GitHub repo that you'll use for all of your coursework. 7 | * Submit a link to your repo using the homework submission form. 8 | 9 | #### Command Line Tasks 10 | 11 | 1. Look at the head and the tail of **chipotle.tsv** in the **data** subdirectory of this repo. Think for a minute about how the data is structured. What do you think each column means? What do you think each row means? Tell me! (If you're unsure, look at more of the file contents.) 12 | 2. How many orders do there appear to be? 13 | 3. How many lines are in this file? 14 | 4. Which burrito is more popular, steak or chicken? 15 | 5. Do chicken burritos more often have black beans or pinto beans? 16 | 6. Make a list of all of the CSV or TSV files in the DAT8 repo (using a single command). Think about how wildcard characters can help you with this task. 17 | 7. Count the approximate number of occurrences of the word "dictionary" (regardless of case) across all files in the DAT8 repo. 18 | 8. **Optional:** Use the command line to discover something "interesting" about the Chipotle data. Try using the commands from the "advanced" section! 19 | 20 | #### Solution 21 | 22 | 1. **order_id** is the unique identifier for each order. **quantity** is the number purchased of a particular item. **item_name** is the primary name for the item being purchased. **choice_description** is a list of modifiers for that item. **price** is the price for that entire line (taking **quantity** into account). A given order consists of one or more rows, depending upon the number of unique items being purchased in that order.
23 | * `head chipotle.tsv` 24 | * `tail chipotle.tsv` 25 | 2. There are 1834 orders (since 1834 is the highest **order_id** number). 26 | 3. The file has 4623 lines. 27 | * `wc -l chipotle.tsv` 28 | 4. Chicken burritos are more popular than steak burritos. 29 | * Compare `grep -i 'chicken burrito' chipotle.tsv | wc -l` with `grep -i 'steak burrito' chipotle.tsv | wc -l` 30 | * Alternatively, use the 'c' option of `grep` to skip the piping step: `grep -ic 'chicken burrito' chipotle.tsv` 31 | 5. Black beans are more popular than pinto beans (on chicken burritos). 32 | * Compare `grep -i 'chicken burrito' chipotle.tsv | grep -i 'black beans' | wc -l` with `grep -i 'chicken burrito' chipotle.tsv | grep -i 'pinto beans' | wc -l` 33 | * Alternatively, use the 'c' option of `grep` and a more complex regular expression pattern to skip the piping steps: `grep -ic 'chicken burrito.*black beans' chipotle.tsv` 34 | 6. At the moment, the CSV and TSV files in the DAT8 repo are **airlines.csv**, **chipotle.tsv**, and **sms.tsv**, all of which are in the **data** subdirectory. 35 | * Change your working directory to DAT8, and then use `find . -name *.?sv` 36 | 7. At the moment, there are 13 lines in DAT8 files that contain the word 'dictionary', which is a good approximation of the number of occurrences. 37 | * Change your working directory to DAT8, and then use `grep -ir 'dictionary' . | wc -l` 38 | * Alternatively, use the 'c' option of `grep` to skip the piping step: `grep -irc 'dictionary' .` 39 | -------------------------------------------------------------------------------- /homework/09_bias_variance.md: -------------------------------------------------------------------------------- 1 | ## Class 9 Pre-work: Bias-Variance Tradeoff 2 | 3 | Read this excellent article, [Understanding the Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html), and be prepared to **discuss it in class** on Tuesday. 4 | 5 | **Note:** You can ignore sections 4.2 and 4.3. 6 | 7 | Here are some questions to think about while you read: 8 | * In the Party Registration example, what are the features? What is the response? Is this a regression or classification problem? 9 | * The features are wealth and religiousness. The response is voter party registration. This is a classification problem. 10 | * Conceptually, how is KNN being applied to this problem to make a prediction? 11 | * Find the K most similar voters in the training data (in terms of wealth and religiousness), and use the majority party registration among those "neighbors" as the predicted party registration for the unknown individual. 12 | * How do the four visualizations in section 3 relate to one another? Change the value of K using the slider, and make sure you understand what changed in the visualizations (and why it changed). 13 | * First viz: training data colored by response value 14 | * Second viz: classification map for K=1 15 | * Third viz: out-of-sample data colored by predicted response value, and identification of the neighborhoods used to make that prediction 16 | * Fourth viz: predicted response value for each hexagon 17 | * Changing K changes the predictions in the third and fourth viz 18 | * In figures 4 and 5, what do the lighter colors versus the darker colors mean? How is the darkness calculated? 19 | * Darkness indicates confidence in the prediction, and is calculated using the proportion of nearest neighbors that have the same response value. 20 | * What does the black line in figure 5 represent? 
What predictions would the best possible machine learning model make, with respect to this line? 21 | * The black line is the underlying model that generated the training data. The best possible machine learning model would learn that line as its decision boundary. It would not be a perfect model, but it would be the best possible model. 22 | * Choose a very small value of K, and click the button "Generate New Training Data" a number of times. Do you "see" low variance or high variance, and low bias or high bias? 23 | * High variance, low bias 24 | * Repeat this with a very large value of K. Do you "see" low variance or high variance, and low bias or high bias? 25 | * Low variance, high bias 26 | * Try using other values of K. What value of K do you think is "best"? How do you define "best"? 27 | * A value of K in the middle is best. The best value is the value that results in a model whose predictions most consistently match the decision boundary. 28 | * Does a small value for K cause "overfitting" or "underfitting"? 29 | * Overfitting 30 | * Why should we care about variance at all? Shouldn't we just minimize bias and ignore variance? 31 | * If you had all of the possible data (past and future), a model with high complexity (and thus high variance) would be ideal because it would capture all of the complexity in the data and wouldn't need to generalize. But given that we only have a single sample of data, both bias and variance contribute to prediction error and should be appropriately balanced. 32 | -------------------------------------------------------------------------------- /homework/10_yelp_votes.md: -------------------------------------------------------------------------------- 1 | ## Class 10 Homework: Yelp Votes 2 | 3 | This assignment uses a small subset of the data from Kaggle's [Yelp Business Rating Prediction](https://www.kaggle.com/c/yelp-recsys-2013) competition. 4 | 5 | **Description of the data:** 6 | 7 | * `yelp.json` is the original format of the file. `yelp.csv` contains the same data, in a more convenient format. Both of the files are in this repo, so there is no need to download the data from the Kaggle website. 8 | * Each observation in this dataset is a review of a particular business by a particular user. 9 | * The "stars" column is the number of stars (1 through 5) assigned by the reviewer to the business. (Higher stars are better.) In other words, it is the rating of the business by the person who wrote the review. 10 | * The "cool" column is the number of "cool" votes this review received from other Yelp users. All reviews start with 0 "cool" votes, and there is no limit to how many "cool" votes a review can receive. In other words, it is a rating of the review itself, not a rating of the business. 11 | * The "useful" and "funny" columns are similar to the "cool" column. 12 | 13 | **Homework tasks:** 14 | 15 | 1. Read `yelp.csv` into a DataFrame. 16 | * **Bonus:** Ignore the `yelp.csv` file, and construct this DataFrame yourself from `yelp.json`. This involves reading the data into Python, decoding the JSON, converting it to a DataFrame, and adding individual columns for each of the vote types. 17 | 2. Explore the relationship between each of the vote types (cool/useful/funny) and the number of stars. 18 | 3. Define cool/useful/funny as the features, and stars as the response. 19 | 4. Fit a linear regression model and interpret the coefficients. Do the coefficients make intuitive sense to you? Explore the Yelp website to see if you detect similar trends.
20 | 5. Evaluate the model by splitting it into training and testing sets and computing the RMSE. Does the RMSE make intuitive sense to you? 21 | 6. Try removing some of the features and see if the RMSE improves. 22 | 7. **Bonus:** Think of some new features you could create from the existing data that might be predictive of the response. Figure out how to create those features in Pandas, add them to your model, and see if the RMSE improves. 23 | 8. **Bonus:** Compare your best RMSE on the testing set with the RMSE for the "null model", which is the model that ignores all features and simply predicts the mean response value in the testing set. 24 | 9. **Bonus:** Instead of treating this as a regression problem, treat it as a classification problem and see what testing accuracy you can achieve with KNN. 25 | 10. **Bonus:** Figure out how to use linear regression for classification, and compare its classification accuracy with KNN's accuracy. 26 | -------------------------------------------------------------------------------- /homework/13_cross_validation.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: Cross-validation 2 | 3 | Watch my video on [cross-validation](https://www.youtube.com/watch?v=6dbrR-WymjI) (36 minutes), and be prepared to **discuss it in class** on Tuesday. The [notebook](../notebooks/13_cross_validation.ipynb) shown in the video is also in this repository. 4 | 5 | Alternatively, read section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages). 6 | 7 | Here are some questions to think about: 8 | 9 | - What is the purpose of model evaluation? 10 | - The purpose is to estimate the likely performance of a model on out-of-sample data, so that we can choose the model that is most likely to generalize, and so that we can have an idea of how well that model will actually perform. 11 | - What is the drawback of training and testing on the same data? 12 | - Training accuracy is maximized for overly complex models which overfit the training data, and thus it's not a good measure of how well a model will generalize. 13 | - How does train/test split work, and what is its primary drawback? 14 | - It splits the data into two pieces, trains the model on the training set, and tests the model on the testing set. Testing accuracy can change a lot depending upon which observations happen to be in the training and testing sets. 15 | - How does K-fold cross-validation work, and what is the role of "K"? 16 | - First, it splits the data into K equal folds. Then, it trains the model on folds 2 through K, tests the model on fold 1, and calculates the requested evaluation metric. Then, it repeats that process K-1 more times, until every fold has been the testing set exactly once. 17 | - Why do we pass X and y, not X_train and y_train, to the `cross_val_score` function? 18 | - It will take care of splitting the data into the K folds, so we don't need to split it ourselves. 19 | - Why does `cross_val_score` need a "scoring" parameter? 20 | - It needs to know what evaluation metric to calculate, since many different metrics are available. 21 | - What does `cross_val_score` return, and what do we usually do with that object? 22 | - It returns a NumPy array containing the K scores. We usually calculate the mean score, though we might also be interested in the standard deviation. 23 | - Under what circumstances does `cross_val_score` return negative scores? 
24 | - The scores will be negative if the evaluation metric is a loss function (something you want to minimize) rather than a reward function (something you want to maximize). 25 | - When should you use train/test split, and when should you use cross-validation? 26 | - Train/test split is useful when you want to inspect your testing results (via confusion matrix or ROC curve) and when evaluation speed is a concern. Cross-validation is useful when you are most concerned with the accuracy of your estimation. 27 | -------------------------------------------------------------------------------- /homework/13_roc_auc.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: ROC Curves and AUC 2 | 3 | First, read these [lesson notes](http://ebp.uga.edu/courses/Chapter%204%20-%20Diagnosis%20I/8%20-%20ROC%20curves.html) from a university course for an excellent overview of ROC curves. 4 | 5 | Then, watch my video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Tuesday. (Feel free to play with the [visualization](http://www.navan.name/roc/) shown in the video, or view the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/).) 6 | 7 | **Optional:** If you would like to go even deeper, [An introduction to ROC analysis](http://people.inf.elte.hu/kiss/13dwhdm/roc.pdf) is a very readable paper on the topic. 8 | 9 | Here are some questions to think about: 10 | 11 | - What is the difference between the predict and predict_proba methods in scikit-learn? 12 | - The former outputs class predictions, and the latter outputs predicted probabilities of class membership. 13 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions? 14 | - Set a threshold, and classify everything above the threshold as a 1 and everything below the threshold as a 0. 15 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve? 16 | - Because an ROC curve is measuring the performance of a classifier at all possible thresholds, and thresholds only make sense in the context of predicted probabilities. 17 | - Could you use an ROC curve for a regression problem? Why or why not? 18 | - No, because ROC is a plot of TPR vs FPR, and those concepts have no meaning in a regression problem. 19 | - What's another term for True Positive Rate? 20 | - Sensitivity or recall. 21 | - If I wanted to increase specificity, how would I change the classification threshold? 22 | - Increase it. 23 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not? 24 | - No, because increasing either of those requires moving the threshold in opposite directions. 25 | - What are the primary benefits of ROC curves over classification accuracy? 26 | - Doesn't require setting a classification threshold, allows you to visualize the performance of your classifier, works well for unbalanced classes. 27 | - What should you do if your AUC is 0.2? 28 | - Reverse your predictions so that your AUC is 0.8. 29 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? 
(0 = not fraudulent, 1 = fraudulent) 30 | - Blues would be significantly larger, lots of overlap between blues and reds. 31 | - What's a real-world scenario in which you would prefer high specificity (rather than high sensitivity) for your classifier? 32 | - Speed cameras issuing speeding tickets. 33 | -------------------------------------------------------------------------------- /homework/14_spam_filtering.md: -------------------------------------------------------------------------------- 1 | ## Class 14 Pre-work: Spam Filtering 2 | 3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html). 4 | 5 | Here are some questions to think about: 6 | 7 | - Should a spam filter optimize for sensitivity or specificity, in Paul's opinion? 8 | - Specificity, in order to minimize false positives (non-spam being incorrectly marked as spam). 9 | - Before he tried the "statistical approach" to spam filtering, what was his approach? 10 | - He hand-engineered features and used those features to compute a score. 11 | - What are the key components of his statistical filtering system? In other words, how does it work? 12 | - Scan the entire text (including headers) and tokenize it. 13 | - Count the number of occurrences of each token in the ham corpus and the spam corpus (separately). 14 | - Assign each token a "spam score" based on its relative frequency in the corpora. 15 | - For new email, only take into account the 15 most "interesting" tokens. 16 | - What did Paul say were some of the benefits of the statistical approach? 17 | - It works better (almost no false positives). 18 | - It requires less work because it discovers features automatically. 19 | - The "spam score" is interpretable. 20 | - It can easily be tuned to the individual user. 21 | - It evolves with the spam. 22 | - It creates an implicit whitelist/blacklist of email addresses, server names, etc. 23 | - How good was his prediction of the "spam of the future"? 24 | - Great! 25 | -------------------------------------------------------------------------------- /homework/14_yelp_review_text.md: -------------------------------------------------------------------------------- 1 | ## Class 14 Homework: Yelp Review Text 2 | 3 | This assignment uses the same data as the [class 10 homework](10_yelp_votes.md). This time, we will attempt to classify reviews as either 5-star or 1-star using only the review text! 4 | 5 | After each task, I recommend that you check the **shape** and the **contents** of your objects, to confirm that they match your expectations. 6 | 7 | **Homework tasks:** 8 | 9 | 1. Read `yelp.csv` into a DataFrame. 10 | 2. Create a new DataFrame that only contains the 5-star and 1-star reviews. 11 | 3. Split the new DataFrame into training and testing sets, using the review text as the only feature and the star rating as the response. 12 | 4. Use CountVectorizer to create document-term matrices from X_train and X_test. 13 | - **Hint:** If you run into a decoding error, instantiate the vectorizer with the argument `decode_error='ignore'`. 14 | 5. Use Naive Bayes to predict the star rating for reviews in the testing set, and calculate the accuracy. 15 | 6. Calculate the AUC. 16 | - **Hint 1:** Make sure to pass the predicted probabilities to `roc_auc_score`, not the predicted classes. 17 | - **Hint 2:** `roc_auc_score` will get confused if y_test contains fives and ones, so you will need to create a new object that contains ones and zeros instead. 18 | 7. Plot the ROC curve. 19 | 8. 
Print the confusion matrix, and calculate the sensitivity and specificity. Comment on the results. 20 | 9. Browse through the review text for some of the false positives and false negatives. Based on your knowledge of how Naive Bayes works, do you have any theories about why the model is incorrectly classifying these reviews? 21 | 10. Let's pretend that you want to balance sensitivity and specificity. You can achieve this by changing the threshold for predicting a 5-star review. What threshold approximately balances sensitivity and specificity? 22 | 11. Let's see how well Naive Bayes performs when all reviews are included, rather than just 1-star and 5-star reviews: 23 | - Define X and y using the original DataFrame from step 1. (y should contain 5 different classes.) 24 | - Split the data into training and testing sets. 25 | - Calculate the testing accuracy of a Naive Bayes model. 26 | - Compare the testing accuracy with the null accuracy. 27 | - Print the confusion matrix. 28 | - Comment on the results. 29 | -------------------------------------------------------------------------------- /notebooks/12_e_log_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exponential functions and logarithms" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import math\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Exponential functions" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "What is **e**? 
It is simply a number (known as Euler's number):" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "2.718281828459045" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "math.e" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "**e** is a significant number, because it is the base rate of growth shared by all continually growing processes.\n", 63 | "\n", 64 | "For example, if I have **10 dollars**, and it grows 100% in 1 year (compounding continuously), I end up with **10\\*e^1 dollars**:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "27.18281828459045" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# 100% growth for 1 year\n", 87 | "10 * np.exp(1)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "73.890560989306508" 101 | ] 102 | }, 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "# 100% growth for 2 years\n", 110 | "10 * np.exp(2)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Side note: When e is raised to a power, it is known as **the exponential function**. Technically, any number can be the base, and it would still be known as **an exponential function** (such as 2^5). But in our context, the base of the exponential function is assumed to be e.\n", 118 | "\n", 119 | "Anyway, what if I only have 20% growth instead of 100% growth?" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "12.214027581601698" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# 20% growth for 1 year\n", 142 | "10 * np.exp(0.20)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "14.918246976412703" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "# 20% growth for 2 years\n", 165 | "10 * np.exp(0.20 * 2)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## Logarithms" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "What is the **(natural) logarithm**? It gives you the time needed to reach a certain level of growth. 
For example, if I want growth by a factor of 2.718, it will take me 1 unit of time (assuming a 100% growth rate):" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "0.99989631572895199" 193 | ] 194 | }, 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# time needed to grow 1 unit to 2.718 units\n", 202 | "np.log(2.718)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "If I want growth by a factor of 7.389, it will take me 2 units of time:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 8, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "1.9999924078065106" 223 | ] 224 | }, 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "# time needed to grow 1 unit to 7.389 units\n", 232 | "np.log(7.389)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "If I want growth by a factor of 1, it will take me 0 units of time:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 9, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "0.0" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "# time needed to grow 1 unit to 1 unit\n", 262 | "np.log(1)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "If I want growth by a factor of 0.5, it will take me -0.693 units of time (which is like looking back in time):" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 10, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "-0.69314718055994529" 283 | ] 284 | }, 285 | "execution_count": 10, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "# time needed to grow 1 unit to 0.5 units\n", 292 | "np.log(0.5)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Connecting the concepts" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "As you can see, the exponential function and the natural logarithm are **inverses** of one another:" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 11, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "5.0" 320 | ] 321 | }, 322 | "execution_count": 11, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "np.log(np.exp(5))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 12, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "4.9999999999999991" 342 | ] 343 | }, 344 | "execution_count": 12, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "np.exp(np.log(5))" 351 | ] 352 | } 
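Taken together, the compound-growth and inverse-function examples above fit in a short standalone script (a minimal sketch that assumes only math and numpy, mirroring the values shown in the cell outputs):

import math
import numpy as np

math.e                      # Euler's number: 2.718281828...

# continuous compounding: principal * e^(rate * time)
10 * np.exp(1)              # 100% growth for 1 year: about 27.18
10 * np.exp(0.20 * 2)       # 20% growth for 2 years: about 14.92

# the natural log answers the reverse question:
# how much time is needed to reach a given growth factor (at a 100% rate)?
np.log(2.718)               # about 1 unit of time
np.log(7.389)               # about 2 units of time
np.log(0.5)                 # about -0.693 units of time (looking back in time)

# the exponential function and the natural log are inverses
np.log(np.exp(5))           # 5.0
np.exp(np.log(5))           # 5.0 (up to floating-point error)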
353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 2", 357 | "language": "python", 358 | "name": "python2" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 2 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython2", 370 | "version": "2.7.6" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 0 375 | } 376 | -------------------------------------------------------------------------------- /notebooks/14_naive_bayes_spam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Applying Naive Bayes classification to spam filtering" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let's pretend we have an email with three words: \"Send money now.\" We'll use Naive Bayes to classify it as **ham or spam.**\n", 15 | "\n", 16 | "$$P(spam \\ | \\ \\text{send money now}) = \\frac {P(\\text{send money now} \\ | \\ spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 17 | "\n", 18 | "By assuming that the features (the words) are **conditionally independent**, we can simplify the likelihood function:\n", 19 | "\n", 20 | "$$P(spam \\ | \\ \\text{send money now}) \\approx \\frac {P(\\text{send} \\ | \\ spam) \\times P(\\text{money} \\ | \\ spam) \\times P(\\text{now} \\ | \\ spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 21 | "\n", 22 | "We can calculate all of the values in the numerator by examining a corpus of **spam email**:\n", 23 | "\n", 24 | "$$P(spam \\ | \\ \\text{send money now}) \\approx \\frac {0.2 \\times 0.1 \\times 0.1 \\times 0.9} {P(\\text{send money now})} = \\frac {0.0018} {P(\\text{send money now})}$$\n", 25 | "\n", 26 | "We would repeat this process with a corpus of **ham email**:\n", 27 | "\n", 28 | "$$P(ham \\ | \\ \\text{send money now}) \\approx \\frac {0.05 \\times 0.01 \\times 0.1 \\times 0.1} {P(\\text{send money now})} = \\frac {0.000005} {P(\\text{send money now})}$$\n", 29 | "\n", 30 | "All we care about is whether spam or ham has the **higher probability**, and so we predict that the email is **spam**." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Key takeaways\n", 38 | "\n", 39 | "- The **\"naive\" assumption** of Naive Bayes (that the features are conditionally independent) is critical to making these calculations simple.\n", 40 | "- The **normalization constant** (the denominator) can be ignored since it's the same for all classes.\n", 41 | "- The **prior probability** is much less relevant once you have a lot of features." 
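The toy calculation above is easy to verify with a few lines of Python (a sketch that reuses the made-up likelihoods and class priors from the example; the denominator is dropped because it is identical for both classes):

# hypothetical likelihoods "estimated" from the spam and ham corpora
p_word_given_spam = {'send': 0.2, 'money': 0.1, 'now': 0.1}
p_word_given_ham = {'send': 0.05, 'money': 0.01, 'now': 0.1}
p_spam = 0.9        # prior probability of spam
p_ham = 0.1         # prior probability of ham

# numerator of Bayes' theorem for each class
email = ['send', 'money', 'now']
spam_score = p_spam
ham_score = p_ham
for word in email:
    spam_score = spam_score * p_word_given_spam[word]
    ham_score = ham_score * p_word_given_ham[word]

spam_score                  # 0.0018
ham_score                   # 0.000005
spam_score > ham_score      # True, so predict spam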
42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Comparing Naive Bayes with other models\n", 49 | "\n", 50 | "Advantages of Naive Bayes:\n", 51 | "\n", 52 | "- Model training and prediction are very fast\n", 53 | "- Somewhat interpretable\n", 54 | "- No tuning is required\n", 55 | "- Features don't need scaling\n", 56 | "- Insensitive to irrelevant features (with enough observations)\n", 57 | "- Performs better than logistic regression when the training set is very small\n", 58 | "\n", 59 | "Disadvantages of Naive Bayes:\n", 60 | "\n", 61 | "- Predicted probabilities are not well-calibrated\n", 62 | "- Correlated features can be problematic (due to the independence assumption)\n", 63 | "- Can't handle negative features (with Multinomial Naive Bayes)\n", 64 | "- Has a higher \"asymptotic error\" than logistic regression" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 2", 71 | "language": "python", 72 | "name": "python2" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 2 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython2", 84 | "version": "2.7.6" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 0 89 | } 90 | -------------------------------------------------------------------------------- /notebooks/14_types_of_naive_bayes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Comparing Multinomial and Gaussian Naive Bayes\n", 8 | "\n", 9 | "scikit-learn documentation: [MultinomialNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html) and [GaussianNB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)\n", 10 | "\n", 11 | "Dataset: [Pima Indians Diabetes](https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes) from the UCI Machine Learning Repository" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "# read the data\n", 23 | "import pandas as pd\n", 24 | "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'\n", 25 | "col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']\n", 26 | "pima = pd.read_csv(url, header=None, names=col_names)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
   pregnant  glucose  bp  skin  insulin   bmi  pedigree  age  label
0         6      148  72    35        0  33.6     0.627   50      1
1         1       85  66    29        0  26.6     0.351   31      0
2         8      183  64     0        0  23.3     0.672   32      1
3         1       89  66    23       94  28.1     0.167   21      0
4         0      137  40    35      168  43.1     2.288   33      1
" 119 | ], 120 | "text/plain": [ 121 | " pregnant glucose bp skin insulin bmi pedigree age label\n", 122 | "0 6 148 72 35 0 33.6 0.627 50 1\n", 123 | "1 1 85 66 29 0 26.6 0.351 31 0\n", 124 | "2 8 183 64 0 0 23.3 0.672 32 1\n", 125 | "3 1 89 66 23 94 28.1 0.167 21 0\n", 126 | "4 0 137 40 35 168 43.1 2.288 33 1" 127 | ] 128 | }, 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "# notice that all features are continuous\n", 136 | "pima.head()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 3, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "# create X and y\n", 148 | "X = pima.drop('label', axis=1)\n", 149 | "y = pima.label" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 4, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# split into training and testing sets\n", 161 | "from sklearn.cross_validation import train_test_split\n", 162 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 5, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "# import both Multinomial and Gaussian Naive Bayes\n", 174 | "from sklearn.naive_bayes import MultinomialNB, GaussianNB\n", 175 | "from sklearn import metrics" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "0.541666666667\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# testing accuracy of Multinomial Naive Bayes\n", 195 | "mnb = MultinomialNB()\n", 196 | "mnb.fit(X_train, y_train)\n", 197 | "y_pred_class = mnb.predict(X_test)\n", 198 | "print metrics.accuracy_score(y_test, y_pred_class)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "0.791666666667\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# testing accuracy of Gaussian Naive Bayes\n", 218 | "gnb = GaussianNB()\n", 219 | "gnb.fit(X_train, y_train)\n", 220 | "y_pred_class = gnb.predict(X_test)\n", 221 | "print metrics.accuracy_score(y_test, y_pred_class)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "**Conclusion:** When applying Naive Bayes classification to a dataset with **continuous features**, it is better to use Gaussian Naive Bayes than Multinomial Naive Bayes. The latter is suitable for datasets containing **discrete features** (e.g., word counts).\n", 229 | "\n", 230 | "Wikipedia has a short [description](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes) of Gaussian Naive Bayes, as well as an excellent [example](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification) of its usage." 
231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 2", 237 | "language": "python", 238 | "name": "python2" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 2 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython2", 250 | "version": "2.7.6" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 0 255 | } 256 | -------------------------------------------------------------------------------- /notebooks/images/bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/bias_variance.png -------------------------------------------------------------------------------- /notebooks/images/cross_validation_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/cross_validation_diagram.png -------------------------------------------------------------------------------- /notebooks/images/crowdflower_ensembling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/crowdflower_ensembling.jpg -------------------------------------------------------------------------------- /notebooks/images/driver_ensembling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/driver_ensembling.png -------------------------------------------------------------------------------- /notebooks/images/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/estimating_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/iris_01nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_01nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_05nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_05nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_15nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_15nn_map.png -------------------------------------------------------------------------------- /notebooks/images/iris_50nn_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/iris_50nn_map.png 
-------------------------------------------------------------------------------- /notebooks/images/lasso_ridge_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/lasso_ridge_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/lasso_ridge_path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/lasso_ridge_path.png -------------------------------------------------------------------------------- /notebooks/images/logistic_betas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/logistic_betas.png -------------------------------------------------------------------------------- /notebooks/images/obama_clinton_tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/obama_clinton_tree.jpg -------------------------------------------------------------------------------- /notebooks/images/polynomial_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/polynomial_overfitting.png -------------------------------------------------------------------------------- /notebooks/images/salary_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_color.png -------------------------------------------------------------------------------- /notebooks/images/salary_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_regions.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree_annotated.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree_deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/salary_tree_deep.png -------------------------------------------------------------------------------- /notebooks/images/supervised_learning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/supervised_learning.png -------------------------------------------------------------------------------- /notebooks/images/train_test_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/train_test_split.png -------------------------------------------------------------------------------- /notebooks/images/training_testing_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/training_testing_error.png -------------------------------------------------------------------------------- /notebooks/images/tree_bikeshare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_bikeshare.png -------------------------------------------------------------------------------- /notebooks/images/tree_titanic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_titanic.png -------------------------------------------------------------------------------- /notebooks/images/tree_vehicles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_vehicles.png -------------------------------------------------------------------------------- /notebooks/images/tree_vs_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/notebooks/images/tree_vs_linear.png -------------------------------------------------------------------------------- /other/02_exercise_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/other/02_exercise_output.png -------------------------------------------------------------------------------- /other/02_file_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/other/02_file_tree.png -------------------------------------------------------------------------------- /other/advice.md: -------------------------------------------------------------------------------- 1 | ## What's Next? 2 | 3 | Here is my best advice for **getting better at data science**: Find "the thing" that motivates you to practice what you learned and to learn more, and then do that thing. That could be personal data science projects, Kaggle competitions, online courses, reading books, reading blogs, attending meetups or conferences, or something else. 4 | 5 | If you create your own **data science projects**, I'd encourage you to share them on GitHub and include writeups. 
That will help to show others that you know how to do [proper data science](http://simplystatistics.org/2015/03/17/data-science-done-well-looks-easy-and-that-is-a-big-problem-for-data-scientists/). 6 | 7 | **Kaggle competitions** are a great way to practice data science without coming up with the problem yourself. Don't worry about how high you place, just focus on learning something new with every competition. Spend as much time as possible reading the forums, because you'll learn a lot, but don't spend time in the forums at the expense of working on the competition yourself. Also, keep in mind that you won't be practicing important parts of the data science workflow, namely generating questions, gathering data, and communicating results. 8 | 9 | There are many **online courses** to consider, and new ones being created all the time: 10 | 11 | * Coursera's [Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1) is 9 courses, plus a capstone project. There is a lot of overlap with General Assembly's course, and course quality varies, but you would definitely learn a lot of R. 12 | * Coursera's [Machine Learning](https://www.coursera.org/learn/machine-learning/) is Andrew Ng's highly regarded course. It goes deeper into many topics we covered, and covers many topics we didn't. Keep in mind that it focuses only on machine learning (not the entire data science workflow), the programming assignments use MATLAB/Octave, and it requires some understanding of linear algebra. Browse these [lecture notes](http://www.holehouse.org/mlclass/) (compiled by a student) for a preview of the course. 13 | * Stanford's [Statistical Learning](http://online.stanford.edu/course/statistical-learning) also covers some topics that we did not. It focuses on teaching machine learning at a conceptual (rather than mathematical) level, when possible. The course may be offered again in 2016, but the real gem from the course is the book and videos (linked below). 14 | * Caltech's [Learning from Data](http://work.caltech.edu/telecourse.html) teaches machine learning at a theoretical and conceptual level. The lectures and slides are excellent. The homework assignments are not interactive, and the course does not use a specific programming language. 15 | * Udacity's [Data Analyst Nanodegree](https://www.udacity.com/course/data-analyst-nanodegree--nd002) looks promising, but I don't know anyone who has done it. 16 | * Thinkful's [Data Science in Python](https://www.thinkful.com/courses/learn-data-science-online/) course or SlideRule's [Data Science Intensive](https://www.mysliderule.com/workshops/data-science-intensive) may be a good way to practice our course material with guidance from an expert mentor. 17 | * [Dataquest](https://www.dataquest.io) is an online platform rather than a traditional course, and allows you to learn and practice data science through interactive exercises. Not all of the lessons are free, but new lessons are frequently being added. 18 | * edX's [Introduction to Computer Science and Programming Using Python](https://www.edx.org/course/introduction-computer-science-mitx-6-00-1x7) is apparently an excellent course if you want to get better at programming in Python. 19 | * Coursera recently added many other data science-related [specializations and courses](https://www.coursera.org/browse/data-science?languages=en), most of which I am not familiar with. However, [CourseTalk](https://www.coursetalk.com/) is useful for reading reviews of online courses. 
20 | * Some additional courses are listed in the [Additional Resources](../README.md#additional-resources-1) section of the main README. 21 | * I will also be teaching [my own online courses](http://www.dataschool.io/learn/), which will range in level from beginner to advanced. (Subscribe to my [email newsletter](http://www.dataschool.io/subscribe/) to be notified when courses are announced.) 22 | 23 | Here is just a tiny selection of **books**: 24 | * [An Introduction to Statistical Learning with Applications in R](http://www-bcf.usc.edu/~gareth/ISL/) is my favorite book on machine learning because of the thoughtful way in which the material is presented. The Statistical Learning course linked above uses it as the course textbook, and the [related videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/) are available on YouTube. 25 | * [Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/) is by the same authors. It covers a wider variety of topics, and in greater mathematical depth. 26 | * [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do) was written by the creator of Pandas, and is especially useful if you want to go deeper into Pandas and NumPy. 27 | * [Python Machine Learning](https://github.com/rasbt/python-machine-learning-book) came out in September 2015. The author, [Sebastian Raschka](http://sebastianraschka.com/articles.html), is an excellent writer and has a deep understanding of both machine learning and scikit-learn, so I suspect it is worth reading. 28 | 29 | There are an overwhelming number of data science **blogs and newsletters**. If you want to read just one site, [DataTau](http://www.datatau.com/news) is the best aggregator. [Data Elixir](http://dataelixir.com/) is the best newsletter, though the [O'Reilly Data Newsletter](http://www.oreilly.com/data/newsletter.html) and [Python Weekly](http://www.pythonweekly.com/) are also good. Other notable blogs include: [no free hunch](http://blog.kaggle.com/) (Kaggle's blog), [The Yhat blog](http://blog.yhathq.com/) (lots of Python and R content), [Practical Business Python](http://pbpython.com/) (accessible Python content), [Simply Statistics](http://simplystatistics.org/) (a bit more academic), [FastML](http://fastml.com/) (machine learning content), [Win-Vector blog](http://www.win-vector.com/blog/) (great data science advice), [FiveThirtyEight](http://fivethirtyeight.com/) (data journalism), and [Data School](http://www.dataschool.io/) (my blog). 30 | 31 | If you prefer **podcasts**, I don't have any personal recommendations, though this [list](https://blog.growth.supply/the-7-best-data-science-and-machine-learning-podcasts-e8f0d5a4a419) gives a nice summary of seven data science podcasts that the author recommends. 32 | 33 | There are tons of data-related **meetups in DC**, and most of them are organized by Data Community DC. Check out the [calendar](http://www.datacommunitydc.org/calendar/) or just subscribe to their [weekly newsletter](http://www.datacommunitydc.org/newsletter/). [District Data Labs](http://www.districtdatalabs.com/) also offers data science workshops and project opportunities in DC. 34 | 35 | Some notable data science **conferences** are [KDD](http://www.kdd.org/), [Strata](http://strataconf.com/), [PyCon](https://us.pycon.org), [PyData](http://pydata.org/), and [SciPy](http://scipy.org/). 
36 | 37 | If you want to go **full-time** with your data science education, read this [guide to data science bootcamps](http://yet-another-data-blog.blogspot.com/2014/04/data-science-bootcamp-landscape-full.html), and this [other guide](http://www.skilledup.com/articles/list-data-science-bootcamps) which also includes part-time and online programs. Or, check out this massive list of [colleges and universities](http://datascience.community/colleges) with data science-related degrees. 38 | 39 | Finally, Dataquest's blog post on [How to actually learn data science](https://www.dataquest.io/blog/how-to-actually-learn-data-science/) has some additional advice that may be useful to you. 40 | -------------------------------------------------------------------------------- /other/model_comparison.md: -------------------------------------------------------------------------------- 1 | # Comparison of Machine Learning Models ([table](http://www.dataschool.io/comparing-supervised-learning-algorithms/)) 2 | 3 | ## K-nearest neighbors (KNN) 4 | 5 | **Advantages:** 6 | 7 | - Simple to understand and explain 8 | - Model training is fast 9 | - Can be used for classification and regression 10 | 11 | **Disadvantages:** 12 | 13 | - Must store all of the training data 14 | - Prediction phase can be slow when n is large 15 | - Sensitive to irrelevant features 16 | - Sensitive to the scale of the data 17 | - Accuracy is (generally) not competitive with the best supervised learning methods 18 | 19 | ## Linear Regression 20 | 21 | **Advantages:** 22 | 23 | - Simple to explain 24 | - Highly interpretable 25 | - Model training and prediction are fast 26 | - No tuning is required (excluding regularization) 27 | - Features don't need scaling 28 | - Can perform well with a small number of observations 29 | - Well-understood 30 | 31 | **Disadvantages:** 32 | 33 | - Presumes a linear relationship between the features and the response 34 | - Performance is (generally) not competitive with the best supervised learning methods due to high bias 35 | - Can't automatically learn feature interactions 36 | 37 | ## Logistic Regression 38 | 39 | **Advantages:** 40 | 41 | - Highly interpretable (if you remember how) 42 | - Model training and prediction are fast 43 | - No tuning is required (excluding regularization) 44 | - Features don't need scaling 45 | - Can perform well with a small number of observations 46 | - Outputs well-calibrated predicted probabilities 47 | 48 | **Disadvantages:** 49 | 50 | - Presumes a linear relationship between the features and the log-odds of the response 51 | - Performance is (generally) not competitive with the best supervised learning methods 52 | - Can't automatically learn feature interactions 53 | 54 | ## Naive Bayes 55 | 56 | **Advantages:** 57 | 58 | - Model training and prediction are very fast 59 | - Somewhat interpretable 60 | - No tuning is required 61 | - Features don't need scaling 62 | - Insensitive to irrelevant features (with enough observations) 63 | - Performs better than logistic regression when the training set is very small 64 | 65 | **Disadvantages:** 66 | 67 | - Predicted probabilities are not well-calibrated 68 | - Correlated features can be problematic (due to the independence assumption) 69 | - Can't handle negative features (with Multinomial Naive Bayes) 70 | - Has a higher "asymptotic error" than logistic regression 71 | 72 | ## Decision Trees 73 | 74 | **Advantages:** 75 | 76 | - Can be used for regression or classification 77 | - Can be displayed graphically 78 | - 
Highly interpretable 79 | - Can be specified as a series of rules, and more closely approximate human decision-making than other models 80 | - Prediction is fast 81 | - Features don't need scaling 82 | - Automatically learns feature interactions 83 | - Tends to ignore irrelevant features 84 | - Non-parametric (will outperform linear models if relationship between features and response is highly non-linear) 85 | 86 | **Disadvantages:** 87 | 88 | - Performance is (generally) not competitive with the best supervised learning methods 89 | - Can easily overfit the training data (tuning is required) 90 | - Small variations in the data can result in a completely different tree (high variance) 91 | - Recursive binary splitting makes "locally optimal" decisions that may not result in a globally optimal tree 92 | - Doesn't tend to work well if the classes are highly unbalanced 93 | - Doesn't tend to work well with very small datasets 94 | 95 | ## Random Forests 96 | 97 | **Advantages (compared to decision trees):** 98 | 99 | - Performance is competitive with the best supervised learning methods 100 | - Provides a more reliable estimate of feature importance 101 | - Allows you to estimate out-of-sample error without using train/test split or cross-validation 102 | 103 | **Disadvantages (compared to decision trees):** 104 | 105 | - Less interpretable 106 | - Slower to train 107 | - Slower to predict 108 | 109 | ## Regularized Linear Models 110 | 111 | **Advantages (compared to unregularized linear models):** 112 | 113 | - Better performance 114 | - L1 regularization performs automatic feature selection 115 | - Useful for high-dimensional problems (p > n) 116 | 117 | **Disadvantages (compared to unregularized linear models):** 118 | 119 | - Tuning is required 120 | - Feature scaling is recommended 121 | - Less interpretable (due to feature scaling) 122 | -------------------------------------------------------------------------------- /other/model_evaluation_comparison.md: -------------------------------------------------------------------------------- 1 | ## Comparing Model Evaluation Procedures 2 | 3 | **Training and testing on the same data** 4 | 5 | - Goal is to estimate likely performance of a model on out-of-sample data 6 | - But, maximizing training performance rewards overly complex models that won't necessarily generalize 7 | - Unnecessarily complex models overfit the training data: 8 | - Will do well when tested using the in-sample data 9 | - May do poorly on out-of-sample data 10 | - Learns the "noise" in the data rather than the "signal" 11 | 12 | **Train/test split** 13 | 14 | - Split the dataset into two pieces, so that the model can be trained and tested on different data 15 | - Testing performance is a better estimate of out-of-sample performance (compared to training performance) 16 | - But, it provides a high variance estimate since changing which observations happen to be in the testing set can significantly change testing performance 17 | - Allows you to easily inspect your testing results (via confusion matrix or ROC curve) 18 | 19 | **K-fold cross-validation** 20 | 21 | - Systematically create "K" train/test splits and average the results together 22 | - Cross-validated performance is a more reliable estimate of out-of-sample performance (compared to testing performance) 23 | - Runs "K" times slower than train/test split 24 | 25 | ## Comparing Evaluation Metrics for Classification Problems 26 | 27 | **Classification accuracy/error** 28 | 29 | - Classification accuracy is the 
percentage of correct predictions (higher is better) 30 | - Classification error is the percentage of incorrect predictions (lower is better) 31 | - Easiest classification metric to understand 32 | 33 | **Confusion matrix** 34 | 35 | - Confusion matrix gives you a better understanding of how your classifier is performing 36 | - Allows you to calculate sensitivity, specificity, and many other metrics that might match your business objective better than accuracy 37 | 38 | **ROC curves and Area Under the Curve (AUC)** 39 | 40 | - Allows you to visualize the performance of your classifier across all possible classification thresholds, thus helping you to choose a threshold that appropriately balances sensitivity and specificity 41 | - Still useful when there is high class imbalance (unlike classification accuracy/error) 42 | - Harder to use when there are more than two response classes 43 | 44 | **Log loss** 45 | 46 | - Most useful when well-calibrated predicted probabilities are important to your business objective 47 | 48 | ## Comparing Evaluation Metrics for Regression Problems 49 | 50 | **Mean Absolute Error (MAE)** 51 | 52 | - Mean of the absolute value of the errors 53 | - Easiest regression metric to understand 54 | 55 | **Mean Squared Error (MSE)** 56 | 57 | - Mean of the squared errors 58 | - More popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world 59 | 60 | **Root Mean Squared Error (RMSE)** 61 | 62 | - Square root of the mean of the squared errors 63 | - Even more popular than MSE, because RMSE is interpretable in the "y" units 64 | -------------------------------------------------------------------------------- /other/python_packages.md: -------------------------------------------------------------------------------- 1 | ## List of Python packages used in the course 2 | 3 | Note: Some of these packages have dependencies that will also need to be installed. 4 | 5 | ### Included with Anaconda ([complete list](http://docs.continuum.io/anaconda/pkg-docs.html)) 6 | * beautiful-soup 7 | * ipython 8 | * ipython-notebook 9 | * matplotlib 10 | * nltk 11 | * numpy 12 | * pandas 13 | * pip 14 | * requests 15 | * scikit-learn 16 | * scipy 17 | 18 | ### Available for installation via `conda` 19 | * seaborn 20 | 21 | ### Available for installation via `pip` 22 | * textblob 23 | -------------------------------------------------------------------------------- /other/setup_checklist.md: -------------------------------------------------------------------------------- 1 | ## Setup checklist 2 | 3 | This is a checklist to confirm that your laptop is set up properly for DAT8. If at any point you get an error message, please note the error message and we will help you to fix it! If you don't get any error messages, you are properly set up. 4 | 5 | ### GitHub 6 | * Log into your GitHub account, and "star" the DAT8 repository (the one you are looking at right now) by clicking the Star button in the upper right corner of the screen. 7 | 8 | ### Git 9 | * Open a command line application: 10 | * For Windows, we recommend [Git Bash](http://git-scm.com/download/win) instead of Git Shell (which uses Powershell). 11 | * For Mac, you will probably be using Terminal, or another command line tool of your choice. 
12 | * Type `git config --global user.name "YourFirstName YourLastName"` (including the quotes) 13 | * Type `git config --global user.email "youremail@domain.com"` (use the email address associated with your GitHub account) 14 | 15 | ### Python 16 | * While still at the command line: 17 | * Type `conda list` (if you choose not to use Anaconda, this will generate an error) 18 | * Type `pip install textblob` 19 | * Type `python` to open the Python interpreter 20 | * While in the Python interpreter: 21 | * Look at the Python version number. It should start with 2.7. If your version number starts with 3, that's fine as long as you are aware of the differences between Python 2 and 3. 22 | * Type `import pandas` 23 | * Type `import textblob` 24 | * Type `exit()` to exit the interpreter. You can now close the command line application. 25 | * Open Spyder (if you can't find Spyder, look for the Anaconda Launcher application) 26 | * In the console (probably on the right side of the screen), type `import pandas` 27 | * Type `import textblob` 28 | * If this worked from the interpreter but not in Spyder, and you're using a Mac, try opening the PYTHONPATH manager (in Spyder) and adding a path to where textblob was installed (such as `/Users/yourname/anaconda/lib/python2.7/site-packages/`). Then, restart Spyder. 29 | -------------------------------------------------------------------------------- /project/peer_review.md: -------------------------------------------------------------------------------- 1 | ## Peer Review Guidelines 2 | 3 | You will be assigned to review the project drafts of two of your peers, and will provide them with feedback via a private group in Slack that we will set up for you. 4 | 5 | Expectations: 6 | * Read everything they wrote! 7 | * If they provided their data, review it and try to understand it. 8 | * Read their code and try to understand their thought process. 9 | * If their code can be run, try running it. 10 | * Spend at least one hour reviewing their project (including the time it takes to write the feedback). 11 | 12 | Your feedback would ideally consist of: 13 | * Strengths of their project (things you particularly like about it) 14 | * Comments about things you think could be improved 15 | * Questions about things you don't understand 16 | * Comments about their code 17 | * Links to resources or code snippets that might be useful to them 18 | * Suggestions for next steps 19 | * Guiding principle: Give feedback that would be helpful to you if it was your project! 20 | 21 | You should take a quick glance through their project as soon as possible, to make sure you understand what they have given you and what files you should be reviewing. If you're unclear, ask them about it! 
22 | -------------------------------------------------------------------------------- /project/public_data.md: -------------------------------------------------------------------------------- 1 | ## Public Data Sources 2 | 3 | * Open data catalogs from various governments and NGOs: 4 | * [NYC Open Data](https://nycopendata.socrata.com/) 5 | * [DC Open Data Catalog](http://data.dc.gov/) / [OpenDataDC](http://www.opendatadc.org/) 6 | * [DataLA](https://data.lacity.org/) 7 | * [data.gov](https://www.data.gov/) (see also: [Project Open Data Dashboard](http://data.civicagency.org/)) 8 | * [data.gov.uk](http://data.gov.uk/) 9 | * [US Census Bureau](http://www.census.gov/) 10 | * [World Bank Open Data](http://data.worldbank.org/) 11 | * [Humanitarian Data Exchange](http://docs.hdx.rwlabs.org/) 12 | * [Sunlight Foundation](http://sunlightfoundation.com/api/): government-focused data 13 | * [ProPublica Data Store](https://projects.propublica.org/data-store/) 14 | * Datasets hosted by academic institutions: 15 | * [UC Irvine Machine Learning Repository](http://archive.ics.uci.edu/ml/): datasets specifically designed for machine learning 16 | * [Stanford Large Network Dataset Collection](http://snap.stanford.edu/data/): graph data 17 | * [Inter-university Consortium for Political and Social Research](http://www.icpsr.umich.edu/) 18 | * [Pittsburgh Science of Learning Center's DataShop](http://www.learnlab.org/technologies/datashop/) 19 | * [Academic Torrents](http://academictorrents.com/): distributed network for sharing large research datasets 20 | * [Dataverse Project](http://dataverse.org/): searchable archive of research data 21 | * Datasets hosted by private companies: 22 | * [Quandl](https://www.quandl.com/): over 10 million financial, economic, and social datasets 23 | * [Amazon Web Services Public Data Sets](http://aws.amazon.com/datasets/) 24 | * [Kaggle](http://www.kaggle.com/) provides datasets with their challenges, but each competition has its own rules as to whether the data can be used outside of the scope of the competition. 
25 | * Big lists of datasets: 26 | * [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets): Well-organized and frequently updated 27 | * [Rdatasets](http://vincentarelbundock.github.io/Rdatasets/): collection of 700+ datasets originally distributed with R packages 28 | * [RDataMining.com](http://www.rdatamining.com/resources/data) 29 | * [KDnuggets](http://www.kdnuggets.com/datasets/index.html) 30 | * [inside-R](http://www.inside-r.org/howto/finding-data-internet) 31 | * [100+ Interesting Data Sets for Statistics](http://rs.io/2014/05/29/list-of-data-sets.html) 32 | * [20 Free Big Data Sources](http://smartdatacollective.com/bernardmarr/235366/big-data-20-free-big-data-sources-everyone-should-know) 33 | * [Sebastian Raschka](https://github.com/rasbt/pattern_classification/blob/master/resources/dataset_collections.md): datasets categorized by format and topic 34 | * APIs: 35 | * [Apigee](https://apigee.com/providers): explore dozens of popular APIs 36 | * [Mashape](https://www.mashape.com/): explore hundreds of APIs 37 | * [Python APIs](http://www.pythonforbeginners.com/api/list-of-python-apis): Python wrappers for many APIs 38 | * Other interesting datasets: 39 | * [FiveThirtyEight](https://github.com/fivethirtyeight/data): data and code related to their articles 40 | * [The Upshot](https://github.com/TheUpshot/): data related to their articles 41 | * [Yelp Dataset Challenge](http://www.yelp.com/dataset_challenge): Yelp reviews, business attributes, users, and more from 10 cities 42 | * [Donors Choose](http://data.donorschoose.org/open-data/overview/): data related to their projects 43 | * [200,000+ Jeopardy questions](http://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/) 44 | * [CrowdFlower](http://www.crowdflower.com/data-for-everyone): interesting datasets created or enhanced by their contributors 45 | * [UFO reports](https://github.com/planetsig/ufo-reports): geolocated and time-standardized UFO reports for close to a century 46 | * [Reddit Top 2.5 Million](https://github.com/umbrae/reddit-top-2.5-million): all-time top 1,000 posts from each of the top 2,500 subreddits 47 | * Other resources: 48 | * [Datasets subreddit](http://www.reddit.com/r/datasets/): ask for help finding a specific data set, or post your own 49 | * [Center for Data Innovation](http://www.datainnovation.org/category/publications/data-set-blog/): blog posts about interesting, recently-released data sets. 
50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | textblob 3 | -------------------------------------------------------------------------------- /slides/01_course_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_course_overview.pdf -------------------------------------------------------------------------------- /slides/01_course_overview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_course_overview.pptx -------------------------------------------------------------------------------- /slides/01_intro_to_data_science.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_intro_to_data_science.pdf -------------------------------------------------------------------------------- /slides/01_intro_to_data_science.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_intro_to_data_science.pptx -------------------------------------------------------------------------------- /slides/01_types_of_data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_types_of_data.pdf -------------------------------------------------------------------------------- /slides/01_types_of_data.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/01_types_of_data.pptx -------------------------------------------------------------------------------- /slides/02_git_github.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/02_git_github.pdf -------------------------------------------------------------------------------- /slides/02_git_github.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/02_git_github.pptx -------------------------------------------------------------------------------- /slides/06_machine_learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/06_machine_learning.pdf -------------------------------------------------------------------------------- /slides/06_machine_learning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/06_machine_learning.pptx -------------------------------------------------------------------------------- /slides/12_confusion_matrix.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/12_confusion_matrix.pdf -------------------------------------------------------------------------------- /slides/12_confusion_matrix.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/12_confusion_matrix.pptx -------------------------------------------------------------------------------- /slides/13_drawing_roc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/13_drawing_roc.pdf -------------------------------------------------------------------------------- /slides/13_drawing_roc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/13_drawing_roc.pptx -------------------------------------------------------------------------------- /slides/14_bayes_theorem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_bayes_theorem.pdf -------------------------------------------------------------------------------- /slides/14_bayes_theorem.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_bayes_theorem.pptx -------------------------------------------------------------------------------- /slides/14_naive_bayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_naive_bayes.pdf -------------------------------------------------------------------------------- /slides/14_naive_bayes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/14_naive_bayes.pptx -------------------------------------------------------------------------------- /slides/16_kaggle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/16_kaggle.pdf -------------------------------------------------------------------------------- /slides/16_kaggle.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/16_kaggle.pptx -------------------------------------------------------------------------------- /slides/19_clustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/19_clustering.pdf -------------------------------------------------------------------------------- /slides/19_clustering.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT8/ff04af83e8ac1409cd86f3cb3524e4141644c5a1/slides/19_clustering.pptx --------------------------------------------------------------------------------