├── .gitignore ├── README.md ├── code ├── 00_python_beginner_workshop.py ├── 00_python_intermediate_workshop.py ├── 01_chipotle_homework_solution.py ├── 01_reading_files.py ├── 03_exploratory_analysis_pandas.py ├── 04_apis.py ├── 04_visualization.py ├── 05_iris_exercise.py ├── 05_sklearn_knn.py ├── 07_glass_id_homework_solution.py ├── 08_web_scraping.py ├── 10_logistic_regression_confusion_matrix.py ├── 13_naive_bayes.py ├── 15_kaggle.py ├── 17_ensembling_exercise.py ├── 18_clustering.py ├── 18_regularization.py ├── 19_advanced_sklearn.py ├── 19_gridsearchcv_exercise.py ├── 19_regex_exercise.py ├── 19_regex_reference.py ├── 20_sql.py └── 21_ensembles_example.py ├── data ├── SMSSpamCollection.txt ├── airline_safety.csv ├── auto_mpg.txt ├── chipotle_orders.tsv ├── default.csv ├── drinks.csv ├── homicides.txt ├── imdb_movie_ratings_top_1000.csv ├── imdb_movie_urls.csv ├── kaggle_tweets.csv ├── sales.db ├── titanic_train.csv ├── vehicles.db ├── vehicles_test.csv └── vehicles_train.csv ├── homework ├── 02_command_line_hw_soln.md ├── 03_pandas_hw_soln.py ├── 04_visualization_hw_soln.py ├── 06_bias_variance.md ├── 07_glass_identification.md ├── 11_roc_auc.md ├── 11_roc_auc_annotated.md ├── 13_spam_filtering.md └── 13_spam_filtering_annotated.md ├── notebooks ├── 06_bias_variance.ipynb ├── 06_model_evaluation_procedures.ipynb ├── 09_linear_regression.ipynb ├── 11_cross_validation.ipynb ├── 11_roc_auc.ipynb ├── 11_titanic_exercise.ipynb ├── 13_bayes_iris.ipynb ├── 13_naive_bayes_spam.ipynb ├── 14_nlp.ipynb ├── 16_decision_trees.ipynb ├── 17_ensembling.ipynb ├── 18_regularization.ipynb └── images │ ├── 18_bias_variance.png │ ├── 18_overfitting.png │ ├── 18_ridge_lasso_path.png │ ├── 18_ridge_lasso_regression_coefficients.png │ ├── 18_underfitting_overfitting.png │ ├── cross_validation_diagram.png │ ├── cross_validation_example.png │ ├── estimating_coefficients.png │ ├── obama_clinton_tree.jpg │ ├── overfitting.png │ ├── r_squared.png │ ├── salary_color.png │ ├── salary_regions.png │ ├── salary_tree.png │ ├── salary_tree_annotated.png │ ├── salary_unpruned.png │ ├── slope_intercept.png │ ├── train_test_split.png │ ├── training_error.png │ ├── tree_titanic.png │ ├── tree_vehicles.png │ ├── tree_vs_linear.png │ └── underfitting_overfitting.png ├── other ├── peer_review.md ├── project.md ├── public_data.md └── resources.md └── slides ├── 01_course_overview.pdf ├── 01_course_overview.pptx ├── 02_Introduction_to_the_Command_Line.md ├── 02_git_github.pdf ├── 02_git_github.pptx ├── 04_apis.pdf ├── 04_apis.pptx ├── 04_visualization.pdf ├── 04_visualization.pptx ├── 05_intro_to_data_science.pdf ├── 05_intro_to_data_science.pptx ├── 05_machine_learning_knn.pdf ├── 05_machine_learning_knn.pptx ├── 08_web_scraping.pdf ├── 08_web_scraping.pptx ├── 10_logistic_regression_confusion_matrix.pdf ├── 10_logistic_regression_confusion_matrix.pptx ├── 11_drawing_roc.pdf ├── 11_drawing_roc.pptx ├── 13_bayes_theorem.pdf ├── 13_bayes_theorem.pptx ├── 13_naive_bayes.pdf ├── 13_naive_bayes.pptx ├── 15_kaggle.pdf ├── 15_kaggle.pptx ├── 18_clustering.pdf ├── 18_clustering.pptx ├── 20_sales_db_schema.png ├── 20_sql.pdf └── 20_sql.pptx /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .DS_Store 3 | *.pyc 4 | -------------------------------------------------------------------------------- /code/00_python_beginner_workshop.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multi-line comments go between 3 quotation 
marks. 3 | You can use single or double quotes. 4 | ''' 5 | 6 | # One-line comments are preceded by the pound symbol 7 | 8 | 9 | # BASIC DATA TYPES 10 | 11 | x = 5 # creates an object 12 | print type(x) # check the type: int (not declared explicitly) 13 | type(x) # automatically prints 14 | type(5) # assigning it to a variable is not required 15 | 16 | type(5.0) # float 17 | type('five') # str 18 | type(True) # bool 19 | 20 | 21 | # LISTS 22 | 23 | nums = [5, 5.0, 'five'] # multiple data types 24 | nums # print the list 25 | type(nums) # check the type: list 26 | len(nums) # check the length: 3 27 | nums[0] # print first element 28 | nums[0] = 6 # replace a list element 29 | 30 | nums.append(7) # list 'method' that modifies the list 31 | help(nums.append) # help on this method 32 | help(nums) # help on a list object 33 | nums.remove('five') # another list method 34 | 35 | sorted(nums) # 'function' that does not modify the list 36 | nums # it was not affected 37 | nums = sorted(nums) # overwrite the original list 38 | sorted(nums, reverse=True) # optional argument 39 | 40 | # list slicing [start:end:stride] 41 | weekdays = ['mon','tues','wed','thurs','fri'] 42 | weekdays[0] # element 0 43 | weekdays[0:3] # elements 0, 1, 2 44 | weekdays[:3] # elements 0, 1, 2 45 | weekdays[3:] # elements 3, 4 46 | weekdays[-1] # last element (element 4) 47 | weekdays[::2] # every 2nd element (0, 2, 4) 48 | weekdays[::-1] # backwards (4, 3, 2, 1, 0) 49 | 50 | days = weekdays + ['sat','sun'] # concatenate lists 51 | 52 | 53 | # FUNCTIONS 54 | 55 | def give_me_five(): # function definition ends with colon 56 | return 5 # indentation required for function body 57 | 58 | give_me_five() # prints the return value (5) 59 | num = give_me_five() # assigns return value to a variable, doesn't print it 60 | 61 | def calc(x, y, op): # three parameters (without any defaults) 62 | if op == 'add': # conditional statement 63 | return x + y 64 | elif op == 'subtract': 65 | return x - y 66 | else: 67 | print 'Valid operations: add, subtract' 68 | 69 | calc(5, 3, 'add') 70 | calc(5, 3, 'subtract') 71 | calc(5, 3, 'multiply') 72 | calc(5, 3) 73 | 74 | 75 | # EXERCISE: Write a function that takes two parameters (hours and rate), and 76 | # returns the total pay. 77 | 78 | def compute_pay(hours, rate): 79 | return hours * rate 80 | 81 | compute_pay(40, 10.50) 82 | 83 | 84 | # EXERCISE: Update your function to give the employee 1.5 times the hourly rate 85 | # for hours worked above 40 hours. 86 | 87 | def compute_more_pay(hours, rate): 88 | if hours <= 40: 89 | return hours * rate 90 | else: 91 | return 40*rate + (hours-40)*(rate*1.5) 92 | 93 | compute_more_pay(30, 10) 94 | compute_more_pay(45, 10) 95 | 96 | 97 | # STRINGS 98 | 99 | # create a string 100 | s = str(42) # convert another data type into a string 101 | s = 'I like you' 102 | 103 | # examine a string 104 | s[0] # returns 'I' 105 | len(s) # returns 10 106 | 107 | # string slicing like lists 108 | s[:6] # returns 'I like' 109 | s[7:] # returns 'you' 110 | s[-1] # returns 'u' 111 | 112 | # split a string into a list of substrings separated by a delimiter 113 | s.split(' ') # returns ['I','like','you'] 114 | s.split() # same thing 115 | 116 | # concatenate strings 117 | s3 = 'The meaning of life is' 118 | s4 = '42' 119 | s3 + ' ' + s4 # returns 'The meaning of life is 42' 120 | s3 + ' ' + str(42) # same thing 121 | 122 | 123 | # EXERCISE: Given a string s, return a string made of the first 2 and last 2 124 | # characters of the original string, so 'spring' yields 'spng'. 
However, if the 125 | # string length is less than 2, instead return the empty string. 126 | 127 | def both_ends(s): 128 | if len(s) < 2: 129 | return '' 130 | else: 131 | return s[:2] + s[-2:] 132 | 133 | both_ends('spring') 134 | both_ends('cat') 135 | both_ends('a') 136 | 137 | 138 | # FOR LOOPS 139 | 140 | # range returns a list of integers 141 | range(0, 3) # returns [0, 1, 2]: includes first value but excludes second value 142 | range(3) # same thing: starting at zero is the default 143 | 144 | # simple for loop 145 | for i in range(5): 146 | print i 147 | 148 | # print each list element in uppercase 149 | fruits = ['apple', 'banana', 'cherry'] 150 | for i in range(len(fruits)): 151 | print fruits[i].upper() 152 | 153 | # better for loop 154 | for fruit in fruits: 155 | print fruit.upper() 156 | 157 | 158 | # EXERCISE: Write a program that prints the numbers from 1 to 100. But for 159 | # multiples of 3 print 'fizz' instead of the number, and for the multiples of 160 | # 5 print 'buzz'. For numbers which are multiples of both 3 and 5 print 'fizzbuzz'. 161 | 162 | def fizz_buzz(): 163 | nums = range(1, 101) 164 | for num in nums: 165 | if num % 15 == 0: 166 | print 'fizzbuzz' 167 | elif num % 3 == 0: 168 | print 'fizz' 169 | elif num % 5 == 0: 170 | print 'buzz' 171 | else: 172 | print num 173 | 174 | fizz_buzz() 175 | 176 | 177 | # EXERCISE: Given a list of strings, return a list with the strings 178 | # in sorted order, except group all the strings that begin with 'x' first. 179 | # e.g. ['mix', 'xyz', 'apple', 'xanadu', 'aardvark'] returns 180 | # ['xanadu', 'xyz', 'aardvark', 'apple', 'mix'] 181 | # Hint: this can be done by making 2 lists and sorting each of them 182 | # before combining them. 183 | 184 | def front_x(words): 185 | lista=[] 186 | listb=[] 187 | for word in words: 188 | if word[0]=='x': 189 | lista.append(word) 190 | else: 191 | listb.append(word) 192 | return sorted(lista) + sorted(listb) 193 | 194 | front_x(['mix', 'xyz', 'apple', 'xanadu', 'aardvark']) 195 | -------------------------------------------------------------------------------- /code/00_python_intermediate_workshop.py: -------------------------------------------------------------------------------- 1 | ## QUIZ TO REVIEW BEGINNER WORKSHOP 2 | 3 | a = 5 4 | b = 5.0 5 | c = a/2 6 | d = b/2 7 | 8 | ''' 9 | What is type(a)? 10 | int 11 | What is type(b)? 12 | float 13 | What is c? 14 | 2 15 | What is d? 16 | 2.5 17 | ''' 18 | 19 | e = [a, b] 20 | f = range(10) 21 | 22 | ''' 23 | What is type(e)? 24 | list 25 | What is len(e)? 26 | 2 27 | What is type(f)? 28 | list 29 | What are the contents of f? 30 | integers 0 through 9 31 | What is 'range' called? 32 | a function 33 | How do I get help on 'range'? 34 | help(range) 35 | ''' 36 | 37 | g = ['mon','tues','wed','thurs','fri'] 38 | 39 | ''' 40 | How do I slice out 'mon'? 41 | g[0] 42 | How do I slice out 'mon' through 'wed'? 43 | g[0:3] 44 | What are two ways to slice out 'fri'? 45 | g[4] or g[-1] 46 | How do I check the type of 'mon'? 47 | type(g[0]) 48 | ''' 49 | 50 | g.remove('wed') 51 | sorted(g) 52 | h = sorted(g, reverse=True) 53 | 54 | ''' 55 | What are the contents of g? 56 | ['mon','tues','thurs','fri'] 57 | What are the contents of h? 58 | ['tues','thurs','mon','fri'] 59 | What is 'remove' called? 60 | a list method 61 | How do I get help on 'remove'? 62 | help(g.remove) 63 | What is 'reverse=True' called? 
64 | an optional argument 65 | ''' 66 | 67 | i = 'Hello' 68 | j = 'friend' 69 | k = i + j 70 | l = i + 3 71 | m = i[0] 72 | 73 | ''' 74 | What is 'k'? 75 | 'Hellofriend' 76 | What is 'l'? 77 | undefined (due to error) 78 | What is 'm'? 79 | 'H' 80 | ''' 81 | 82 | 83 | 84 | ## FOR LOOPS AND BASIC LIST COMPREHENSIONS 85 | 86 | # print 1 through 5 87 | nums = range(1, 6) 88 | for num in nums: 89 | print num 90 | 91 | # for loop to create a list of cubes 92 | cubes = [] 93 | for num in nums: 94 | cubes.append(num**3) 95 | 96 | # equivalent list comprehension 97 | cubes = [num**3 for num in nums] # [1, 8, 27, 64, 125] 98 | 99 | ''' 100 | EXERCISE: 101 | Given that: letters = ['a','b','c'] 102 | Write a list comprehension that returns: ['A','B','C'] 103 | Hint: 'hello'.upper() returns 'HELLO' 104 | 105 | [letter.upper() for letter in letters] 106 | 107 | BONUS EXERCISE: 108 | Given that: word = 'abc' 109 | Write a list comprehension that returns: ['A','B','C'] 110 | 111 | [letter.upper() for letter in word] 112 | ''' 113 | 114 | 115 | 116 | ## LIST COMPREHENSIONS WITH CONDITIONS 117 | 118 | nums = range(1, 6) 119 | 120 | # for loop to create a list of cubes of even numbers 121 | cubes_of_even = [] 122 | for num in nums: 123 | if num % 2 == 0: 124 | cubes_of_even.append(num**3) 125 | 126 | # equivalent list comprehension 127 | # syntax: [expression for variable in iterable if condition] 128 | cubes_of_even = [num**3 for num in nums if num % 2 == 0] # [8, 64] 129 | 130 | 131 | 132 | ## DICTIONARIES 133 | 134 | # dictionaries are similar to lists: 135 | # - both can contain multiple data types 136 | # - both are iterable 137 | # - both are mutable 138 | 139 | # dictionaries are different from lists: 140 | # - dictionaries are unordered 141 | # - dictionary lookup time is constant regardless of dictionary size 142 | 143 | # dictionaries are like real dictionaries: 144 | # - dictionaries are made of key-value pairs (word and definition) 145 | # - dictionary keys must be unique (each word is only defined once) 146 | # - you can use the key to look up the value, but not the other way around 147 | 148 | # create a dictionary (and open Variable Explorer in Spyder) 149 | family = {'dad':'homer', 'mom':'marge', 'size':6} 150 | 151 | # examine a dictionary 152 | family[0] # throws an error (there is no ordering) 153 | family['dad'] # returns 'homer' 154 | len(family) # returns 3 155 | family.keys() # returns list: ['dad', 'mom', 'size'] 156 | family.values() # returns list: ['homer', 'marge', 6] 157 | family.items() # returns list of tuples: 158 | # [('dad', 'homer'), ('mom', 'marge'), ('size', 6)] 159 | 160 | # modify a dictionary 161 | family['cat'] = 'snowball' # add a new entry 162 | family['cat'] = 'snowball ii' # edit an existing entry 163 | del family['cat'] # delete an entry 164 | family['kids'] = ['bart', 'lisa'] # value can be a list 165 | 166 | # accessing a list element within a dictionary 167 | family['kids'][0] # returns 'bart' 168 | 169 | ''' 170 | EXERCISE: 171 | Given that: d = {'a':10, 'b':20, 'c':[30, 40]} 172 | First, print the value for 'a' 173 | Then, change the value for 'b' to be 25 174 | Then, change the 30 to be 35 175 | Finally, append 45 to the end of the list that contains 35 and 40 176 | 177 | d['a'] 178 | d['b'] = 25 179 | d['c'][0] = 35 180 | d['c'].append(45) 181 | 182 | BONUS EXERCISE: 183 | Write a list comprehension that returns a list of the keys in uppercase 184 | 185 | [key.upper() for key in d.keys()] 186 | ''' 187 | 188 | 189 | 190 | ## APIs 191 | 192 | # API 
Providers: https://apigee.com/providers 193 | # Echo Nest API Console: https://apigee.com/console/echonest 194 | # Echo Nest Developer Center: http://developer.echonest.com/ 195 | 196 | import requests # import module (make its functions available) 197 | 198 | # use requests to talk to the web 199 | r = requests.get('http://www.google.com') 200 | r.text 201 | type(r.text) 202 | 203 | # request data from the Echo Nest API 204 | r = requests.get('http://developer.echonest.com/api/v4/artist/top_hottt?api_key=KBGUPZPJZS9PHWNIN&format=json') 205 | r.text 206 | r.json() # decode JSON 207 | type(r.json()) 208 | top = r.json() 209 | 210 | # pretty print for easier readability 211 | import pprint 212 | pprint.pprint(top) 213 | 214 | # pull out the artist data 215 | artists = top['response']['artists'] # list of 15 dictionaries 216 | 217 | # reformat data into a table structure 218 | artists_data = [artist.values() for artist in artists] # list of 15 lists 219 | artists_header = artists[0].keys() # list of 2 strings 220 | 221 | 222 | 223 | ## WORKING WITH PUBLIC DATA 224 | 225 | # List of data sources: https://github.com/justmarkham/DAT5/blob/master/other/public_data.md 226 | # FiveThirtyEight: http://fivethirtyeight.com/ 227 | # FiveThirtyEight data: https://github.com/fivethirtyeight/data 228 | # NFL ticket prices data: https://github.com/fivethirtyeight/data/tree/master/nfl-ticket-prices 229 | 230 | # Question: What is the average ticket price for Ravens' home vs away games? 231 | 232 | # open a CSV file from a URL 233 | import csv 234 | r = requests.get('https://raw.githubusercontent.com/fivethirtyeight/data/master/nfl-ticket-prices/2014-average-ticket-price.csv') 235 | data = [row for row in csv.reader(r.iter_lines())] # list of lists 236 | 237 | # open a downloaded CSV file from your working directory 238 | with open('2014-average-ticket-price.csv', 'rU') as f: 239 | data = [row for row in csv.reader(f)] # list of lists 240 | 241 | # examine the data 242 | type(data) 243 | len(data) 244 | data[0] 245 | data[1] 246 | 247 | # save the data we want 248 | data = data[1:97] 249 | 250 | # step 1: create a list that only contains events 251 | data[0][0] 252 | data[1][0] 253 | data[2][0] 254 | events = [row[0] for row in data] 255 | 256 | # EXERCISE 257 | # step 2: create a list that only contains prices (stored as integers) 258 | prices = [int(row[2]) for row in data] 259 | 260 | # step 3: figure out how to locate the away teams 261 | events[0] 262 | events[0].find(' at ') 263 | stop = events[0].find(' at ') 264 | events[0][:stop] 265 | 266 | # step 4: use a for loop to make a list of the away teams 267 | away_teams = [] 268 | for event in events: 269 | stop = event.find(' at ') 270 | away_teams.append(event[:stop]) 271 | 272 | # EXERCISE 273 | # step 5: use a for loop to make a list of the home teams 274 | home_teams = [] 275 | for event in events: 276 | start = event.find(' at ') + 4 277 | stop = event.find(' Tickets ') 278 | home_teams.append(event[start:stop]) 279 | 280 | # step 6: figure out how to get prices only for Ravens home games 281 | zip(home_teams, prices) # list of tuples 282 | [pair[1] for pair in zip(home_teams, prices)] # iterate through tuples and get price 283 | [price for team, price in zip(home_teams, prices)] # better way to get price 284 | [price for team, price in zip(home_teams, prices) if team == 'Baltimore Ravens'] # add a condition 285 | 286 | # step 7: create lists of the Ravens home and away game prices 287 | ravens_home = [price for team, price in zip(home_teams, 
prices) if team == 'Baltimore Ravens'] 288 | ravens_away = [price for team, price in zip(away_teams, prices) if team == 'Baltimore Ravens'] 289 | 290 | # EXERCISE 291 | # step 8: calculate the average of each list 292 | float(sum(ravens_home)) / len(ravens_home) 293 | float(sum(ravens_away)) / len(ravens_away) 294 | -------------------------------------------------------------------------------- /code/01_chipotle_homework_solution.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SOLUTION FILE: Homework with Chipotle data 3 | https://github.com/TheUpshot/chipotle 4 | ''' 5 | 6 | 7 | ''' 8 | PART 1: read in the data, parse it, and store it in a list of lists called 'data' 9 | Hint: this is a tsv file, and csv.reader() needs to be told how to handle it 10 | ''' 11 | 12 | import csv 13 | 14 | # specify that the delimiter is a tab character 15 | with open('chipotle_orders.tsv', 'rU') as f: 16 | data = [row for row in csv.reader(f, delimiter='\t')] 17 | 18 | 19 | ''' 20 | PART 2: separate the header and data into two different lists 21 | ''' 22 | 23 | header = data[0] 24 | data = data[1:] 25 | 26 | 27 | ''' 28 | PART 3: calculate the average price of an order 29 | Hint: examine the data to see if the 'quantity' column is relevant to this calculation 30 | Hint: work smarter, not harder! (this can be done in a few lines of code) 31 | ''' 32 | 33 | # count the number of unique order_id's 34 | # note: you could assume this is 1834 because that's the maximum order_id, but it's best to check 35 | num_orders = len(set([row[0] for row in data])) # 1834 36 | 37 | # create a list of prices 38 | # note: ignore the 'quantity' column because the 'item_price' takes quantity into account 39 | prices = [float(row[4][1:-1]) for row in data] # strip the dollar sign and trailing space 40 | 41 | # calculate the average price of an order and round to 2 digits 42 | round(sum(prices) / num_orders, 2) # $18.81 43 | 44 | 45 | ''' 46 | PART 4: create a list (or set) of all unique sodas and soft drinks that they sell 47 | Note: just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze' 48 | ''' 49 | 50 | # if 'item_name' includes 'Canned', append 'choice_description' to 'sodas' list 51 | sodas = [] 52 | for row in data: 53 | if 'Canned' in row[2]: 54 | sodas.append(row[3][1:-1]) # strip the brackets 55 | 56 | # create a set of unique sodas 57 | unique_sodas = set(sodas) 58 | 59 | 60 | ''' 61 | PART 5: calculate the average number of toppings per burrito 62 | Note: let's ignore the 'quantity' column to simplify this task 63 | Hint: think carefully about the easiest way to count the number of toppings 64 | Hint: 'hello there'.count('e') 65 | ''' 66 | 67 | # keep a running total of burritos and toppings 68 | burrito_count = 0 69 | topping_count = 0 70 | 71 | # calculate number of toppings by counting the commas and adding 1 72 | # note: x += 1 is equivalent to x = x + 1 73 | for row in data: 74 | if 'Burrito' in row[2]: 75 | burrito_count += 1 76 | topping_count += (row[3].count(',') + 1) 77 | 78 | # calculate the average topping count and round to 2 digits 79 | round(topping_count / float(burrito_count), 2) # 5.40 80 | 81 | 82 | ''' 83 | PART 6: create a dictionary in which the keys represent chip orders and 84 | the values represent the total number of orders 85 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... } 86 | Note: please take the 'quantity' column into account! 
87 | Advanced: learn how to use 'defaultdict' to simplify your code 88 | ''' 89 | 90 | # start with an empty dictionary 91 | chips = {} 92 | 93 | # if chip order is not in dictionary, then add a new key/value pair 94 | # if chip order is already in dictionary, then update the value for that key 95 | for row in data: 96 | if 'Chips' in row[2]: 97 | if row[2] not in chips: 98 | chips[row[2]] = int(row[1]) # this is a new key, so create key/value pair 99 | else: 100 | chips[row[2]] += int(row[1]) # this is an existing key, so add to the value 101 | 102 | # defaultdict saves you the trouble of checking whether a key already exists 103 | from collections import defaultdict 104 | dchips = defaultdict(int) 105 | for row in data: 106 | if 'Chips' in row[2]: 107 | dchips[row[2]] += int(row[1]) 108 | 109 | 110 | ''' 111 | BONUS: think of a question about this data that interests you, and then answer it! 112 | ''' 113 | -------------------------------------------------------------------------------- /code/01_reading_files.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Lesson on file reading using Airline Safety Data 3 | https://github.com/fivethirtyeight/data/tree/master/airline-safety 4 | ''' 5 | 6 | # read the whole file at once, return a single string (including newlines) 7 | # 'rU' mode (read universal) converts different line endings into '\n' 8 | f = open('airline_safety.csv', 'rU') 9 | data = f.read() 10 | f.close() 11 | 12 | # use a context manager to automatically close your file 13 | with open('airline_safety.csv', 'rU') as f: 14 | data = f.read() 15 | 16 | # read the whole file at once, return a list of lines 17 | with open('airline_safety.csv', 'rU') as f: 18 | data = f.readlines() 19 | 20 | # use list comprehension to duplicate readlines 21 | with open('airline_safety.csv', 'rU') as f: 22 | data = [row for row in f] 23 | 24 | # use the csv module to create a list of lists 25 | import csv 26 | with open('airline_safety.csv', 'rU') as f: 27 | data = [row for row in csv.reader(f)] 28 | 29 | # alternative method that doesn't require downloading the file 30 | import requests 31 | r = requests.get('https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv') 32 | data = [row for row in csv.reader(r.iter_lines())] 33 | 34 | # separate the header and data 35 | header = data[0] 36 | data = data[1:] 37 | 38 | # EXERCISE: 39 | # create a list of airline names (without the star) 40 | # create a list of the same length that contains 1 if there's a star and 0 if not 41 | airlines = [] 42 | starred = [] 43 | for row in data: 44 | if row[0][-1] == '*': 45 | starred.append(1) 46 | airlines.append(row[0][:-1]) 47 | else: 48 | starred.append(0) 49 | airlines.append(row[0]) 50 | 51 | # EXERCISE: 52 | # create a list that contains the average number of incidents per distance 53 | [(int(row[2]) + int(row[5])) / float(row[1]) for row in data] 54 | 55 | 56 | ''' 57 | A few extra things that will help you with the homework 58 | ''' 59 | 60 | # 'in' statement is useful for lists 61 | my_list = [1, 2, 1] 62 | 1 in my_list # True 63 | 3 in my_list # False 64 | 65 | # 'in' is useful for strings (checks for substrings) 66 | my_string = 'hello there' 67 | 'the' in my_string # True 68 | 'then' in my_string # False 69 | 70 | # 'in' is useful for dictionaries (checks keys but not values) 71 | my_dict = {'name':'Kevin', 'title':'instructor'} 72 | 'name' in my_dict # True 73 | 'Kevin' in my_dict # False 74 | 75 | # 'set' data structure 
is useful for gathering unique elements 76 | set(my_list) # returns a set of 1, 2 77 | len(set(my_list)) # count of unique elements 78 | 79 | 80 | ''' 81 | Homework with Chipotle data 82 | https://github.com/TheUpshot/chipotle 83 | ''' 84 | 85 | ''' 86 | PART 1: read in the data, parse it, and store it in a list of lists called 'data' 87 | Hint: this is a tsv file, and csv.reader() needs to be told how to handle it 88 | ''' 89 | 90 | ''' 91 | PART 2: separate the header and data into two different lists 92 | ''' 93 | 94 | ''' 95 | PART 3: calculate the average price of an order 96 | Hint: examine the data to see if the 'quantity' column is relevant to this calculation 97 | Hint: work smarter, not harder! (this can be done in a few lines of code) 98 | ''' 99 | 100 | ''' 101 | PART 4: create a list (or set) of all unique sodas and soft drinks that they sell 102 | Note: just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze' 103 | ''' 104 | 105 | ''' 106 | PART 5: calculate the average number of toppings per burrito 107 | Note: let's ignore the 'quantity' column to simplify this task 108 | Hint: think carefully about the easiest way to count the number of toppings 109 | Hint: 'hello there'.count('e') 110 | ''' 111 | 112 | ''' 113 | PART 6: create a dictionary in which the keys represent chip orders and 114 | the values represent the total number of orders 115 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... } 116 | Note: please take the 'quantity' column into account! 117 | Advanced: learn how to use 'defaultdict' to simplify your code 118 | ''' 119 | 120 | ''' 121 | BONUS: think of a question about this data that interests you, and then answer it! 122 | ''' 123 | -------------------------------------------------------------------------------- /code/03_exploratory_analysis_pandas.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLASS: Pandas for Data Exploration, Analysis, and Visualization 3 | 4 | About the data: 5 | WHO alcohol consumption data: 6 | article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/ 7 | original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption 8 | files: drinks.csv (with additional 'continent' column) 9 | """ 10 | 11 | """ 12 | First, we need to import Pandas into Python. Pandas is a Python package that 13 | allows for easy manipulation of DataFrames. You'll also need to import 14 | matplotlib for plotting. 15 | """ 16 | 17 | #imports 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | 22 | 23 | ''' 24 | Reading Files, Summarizing, Selecting, Filtering, Sorting 25 | ''' 26 | # Can read a file from a local file on your computer or from a URL 27 | drinks = pd.read_table('drinks.csv', sep=',') # read_table is more general 28 | drinks = pd.read_csv('drinks.csv') # read_csv is specific to CSV and implies sep="," 29 | # Can also read from URLs 30 | drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv') 31 | 32 | 33 | ''' 34 | Key Concept: Dot notation 35 | In Python, you can think of an object as an entity that can have both attributes 36 | and methods. A dot following an object indicates that you are about to access 37 | something within the object, an attribute or a method. Attributes contain 38 | information about the object. They are usually a single "word" following the 39 | dot. 
A method is somethng the object can do. They are usually a "word" with 40 | parentheses following the dot. 41 | ''' 42 | 43 | # examine the drinks data 44 | drinks # print the first 30 and last 30 rows 45 | type(drinks) # DataFrame 46 | drinks.head() # print the first 5 rows 47 | drinks.head(10) # print the first 10 rows 48 | drinks.tail() # print the last 5 rows 49 | drinks.describe() # summarize all numeric columns 50 | drinks.describe(include='all') # includes non numeric columns; new in pandas 0.15.0 51 | drinks.index # "the index" (aka "the labels") 52 | drinks.columns # column names (which is "an index") 53 | drinks.dtypes # data types of each column 54 | drinks.shape # number of rows and columns 55 | drinks.values # underlying numpy array 56 | drinks.info() # concise summary (includes memory usage as of pandas 0.15.0) 57 | 58 | # Print the 'beer_servings' Series (a single column) 59 | drinks.beer_servings 60 | drinks['beer_servings'] 61 | type(drinks.beer_servings) 62 | 63 | # Print two columns 64 | drinks[['beer_servings','wine_servings']] 65 | cols = ['beer_servings','wine_servings'] 66 | drinks[cols] 67 | 68 | # Calculate the average 'beer_servings' for the entire dataset 69 | drinks.describe() # summarize all numeric columns 70 | drinks.beer_servings.describe() # summarize only the 'beer_servings' Series 71 | drinks.beer_servings.mean() # only calculate the mean 72 | drinks.beer_servings.max() # only calculate the max 73 | drinks.beer_servings.min() # only calculate the min 74 | 75 | # Other aggregation functions 76 | drinks.beer_servings.sum() 77 | drinks.beer_servings.count() 78 | float(drinks.beer_servings.sum())/drinks.beer_servings.count() 79 | 80 | # Count the number of occurrences of each 'continent' value 81 | drinks.continent.value_counts() 82 | 83 | # Simple logical filters 84 | # Print all columns, but only show rows where the country is in Europe 85 | # Let's look at each piece of this. 86 | drinks.continent # Returns all of the continent values 87 | drinks.continent=='EU' # Returns True/False list 88 | drinks[drinks.continent=='EU'] # Returns all rows where True 89 | 90 | # Other logical filters 91 | drinks[drinks.beer_servings > 158] 92 | drinks[drinks.beer_servings <= 10] 93 | type(drinks[drinks.beer_servings <= 10]) # DataFrame 94 | drinks[drinks.beer_servings <= 10][['country','beer_servings']] 95 | 96 | # Calculate the average 'beer_servings' for all of Europe 97 | drinks[drinks.continent=='EU'].beer_servings.mean() 98 | 99 | # More complex logical fitering 100 | # Only show European countries with 'wine_servings' greater than 300 101 | # Note: parentheses are required for each condition, and you can't use 'and' or 'or' keywords 102 | drinks[(drinks.continent=='EU') & (drinks.wine_servings > 300)] 103 | 104 | # Show European countries or countries with 'wine_servings' greater than 300 105 | drinks[(drinks.continent=='EU') | (drinks.wine_servings > 300)] 106 | 107 | # Show countries who have more than the mean beer_servings 108 | drinks[drinks.beer_servings > drinks.beer_servings.mean()] 109 | 110 | ########################################## 111 | ############ Exercise 1 ############ 112 | ########################################## 113 | 114 | # Using the 'drinks' data, answer the following questions: 115 | # 1. What is the maximum number of total litres of pure alcohol? 116 | drinks.total_litres_of_pure_alcohol.max() 117 | 118 | # 2. Which country has the maximum number of total litres of pure alcohol? 
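# An equivalent one-liner, sketched on the assumption that Series.idxmax is available in your
# pandas version, in addition to the boolean-indexing answer shown below:
drinks.loc[drinks.total_litres_of_pure_alcohol.idxmax(), 'country']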
119 | drinks[drinks.total_litres_of_pure_alcohol == drinks.total_litres_of_pure_alcohol.max()]['country'] 120 | 121 | # 3. Does Haiti or Belarus consume more servings of spirits? 122 | drinks.spirit_servings[drinks.country=='Haiti'] > drinks.spirit_servings[drinks.country=='Belarus'] 123 | 124 | # 4. How many countries have more than 300 wine servings OR more than 300 125 | # beer servings OR more than 300 spirit servings? 126 | drinks[(drinks.wine_servings > 300) | (drinks.beer_servings > 300) | (drinks.spirit_servings > 300)].country.count() 127 | 128 | # 5. For the countries in the previous question, what is the average total litres 129 | # of pure alcohol? 130 | drinks[(drinks.wine_servings > 300) | (drinks.beer_servings > 300) | (drinks.spirit_servings > 300)].mean() 131 | 132 | 133 | # sorting 134 | drinks.beer_servings.order() # only works for a Series 135 | drinks.sort_index() # sort rows by label 136 | drinks.sort_index(by='beer_servings') # sort rows by a specific column 137 | drinks.sort_index(by='beer_servings', ascending=False) # use descending order instead 138 | drinks.sort_index(by=['beer_servings', 'wine_servings']) # sort by multiple columns 139 | 140 | # Determine which 10 countries have the highest 'total_litres_of_pure_alcohol' 141 | drinks.sort_index(by='total_litres_of_pure_alcohol').tail(10) 142 | 143 | # Determine which country has the highest value for 'beer_servings' 144 | drinks[drinks.beer_servings==drinks.beer_servings.max()].country 145 | 146 | # Use dot notation to string together commands 147 | # How many countries in each continent have beer_servings greater than 182? 148 | # i.e. a beer every two days 149 | drinks[drinks.beer_servings > 182].continent.value_counts() 150 | 151 | # add a new column as a function of existing columns 152 | # note: can't (usually) assign to an attribute (e.g., 'drinks.total_servings') 153 | drinks['total_servings'] = drinks.beer_servings + drinks.spirit_servings + drinks.wine_servings 154 | drinks['alcohol_mL'] = drinks.total_litres_of_pure_alcohol * 1000 155 | drinks.head() 156 | 157 | ''' 158 | Split-Apply-Combine 159 | ''' 160 | 161 | # for each continent, calculate mean beer servings 162 | drinks.groupby('continent').beer_servings.mean() 163 | 164 | # for each continent, calculate mean of all numeric columns 165 | drinks.groupby('continent').mean() 166 | 167 | # for each continent, count number of occurrences 168 | drinks.groupby('continent').continent.count() 169 | drinks.continent.value_counts() 170 | 171 | 172 | ''' 173 | A little numpy 174 | ''' 175 | probs = np.array([0.51, 0.50, 0.02, 0.49, 0.78]) 176 | # np.where functions like an IF statement in Excel 177 | # np.where(condition, value if true, value if false) 178 | np.where(probs >= 0.5, 1, 0) 179 | drinks['lots_of_beer'] = np.where(drinks.beer_servings > 300, 1, 0) 180 | 181 | 182 | 183 | ########################################## 184 | ############ Exercise 2 ############ 185 | ########################################## 186 | 187 | # 1. What is the average number of total litres of pure alcohol for each 188 | # continent? 189 | drinks.groupby('continent').total_litres_of_pure_alcohol.mean() 190 | 191 | 192 | # 2. For each continent, calculate the mean wine_servings for all countries who 193 | # have a spirit_servings greater than the overall spirit_servings mean. 194 | drinks[drinks.spirit_servings > drinks.spirit_servings.mean()].groupby('continent').wine_servings.mean() 195 | 196 | 197 | # 3. 
Per continent, for all of the countries that drink more beer servings than 198 | # the average number of beer servings, what is the average number of wine 199 | # servings? 200 | drinks[drinks.beer_servings > drinks.beer_servings.mean()].groupby('continent').wine_servings.mean() 201 | 202 | 203 | ''' 204 | Advanced Filtering (of rows) and Selecting (of columns) 205 | ''' 206 | 207 | # loc: filter rows by LABEL, and select columns by LABEL 208 | drinks.loc[0] # row with label 0 209 | drinks.loc[0:3] # rows with labels 0 through 3 210 | drinks.loc[0:3, 'beer_servings':'wine_servings'] # rows 0-3, columns 'beer_servings' through 'wine_servings' 211 | drinks.loc[:, 'beer_servings':'wine_servings'] # all rows, columns 'beer_servings' through 'wine_servings' 212 | drinks.loc[[0,3], ['beer_servings','spirit_servings']] # rows 1 and 4, columns 'beer_servings' and 'spirit_servings' 213 | 214 | # iloc: filter rows by POSITION, and select columns by POSITION 215 | drinks.iloc[0] # row with 0th position (first row) 216 | drinks.iloc[0:3] # rows with positions 0 through 2 (not 3) 217 | drinks.iloc[0:3, 0:3] # rows and columns with positions 0 through 2 218 | drinks.iloc[:, 0:3] # all rows, columns with positions 0 through 2 219 | drinks.iloc[[0,2], [0,1]] # 1st and 3rd row, 1st and 2nd column 220 | 221 | # mixing: select columns by LABEL, then filter rows by POSITION 222 | drinks.wine_servings[0:3] 223 | drinks[['beer_servings', 'spirit_servings', 'wine_servings']][0:3] 224 | 225 | 226 | ########################################## 227 | ############# Homework ############# 228 | ########################################## 229 | ''' 230 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.csv) 231 | to complete the following parts. Please turn in your code for each part. 232 | Before each code chunk, give a brief description (one line) of what the code is 233 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If 234 | the code output produces a plot or answers a question, give a brief 235 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for 236 | group A is higher than the mean for group B which means X,Y,Z"). 237 | ''' 238 | 239 | ''' 240 | Part 1 241 | Load the data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt) 242 | into a DataFrame. Try looking at the "head" of the file in the command line 243 | to see how the file is delimited and how to load it. 244 | Note: You do not need to turn in any command line code you may use. 245 | ''' 246 | 247 | ''' 248 | Part 2 249 | Get familiar with the data. Answer the following questions: 250 | - What is the shape of the data? How many rows and columns are there? 251 | - What variables are available? 252 | - What are the ranges for the values in each numeric column? 253 | - What is the average value for each column? Does that differ significantly 254 | from the median? 255 | ''' 256 | 257 | 258 | ''' 259 | Part 3 260 | Use the data to answer the following questions: 261 | - Which 5 cars get the best gas mileage? 262 | - Which 5 cars with more than 4 cylinders get the best gas mileage? 263 | - Which 5 cars get the worst gas mileage? 264 | - Which 5 cars with 4 or fewer cylinders get the worst gas mileage? 265 | ''' 266 | 267 | ''' 268 | Part 4 269 | Use groupby and aggregations to explore the relationships 270 | between mpg and the other variables. Which variables seem to have the greatest 271 | effect on mpg? 
272 | Some examples of things you might want to look at are: 273 | - What is the mean mpg for cars for each number of cylindres (i.e. 3 cylinders, 274 | 4 cylinders, 5 cylinders, etc)? 275 | - Did mpg rise or fall over the years contained in this dataset? 276 | - What is the mpg for the group of lighter cars vs the group of heaver cars? 277 | Note: Be creative in the ways in which you divide up the data. You are trying 278 | to create segments of the data using logical filters and comparing the mpg 279 | for each segment of the data. 280 | ''' 281 | 282 | -------------------------------------------------------------------------------- /code/04_apis.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: APIs 3 | 4 | Data Science Toolkit text2sentiment API 5 | ''' 6 | 7 | ''' 8 | APIs without wrappers (i.e. there is no nicely formatted function) 9 | ''' 10 | # Import the necessary modules 11 | import requests # Helps construct the request to send to the API 12 | import json # JSON helper functions 13 | 14 | # We have a sentence we want the sentiment of 15 | sample_sentence = 'A couple hundred hours & several thousand lines of code later... thank you @GA_DC!! #DataScience #GAGradNight' 16 | 17 | # We know end URL endpoint to send it to 18 | url = 'http://www.datasciencetoolkit.org/text2sentiment/' 19 | 20 | # First we specify the header 21 | header = {'content-type': 'application/json'} 22 | 23 | # Next we specify the body (the information we want the API to work on) 24 | body = sample_sentence 25 | 26 | # Now we make the request 27 | response = requests.post(url, data=body, headers=header) 28 | # Notice that this is a POST request 29 | 30 | # Let's look at the response 31 | response.status_code 32 | response.ok 33 | response.text 34 | 35 | # Let's turn that text back into JSON 36 | r_json = json.loads(response.text) 37 | r_json 38 | r_json['score'] # 2.0 39 | 40 | ########################################## 41 | ############ Exercise 1 ############ 42 | ########################################## 43 | # Turn the above code into a function 44 | # The function should take in one argument, some text, and return a number, 45 | # the sentiment. Call your function "get_sentiment". 46 | def get_sentiment(text): 47 | url = 'http://www.datasciencetoolkit.org/text2sentiment/' 48 | 49 | #specify header 50 | header = {'content-type': 'application/json'} 51 | 52 | # Next we specify the body (the information we want the API to work on) 53 | body = text 54 | 55 | # Now we make the request 56 | response = requests.post(url, data=body, headers=header) 57 | # Notice that this is a POST request 58 | r_json = json.loads(response.text) 59 | sentiment = r_json['score'] # 2.0 60 | return sentiment 61 | 62 | 63 | 64 | 65 | # Now that we've created our own wrapper, we can use it throughout our code. 66 | # We now have multiple sentences 67 | sentences = ['I love pizza!', 'I hate pizza!', 'I feel nothing about pizza!'] 68 | 69 | # Loop through the sentences 70 | for sentence in sentences: 71 | sentiment = get_sentiment(sentence) 72 | print sentence, sentiment # Print the results 73 | 74 | 75 | ''' 76 | APIs with wrappers (i.e. there is a nicely formatted function) 77 | ''' 78 | # Import the API library 79 | import dstk 80 | 81 | # Remember our sample sentence? 
82 | sample_sentence 83 | 84 | # Let's try our new API library 85 | # Instantiate DSTK object 86 | dstk = dstk.DSTK() 87 | dstk.text2sentiment(sample_sentence) # 2.0 88 | 89 | # We can once again loop through our sentences 90 | for sentence in sentences: 91 | sentiment = dstk.text2sentiment(sentence) 92 | print sentence, sentiment['score'] 93 | -------------------------------------------------------------------------------- /code/04_visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLASS: Visualization 3 | """ 4 | 5 | # imports 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | # import the data available at https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv 10 | drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv') 11 | 12 | ''' 13 | Visualization 14 | ''' 15 | 16 | # bar plot of number of countries in each continent 17 | drinks.continent.value_counts().plot(kind='bar', title='Countries per Continent') 18 | plt.xlabel('Continent') 19 | plt.ylabel('Count') 20 | plt.show() # show plot window (if it doesn't automatically appear) 21 | plt.savefig('countries_per_continent.png') # save plot to file 22 | 23 | # bar plot of average number of beer servings (per adult per year) by continent 24 | drinks.groupby('continent').beer_servings.mean().plot(kind='bar', title='Average Number of Beer Servings By Continent') 25 | plt.ylabel('Average Number of Beer Servings Per Year') 26 | plt.show() 27 | 28 | # histogram of beer servings (shows the distribution of a numeric column) 29 | drinks.beer_servings.hist(bins=20) 30 | plt.title("Distribution of Beer Servings") 31 | plt.xlabel('Beer Servings') 32 | plt.ylabel('Frequency') 33 | plt.show() 34 | 35 | # density plot of beer servings (smooth version of a histogram) 36 | drinks.beer_servings.plot(kind='density', xlim=(0,500)) 37 | plt.title("Distribution of Beer Servings") 38 | plt.xlabel('Beer Servings') 39 | plt.show() 40 | 41 | # grouped histogram of beer servings (shows the distribution for each group) 42 | drinks.beer_servings.hist(by=drinks.continent) 43 | plt.show() 44 | 45 | drinks.beer_servings.hist(by=drinks.continent, sharex=True) 46 | plt.show() 47 | 48 | drinks.beer_servings.hist(by=drinks.continent, sharex=True, sharey=True) 49 | plt.show() 50 | 51 | drinks.beer_servings.hist(by=drinks.continent, sharey=True, layout=(2, 3)) # change layout (new in pandas 0.15.0) 52 | plt.show() 53 | 54 | # boxplot of beer servings by continent (shows five-number summary and outliers) 55 | drinks.boxplot(column='beer_servings', by='continent') 56 | plt.show() 57 | 58 | # scatterplot of beer servings versus wine servings 59 | drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3) 60 | plt.show() 61 | 62 | # same scatterplot, except point color varies by 'spirit_servings' 63 | # note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0 64 | drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', c='spirit_servings', colormap='Blues') 65 | plt.show() 66 | 67 | # same scatterplot, except all European countries are colored red 68 | colors = np.where(drinks.continent=='EU', 'r', 'b') 69 | drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors) 70 | plt.show() 71 | 72 | # Scatter matrix 73 | pd.scatter_matrix(drinks) 74 | plt.show() 75 | 76 | 77 | ########################################## 78 | ############ Exercise 1 ############ 79 | 
########################################## 80 | 81 | # 1. Generate a plot showing the average number of total litres of pure alcohol 82 | # by continent. 83 | drinks.groupby('continent').total_litres_of_pure_alcohol.mean().plot(kind='bar') 84 | plt.show() 85 | 86 | # 2. Illustrate the relationship between spirit servings and total litres of 87 | # pure alcohol. What kind of relationship is there? 88 | drinks.plot(kind='scatter', x='spirit_servings', y='total_litres_of_pure_alcohol', alpha=0.4) 89 | plt.show() 90 | 91 | # 3. Generate one plot that shows the distribution of spirit servings for each 92 | # continent. 93 | drinks.spirit_servings.hist(by=drinks.continent, sharex=True, sharey=True) 94 | plt.show() 95 | 96 | 97 | ########################################## 98 | ############# Homework ############# 99 | ########################################## 100 | ''' 101 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt) 102 | to complete the following parts. Please turn in your code for each part. 103 | Before each code chunk, give a brief description (one line) of what the code is 104 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If 105 | the code output produces a plot or answers a question, give a brief 106 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for 107 | group A is higher than the mean for group B which means X,Y,Z"). 108 | ''' 109 | 110 | ''' 111 | Part 1 112 | Produce a plot that compares the mean mpg for the different numbers of cylinders. 113 | ''' 114 | 115 | ''' 116 | Part 2 117 | Use a scatter matrix to explore relationships between different numeric variables. 118 | ''' 119 | 120 | ''' 121 | Part 3 122 | Use a plot to answer the following questions: 123 | -Do heavier or lighter cars get better mpg? 124 | -How are horsepower and displacement related? 125 | -What does the distribution of acceleration look like? 126 | -How is mpg spread for cars with different numbers of cylinders? 127 | -Do cars made before or after 1975 get better average mpg? (Hint: You need to 128 | create a new column that encodes whether a year is before or after 1975.) 129 | ''' -------------------------------------------------------------------------------- /code/05_iris_exercise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | EXERCISE: "Human Learning" with iris data 3 | 4 | Can you predict the species of an iris using petal and sepal measurements? 5 | 6 | TASKS: 7 | 1. Read iris data into a pandas DataFrame, including column names. 8 | 2. Gather some basic information about the data. 9 | 3. Use groupby, sorting, and/or plotting to look for differences between species. 10 | 4. Come up with a set of rules that could be used to predict species based upon measurements. 11 | 12 | BONUS: Define a function that accepts a row of data and returns a predicted species. 13 | Then, use that function to make predictions for all existing rows of data. 
14 | ''' 15 | 16 | import pandas as pd 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | 20 | 21 | ## TASK 1 22 | 23 | # read the iris data into a pandas DataFrame, including column names 24 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 25 | iris = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 26 | names=col_names) 27 | 28 | 29 | ## TASK 2 30 | 31 | # gather basic information 32 | iris.shape 33 | iris.head() 34 | iris.describe() 35 | iris.species.value_counts() 36 | iris.dtypes 37 | iris.isnull().sum() 38 | 39 | 40 | ## TASK 3 41 | 42 | # use groupby to look for differences between the species 43 | iris.groupby('species').sepal_length.mean() 44 | iris.groupby('species').mean() 45 | iris.groupby('species').describe() 46 | 47 | # use sorting to look for differences between the species 48 | iris.sort_index(by='sepal_length').values 49 | iris.sort_index(by='sepal_width').values 50 | iris.sort_index(by='petal_length').values 51 | iris.sort_index(by='petal_width').values 52 | 53 | # use plotting to look for differences between the species 54 | iris.petal_width.hist(by=iris.species, sharex=True) 55 | iris.boxplot(column='petal_width', by='species') 56 | iris.boxplot(by='species') 57 | 58 | # map species to a numeric value so that plots can be colored by category 59 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}) 60 | iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap='Blues') 61 | pd.scatter_matrix(iris, c=iris.species_num) 62 | 63 | 64 | ## TASK 4 65 | 66 | # If petal length is less than 3, predict setosa. 67 | # Else if petal width is less than 1.8, predict versicolor. 68 | # Otherwise predict virginica. 69 | 70 | 71 | ## BONUS 72 | 73 | # define function that accepts a row of data and returns a predicted species 74 | def classify_iris(row): 75 | if row[2] < 3: # petal_length 76 | return 0 # setosa 77 | elif row[3] < 1.8: # petal_width 78 | return 1 # versicolor 79 | else: 80 | return 2 # virginica 81 | 82 | # predict for a single row 83 | classify_iris(iris.iloc[0, :]) # first row 84 | classify_iris(iris.iloc[149, :]) # last row 85 | 86 | # store predictions for all rows 87 | predictions = [classify_iris(row) for row in iris.values] 88 | 89 | # calculate the percentage of correct predictions 90 | np.mean(iris.species_num == predictions) # 0.96 91 | -------------------------------------------------------------------------------- /code/05_sklearn_knn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Introduction to scikit-learn with iris data 3 | ''' 4 | 5 | # read in iris data 6 | import pandas as pd 7 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'] 8 | iris = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 9 | names=col_names) 10 | 11 | # create numeric column for the response 12 | # note: features and response must both be entirely numeric! 
13 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}) 14 | 15 | # create X (features) three different ways 16 | X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] 17 | X = iris.loc[:, 'sepal_length':'petal_width'] 18 | X = iris.iloc[:, 0:4] 19 | 20 | # create y (response) 21 | y = iris.species_num 22 | 23 | # check the shape of X and y 24 | X.shape # 150 by 4 (n=150, p=4) 25 | y.shape # 150 (must match first dimension of X) 26 | 27 | # scikit-learn 4-step modeling pattern: 28 | 29 | # Step 1: import the class you plan to use 30 | from sklearn.neighbors import KNeighborsClassifier 31 | 32 | # Step 2: instantiate the "estimator" (aka the model) 33 | # note: all unspecified parameters are set to the defaults 34 | knn = KNeighborsClassifier(n_neighbors=1) 35 | 36 | # Step 3: fit the model with data (learn the relationship between X and y) 37 | knn.fit(X, y) 38 | 39 | # Step 4: use the "fitted model" to predict the response for a new observation 40 | knn.predict([3, 5, 4, 2]) 41 | 42 | # predict for multiple observations at once 43 | X_new = [[3, 5, 4, 2], [3, 5, 2, 2]] 44 | knn.predict(X_new) 45 | 46 | # try a different value of K ("tuning parameter") 47 | knn = KNeighborsClassifier(n_neighbors=5) 48 | knn.fit(X, y) 49 | knn.predict(X_new) # predicted classes 50 | knn.predict_proba(X_new) # predicted probabilities of class membership 51 | knn.kneighbors([3, 5, 4, 2]) # distances to nearest neighbors (and identities) 52 | 53 | # calculate Euclidian distance manually for nearest neighbor 54 | import numpy as np 55 | np.sqrt(((X.iloc[106, :] - [3, 5, 4, 2])**2).sum()) 56 | -------------------------------------------------------------------------------- /code/07_glass_id_homework_solution.py: -------------------------------------------------------------------------------- 1 | ''' 2 | HOMEWORK: Glass Identification (aka "Glassification") 3 | ''' 4 | 5 | # TASK 1: read data into a DataFrame 6 | import pandas as pd 7 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', 8 | names=['id','ri','na','mg','al','si','k','ca','ba','fe','glass_type'], 9 | index_col='id') 10 | 11 | # TASK 2: briefly explore the data 12 | df.shape 13 | df.head() 14 | df.tail() 15 | df.glass_type.value_counts() 16 | df.isnull().sum() 17 | 18 | # TASK 3: convert to binary classification problem (1/2/3/4 maps to 0, 5/6/7 maps to 1) 19 | import numpy as np 20 | df['binary'] = np.where(df.glass_type < 5, 0, 1) # method 1 21 | df['binary'] = df.glass_type.map({1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1}) # method 2 22 | df.binary.value_counts() 23 | 24 | # TASK 4: create a feature matrix (X) 25 | features = ['ri','na','mg','al','si','k','ca','ba','fe'] # create a list of features 26 | features = df.columns[:-2] # alternative way: slice 'columns' attribute like a list 27 | X = df[features] # create DataFrame X by only selecting features 28 | 29 | # TASK 5: create a response vector (y) 30 | y = df.binary 31 | 32 | # TASK 6: split X and y into training and testing sets 33 | from sklearn.cross_validation import train_test_split 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99) 35 | 36 | # TASK 7: fit a KNN model on the training set using K=5 37 | from sklearn.neighbors import KNeighborsClassifier 38 | knn = KNeighborsClassifier(n_neighbors=5) 39 | knn.fit(X_train, y_train) 40 | 41 | # TASK 8: make predictions on the testing set and calculate accuracy 42 | y_pred = knn.predict(X_test) 43 | from sklearn 
import metrics 44 | print metrics.accuracy_score(y_test, y_pred) # 90.7% accuracy 45 | 46 | # TASK 9: calculate null accuracy 47 | 1 - y.mean() # 76.2% null accuracy 48 | 49 | # BONUS: write a for loop that computes test set accuracy for a range of K values 50 | k_range = range(1, 30, 2) 51 | scores = [] 52 | for k in k_range: 53 | knn = KNeighborsClassifier(n_neighbors=k) 54 | knn.fit(X_train, y_train) 55 | y_pred = knn.predict(X_test) 56 | scores.append(metrics.accuracy_score(y_test, y_pred)) 57 | 58 | # BONUS: plot K versus test set accuracy to choose on optimal value for K 59 | import matplotlib.pyplot as plt 60 | plt.plot(k_range, scores) # optimal value is K=1 61 | -------------------------------------------------------------------------------- /code/08_web_scraping.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Web Scraping 3 | 4 | We will be using two packages in particular: requests and Beautiful Soup 4. 5 | ''' 6 | 7 | ''' 8 | Introduction to Beautiful Soup 9 | ''' 10 | 11 | # imports 12 | import requests # How Python gets the webpages 13 | from bs4 import BeautifulSoup # Creates structured, searchable object 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | 17 | # First, let's play with beautiful soup on a "toy" webpage 18 | html_doc = """ 19 | 20 | 21 | 22 | 23 | Brandon's Homepage! 24 | 25 | 26 | 27 |

<h1>Brandon's Homepage</h1>
28 | <p id="intro">My name is Brandon. I love web scraping!</p>
29 | <p id="background">I'm originally from Louisiana. I went to undergrad at Louisiana Tech and grad school at UNC.</p>
30 | <p id="current">I currently work as a Product Manager of Linguistics and Analytics at Clarabridge.</p>
31 |
32 | <h3>My Hobbies</h3>

33 | 40 | 41 | 42 | """ 43 | type(html_doc) 44 | 45 | # Beautiful soup allows us to create a structured object out of this string 46 | b = BeautifulSoup(html_doc) 47 | type(b) 48 | 49 | # Let's look at "b" 50 | b 51 | 52 | # The most useful methods in a Beautiful Soup object are "find" and "findAll". 53 | # "find" takes several parameters, the most important are "name" and "attrs". 54 | # Let's talk about "name". 55 | b.find(name='body') # Finds the 'body' tag and everything inside of it. 56 | body = b.find(name='body') 57 | type(body) #tag 58 | 59 | # You can search tags also 60 | h1 = body.find(name='h1') # Find the 'h1' tag inside of the 'body' tag 61 | h1 62 | h1.text # Print out just the text inside of the body 63 | 64 | # Now let's find the 'p' tags 65 | p = b.find(name='p') 66 | # This only finds one. This is where 'findAll' comes in. 67 | all_p = b.findAll(name='p') 68 | all_p 69 | type(all_p) # Result sets are a lot like Python lists 70 | all_p[0] # Access specific element with index 71 | all_p[1] 72 | # Iterable like list 73 | for one_p in all_p: 74 | print one_p.text # Print text 75 | 76 | # Access specific attribute of a tag 77 | all_p[0] # Specific tag 78 | all_p[0]['id'] # Speific attribute of a specific tag 79 | 80 | # Now let's talk about 'attrs' 81 | # Beautiful soup also allows us to choose tags with specific attributes 82 | b.find(name='p', attrs={"id":"intro"}) 83 | b.find(name='p', attrs={"id":"background"}) 84 | b.find(name='p', attrs={"id":"current"}) 85 | 86 | ########################################## 87 | ############ Exercise 1 ############ 88 | ########################################## 89 | 90 | # 1. Extact the 'h3' element from Brandon's webpage. 91 | b.find(name='h3') 92 | 93 | # 2. Extract Brandon's hobbies from the html_doc. Print out the text of the hobby. 94 | hobbies = b.findAll(name='ul') 95 | for hobby in hobbies: 96 | print hobby.text 97 | 98 | # 3. Extract Brandon's hobby that has the id "my favorite". 99 | b.find(name='li', attrs={'id':'my favorite'}) 100 | 101 | 102 | ''' 103 | Beautiful Soup from the web 104 | ''' 105 | 106 | # We see data on a web page that we want to get. First we need the HTML. 107 | # This downloads the HTML and puts it into the variable r 108 | r = requests.get('http://www.imdb.com/title/tt1856010/') 109 | # But when we look at it, it's just one giant string. 110 | type(r.text) # Unicode string 111 | r.text[0:200] 112 | 113 | # Beautiful soup allows us to create a structured object out of this string 114 | b = BeautifulSoup(r.text) 115 | type(b) 116 | 117 | 118 | ''' 119 | "find" and "findAll" with the 'name' parameter in Beautiful Soup 120 | ''' 121 | b.find(name='body') # Find a specific HTML tag 122 | body = b.find(name='body') # Store the output of your "find" 123 | type(body) # Let's look at the type 124 | 125 | # Can we still run another "find" command on the output? 126 | img = body.find('img') # Find the image tags 127 | img 128 | type(img) 129 | # Yes, but it only finds one of the "img" tags. We want them all. 130 | imgs = body.findAll(name='img') 131 | imgs # Now we get them all. 132 | type(imgs) # Resultsets are a lot like Python lists 133 | 134 | # Let's look at each individual image 135 | imgs[0] 136 | imgs[1] 137 | 138 | # We're really interested is the 'src' attribute, the actual image location. 139 | # How do we access attributes in a Python object? Using the dot notation or the 140 | # brackets. 
With Beautiful Soup, we must use the brackets 141 | imgs[0]['src'] 142 | 143 | # Now we can look through each image and print the 'src' attribute. 144 | for img in imgs: 145 | print img['src'] 146 | 147 | # Or maybe we want to create a list of all of the 'src' attributes 148 | src_list = [] 149 | for img in imgs: 150 | src_list.append(img['src']) 151 | 152 | len(src_list) 153 | 154 | 155 | ''' 156 | "find" and "findAll" with the 'attrs' parameter in Beautiful Soup 157 | ''' 158 | # Now let's talk about 'attrs' 159 | # Beautiful soup also allows us to choose tags with specific attributes 160 | title = b.find(name="span", attrs={"class":"itemprop", "itemprop":"name"}) 161 | title # Prints HTML matching that tag, but we want the actual name 162 | title.text # The "text" attribute gives you the text between two HTML tags 163 | 164 | star_rating = b.find(name="div", attrs={"class":"titlePageSprite star-box-giga-star"}) 165 | # How do I get the actual star_rating number? 166 | star_rating.text 167 | 168 | # How do I make this star_rating a number instead of a string? 169 | float(star_rating.text) 170 | 171 | ########################################## 172 | ############ Exercise 2 ############ 173 | ########################################## 174 | ''' 175 | We've retrieved the title of the show, but now we want the show's rating, 176 | duration, and genre. Using "find" and "find all", write code that retrieves 177 | each of these things 178 | Hint: Everything can be found in the "infobar". Try finding that first and 179 | searchng within it. 180 | ''' 181 | 182 | infobar = b.find(name="div", attrs={"class":"infobar"}) 183 | # Retrieve the show's content rating 184 | content_rating = infobar.find(name='meta', attrs={"itemprop":"contentRating"})['content'] 185 | 186 | # Retrieve the show's duration 187 | duration = infobar.find(name='time', attrs={"itemprop":"duration"}).text 188 | 189 | # Retrieve the show's genre 190 | genre = infobar.find(name='span', attrs={"itemprop":"genre"}).text 191 | 192 | 193 | ''' 194 | Looping through 'findAll' results 195 | ''' 196 | # Now we want to get the list of actors and actresses 197 | # First let's get the "div" block with all of the actor info 198 | actors_raw = b.find(name='div', attrs={"class":"txt-block", "itemprop":"actors", "itemscope":"", "itemtype":"http://schema.org/Person"}) 199 | 200 | # Now let's find all of the occurences of the "span" with "itemprop" "name", 201 | # meaning the tags with actors' and actresses' names. 202 | actors = actors_raw.findAll(name="span", attrs={"itemprop":"name"}) 203 | 204 | # Now we want to loop through each one and get the text inside the tags 205 | actors_list = [actor.text for actor in actors] 206 | 207 | ''' 208 | Creating a "Web Scraping" Function 209 | The above code we've written is useful, but we don't want to have to type it 210 | everytime. We want to create a function that takes the URL and outputs the pieces 211 | we want everytime. 
212 | ''' 213 | 214 | def getIMDBInfo(url): 215 | r = requests.get(url) # Get HTML 216 | b = BeautifulSoup(r.text) # Create Beautiful Soup object 217 | # Get various attributes and put them in dictionary 218 | results = {} # Initialize empty dictionary 219 | 220 | # Get the title 221 | results['title'] = b.find(name="span", attrs={"class":"itemprop", "itemprop":"name"}).text 222 | 223 | # Rating 224 | results['star_rating'] = float(b.find(name="div", attrs={"class":"titlePageSprite"}).text) 225 | 226 | # Actors/actresses 227 | actors_raw = b.find(name='div', attrs={"class":"txt-block", "itemprop":"actors", "itemscope":"", "itemtype":"http://schema.org/Person"}) 228 | actors = actors_raw.findAll(name="span", attrs={"class":"itemprop", "itemprop":"name"}) 229 | results['actors_list'] = [actor.text for actor in actors] 230 | 231 | # Content Rating 232 | infobar = b.find(name="div", attrs={"class":"infobar"}) 233 | results['content_rating'] = infobar.find(name='meta', attrs={"itemprop":"contentRating"})['content'] 234 | 235 | # Show duration 236 | results['duration'] = int(infobar.find(name='time', attrs={"itemprop":"duration"}).text.strip()[:-4])#infobar.find(name='time', attrs={"itemprop":"duration"}).text 237 | 238 | # Genre 239 | results['genre'] = infobar.find(name='span', attrs={"itemprop":"genre"}).text 240 | 241 | # Return dictionary 242 | return results 243 | 244 | # Let's see if it worked 245 | # We can look at the results of our previous web page, "House of Cards" 246 | getIMDBInfo('http://www.imdb.com/title/tt1856010/') 247 | # Now let's try another one: Interstellar 248 | getIMDBInfo('http://www.imdb.com/title/tt0816692/') 249 | 250 | # Now let's show the true functionality 251 | list_of_title_urls = [] 252 | with open('imdb_movie_urls.csv', 'rU') as f: 253 | list_of_title_urls = f.read().split('\n') 254 | 255 | # Let's get the data for each title in the list 256 | data = [] 257 | for title_url in list_of_title_urls: 258 | imdb_data = getIMDBInfo(title_url) 259 | data.append(imdb_data) 260 | 261 | column_names = ['star_rating', 'title', 'content_rating', 'genre', 'duration', 'actors_list'] 262 | movieRatings = pd.DataFrame(data, columns = column_names) 263 | movieRatings 264 | # Now we have some data we can begin exploring, aggregating, etc. 265 | 266 | 267 | ''' 268 | Bonus material: Getting movie data for the top 1000 movies on IMDB 269 | ''' 270 | 271 | # Or let's build another webscraper to get the IMDB top 1000 272 | movie_links = [] # Create empty list 273 | # Notice that we are creating a list [1,101,201,...] and changing the URL slightly each time. 274 | for i in range(1,1000,100): 275 | # Get url 276 | r = requests.get('http://www.imdb.com/search/title?groups=top_1000&sort=user_rating&start=' + str(i) + '&view=simple') # Get HTML 277 | b = BeautifulSoup(r.text) # Create Beautiful Soup object 278 | links = b.findAll(name='td', attrs={'class':'title'}) # Find all 'td's with 'class'='title' 279 | for link in links: 280 | a_link = link.find('a') # Find liks 281 | movie_links.append('http://www.imdb.com' + str(a_link['href'])) # Add link to list 282 | 283 | # Create dataframe of the top 1000 movies on IMDB 284 | # NOTE: This could take 5-10 minutes. You can skip this part as I've already 285 | # pulled all of this data and saved it to a file. 
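# (Added note: the loop below wraps each call to getIMDBInfo in a bare try/except
# and silently skips any page that fails to download or parse, so the resulting
# DataFrame may end up with slightly fewer than 1000 rows.)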
286 | data = [] 287 | j=0 288 | # Loop through every movie title 289 | for movie_link in movie_links: 290 | try: 291 | imdb_data = getIMDBInfo(movie_link) # Get movie data 292 | data.append(imdb_data) # Put movie data in list 293 | except: 294 | pass 295 | j += 1 296 | if j%50 == 0: 297 | print 'Completed ' + str(j) + ' titles!' # Print progress 298 | 299 | # Create data frame with movies 300 | column_names = ['star_rating', 'title', 'content_rating', 'genre', 'duration', 'actors_list'] 301 | movieRatingsTop1000 = pd.DataFrame(data, columns = column_names) 302 | 303 | # Read in the reated dataframe 304 | movieRatingsTop1000 = pd.read_csv('imdb_movie_ratings_top_1000.csv') 305 | 306 | # Now you're ready to do some analysis 307 | movieRatingsTop1000.describe() 308 | movieRatingsTop1000.groupby('genre').star_rating.mean() 309 | movieRatingsTop1000.groupby('content_rating').star_rating.mean() 310 | movieRatingsTop1000.plot(kind='scatter', x='duration', y='star_rating') 311 | plt.show() -------------------------------------------------------------------------------- /code/10_logistic_regression_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Logistic Regression and Confusion Matrix 3 | ''' 4 | 5 | ############################################################################### 6 | ### Logistic Regression 7 | ############################################################################### 8 | 9 | # Imports 10 | import pandas as pd 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn import metrics 14 | from math import exp 15 | import numpy as np 16 | import matplotlib.pyplot as plt 17 | 18 | # Read in data 19 | data = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/default.csv') 20 | data.head() 21 | # Change column to number 22 | data['student_bin'] = data.student.map({'No':0, 'Yes':1}) 23 | 24 | # Let's do some cursory analysis. 25 | data.groupby('default').balance.mean() 26 | data.groupby('default').income.mean() 27 | 28 | # Set X and y 29 | feature_cols = ['balance', 'income','student_bin'] 30 | X = data[feature_cols] 31 | y = data.default 32 | 33 | # Train test split 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2) 35 | 36 | # Fit model 37 | logreg = LogisticRegression() 38 | logreg.fit(X_train, y_train) 39 | y_pred = logreg.predict(X_test) # Predict 40 | 41 | # Access accuracy 42 | print metrics.accuracy_score(y_test, y_pred) 43 | 44 | 45 | ############################################################################### 46 | ### Null Accuracy Rate 47 | ############################################################################### 48 | 49 | # Compare to null accuracy rate. The null accuracy rate is the accuracy if I 50 | # predict all the majority class. If there are more 1's, I predict all 1's. 51 | # If there are more 0's, I predict all 0's. There are several ways to do this. 52 | 53 | # 1. Create a vector of majority class and use the accuracy_score. 54 | # "If I predicted all 0's, how accurate would I be? 55 | print metrics.accuracy_score(y_test, [0]*len(y_test)) 56 | 57 | # 2. Calculate the mean of y_test (AKA the percentage of 1's) 58 | y_test.mean() 59 | # One minus that number will be the percentage of 0's. This means that if you 60 | # predict all 0's, you will be correct 1-y_test-mean() percent of the time. 
61 | 1 - y_test.mean() 62 | 63 | # This puts our accuracy score into context a bit. We can now see that we 64 | # actually didn't do so great! 65 | 66 | 67 | ############################################################################### 68 | ### Intepretting Logistic Regression Coefficients 69 | ############################################################################### 70 | 71 | # Let's look at the coefficients 72 | for col in zip(feature_cols, logreg.coef_[0]): 73 | print col[0], col[1] 74 | 75 | # Let's interpret those. 76 | for col in zip(feature_cols, logreg.coef_[0]): 77 | print 'A unit increase in', col[0], 'equals a', exp(col[1]), 'increase in odds.' 78 | 79 | ############################################################################### 80 | ### Confusion Matrix 81 | ############################################################################### 82 | 83 | # Let's look at the confusion matrix 84 | con_mat = metrics.confusion_matrix(y_test, y_pred) 85 | print con_mat 86 | 87 | # Let's define our true posititves, false positives, true negatives, and false negatives 88 | true_neg = con_mat[0][0] 89 | false_neg = con_mat[1][0] 90 | true_pos = con_mat[1][1] 91 | false_pos = con_mat[0][1] 92 | 93 | # Sensitivity: percent of correct predictions when reference value is 'default' 94 | sensitivity = float(true_pos)/(false_neg + true_pos) 95 | print sensitivity 96 | print metrics.recall_score(y_test, y_pred) 97 | 98 | # Specificity: percent of correct predictions when reference value is 'not default' 99 | specificity = float(true_neg) / (true_neg + false_pos) 100 | print specificity 101 | 102 | ############################################################################### 103 | ### Logistic Regression Thresholds 104 | ############################################################################### 105 | 106 | # Logistic regression is actually predicting the underlying probability. 107 | # However, when you clal the "predict" function, it returns class labels. You 108 | # can still predict the actual probability and set your own threshold if you'd 109 | # like. This can be useful in cases where the "signal" from the model isn't 110 | # strong. 111 | 112 | # Predict probabilities 113 | logreg.predict_proba(X_test).shape 114 | probs = logreg.predict_proba(X_test)[:, 1] 115 | 116 | # The natural threshold for probabilility is 0.5, but you don't have to use 117 | # that. 118 | 119 | # Use 0.5 thrshold for predicting 'default' and confirm we get the same results 120 | preds_05 = np.where(probs >= 0.5, 1, 0) 121 | print metrics.accuracy_score(y_test, preds_05) 122 | con_mat_05 = metrics.confusion_matrix(y_test, preds_05) 123 | print con_mat_05 124 | 125 | # Let's look at a histogram of these probabilities. 126 | plt.hist(probs, bins=20) 127 | plt.title('Distribution of Probabilities') 128 | plt.xlabel('Probability') 129 | plt.ylabel('Frequency') 130 | plt.show() 131 | 132 | # Change cutoff for predicting default to 0.2 133 | preds_02 = np.where(probs > 0.2, 1, 0) 134 | delta = float((preds_02 != preds_05).sum())/len(X_test)*100 135 | print 'Changing the threshold from 0.5 to 0.2 changed %.2f percent of the predictions.' 
% delta 136 | 137 | # Check the new accuracy, sensitivity, specificity 138 | print metrics.accuracy_score(y_test, preds_02) 139 | con_mat_02 = metrics.confusion_matrix(y_test, preds_02) 140 | print con_mat_02 141 | 142 | # Let's define our true posititves, false positives, true negatives, and false negatives 143 | true_neg = con_mat_02[0][0] 144 | false_neg = con_mat_02[1][0] 145 | true_pos = con_mat_02[1][1] 146 | false_pos = con_mat_02[0][1] 147 | 148 | # Sensitivity: percent of correct predictions when reference value is 'default' 149 | sensitivity = float(true_pos)/(false_neg + true_pos) 150 | print sensitivity 151 | print metrics.recall_score(y_test, preds_02) 152 | 153 | # Specificity: percent of correct predictions when reference value is 'not default' 154 | specificity = float(true_neg) / (true_neg + false_pos) 155 | print specificity 156 | 157 | 158 | ############################################################################### 159 | ### Exercise/Possibly Homework 160 | ############################################################################### 161 | 162 | ''' 163 | Let's use the glass identification dataset again. We've previously run knn 164 | on this dataset. Now, let's try logistic regression. Access the dataset at 165 | http://archive.ics.uci.edu/ml/datasets/Glass+Identification. Complete the 166 | following tasks or answer the following questions. 167 | ''' 168 | ''' 169 | 1. Read the data into a pandas dataframe. 170 | ''' 171 | # Taken from Kevin's 07 HW solution 172 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', 173 | names=['id','ri','na','mg','al','si','k','ca','ba','fe','glass_type'], 174 | index_col='id') 175 | 176 | '''' 177 | 2. Explore the data and look at what columns are available. 178 | ''' 179 | # Taken from Kevin's 07 HW solution 180 | df.shape # 214 x 10 181 | df.head() 182 | df.tail() 183 | df.glass_type.value_counts() 184 | df.isnull().sum() # No nulls in our data 185 | 186 | '''' 187 | 3. Convert the 'glass type' column into a binary response. 188 | * If type of class = 1/2/3/4, binary=0. 189 | * If type of glass = 5/6/7, binary=1. 190 | ''' 191 | # Taken from Kevin's 07 HW solution 192 | df['binary'] = np.where(df.glass_type < 5, 0, 1) # method 1 193 | df['binary'] = df.glass_type.map({1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1}) # method 2 194 | df.binary.value_counts() 195 | 196 | ''' 197 | 4. Create a feature matrix and a response vector. 198 | ''' 199 | # Taken from Kevin's 07 HW solution 200 | features = ['ri','na','mg','al','si','k','ca','ba','fe'] # create a list of features 201 | features = df.columns[:-2] # alternative way: slice 'columns' attribute like a list 202 | X = df[features] # create DataFrame X by only selecting features 203 | y = df.binary 204 | 205 | ''' 206 | 5. Split the data into the appropriate training and testing sets. 207 | ''' 208 | # Taken from Kevin's 07 HW solution 209 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99) 210 | 211 | ''' 212 | 6. Create and fit a logistic regression model. 213 | ''' 214 | logreg = LogisticRegression() # Instatiate estimator 215 | logreg.fit(X_train, y_train) # Fit data 216 | 217 | ''' 218 | 7. Make predictions with your new model. 219 | ''' 220 | y_pred = logreg.predict(X_test) # Create predictions 221 | 222 | ''' 223 | 8. Calculate the accuracy rate of your model and compare it to the null accuracy. 
224 | ''' 225 | # Calculate accuracy of model 226 | metrics.accuracy_score(y_test, y_pred) 227 | 228 | # Calculate null accuracy 229 | metrics.accuracy_score(y_test, [0]*len(y_test)) 230 | 231 | ''' 232 | 9. Generate a confusion matrix for your predictions. Use this to calculate the 233 | sensitivity and specificity of your model. 234 | ''' 235 | # Let's look at the confusion matrix 236 | con_mat = metrics.confusion_matrix(y_test, y_pred) 237 | print con_mat 238 | 239 | # Let's define our true posititves, false positives, true negatives, and false negatives 240 | true_neg = con_mat[0][0] 241 | false_neg = con_mat[1][0] 242 | true_pos = con_mat[1][1] 243 | false_pos = con_mat[0][1] 244 | 245 | # Sensitivity: percent of correct predictions when reference value is 'default' 246 | sensitivity = float(true_pos)/(false_neg + true_pos) 247 | print sensitivity 248 | 249 | # Specificity: percent of correct predictions when reference value is 'not default' 250 | specificity = float(true_neg) / (true_neg + false_pos) 251 | print specificity -------------------------------------------------------------------------------- /code/13_naive_bayes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Naive Bayes SMS spam classifier 3 | DATA SOURCE: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection 4 | ''' 5 | 6 | ## READING IN THE DATA 7 | 8 | # read tab-separated file using pandas 9 | import pandas as pd 10 | df = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/SMSSpamCollection.txt', 11 | sep='\t', header=None, names=['label', 'msg']) 12 | 13 | # examine the data 14 | df.head(20) 15 | df.label.value_counts() 16 | df.msg.describe() 17 | 18 | # convert label to a binary variable 19 | df['label'] = df.label.map({'ham':0, 'spam':1}) 20 | df.head() 21 | 22 | # split into training and testing sets 23 | from sklearn.cross_validation import train_test_split 24 | X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1) 25 | X_train.shape 26 | X_test.shape 27 | 28 | 29 | ## COUNTVECTORIZER: 'convert text into a matrix of token counts' 30 | ## http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html 31 | 32 | from sklearn.feature_extraction.text import CountVectorizer 33 | 34 | # start with a simple example 35 | train_simple = ['call you tonight', 36 | 'Call me a cab', 37 | 'please call me... 
PLEASE!'] 38 | 39 | # learn the 'vocabulary' of the training data 40 | vect = CountVectorizer() 41 | vect.fit(train_simple) 42 | vect.get_feature_names() 43 | 44 | # transform training data into a 'document-term matrix' 45 | train_simple_dtm = vect.transform(train_simple) 46 | train_simple_dtm 47 | train_simple_dtm.toarray() 48 | 49 | # examine the vocabulary and document-term matrix together 50 | pd.DataFrame(train_simple_dtm.toarray(), columns=vect.get_feature_names()) 51 | 52 | # transform testing data into a document-term matrix (using existing vocabulary) 53 | test_simple = ["please don't call me"] 54 | test_simple_dtm = vect.transform(test_simple) 55 | test_simple_dtm.toarray() 56 | pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names()) 57 | 58 | 59 | ## REPEAT PATTERN WITH SMS DATA 60 | 61 | # instantiate the vectorizer 62 | vect = CountVectorizer() 63 | 64 | # learn vocabulary and create document-term matrix in a single step 65 | train_dtm = vect.fit_transform(X_train) 66 | train_dtm 67 | 68 | # transform testing data into a document-term matrix 69 | test_dtm = vect.transform(X_test) 70 | test_dtm 71 | 72 | # store feature names and examine them 73 | train_features = vect.get_feature_names() 74 | len(train_features) 75 | train_features[:50] 76 | train_features[-50:] 77 | 78 | # convert train_dtm to a regular array 79 | train_arr = train_dtm.toarray() 80 | train_arr 81 | 82 | 83 | ## SIMPLE SUMMARIES OF THE TRAINING DATA 84 | 85 | # refresher on NumPy 86 | import numpy as np 87 | arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) 88 | arr 89 | arr[0, 0] 90 | arr[1, 3] 91 | arr[0, :] 92 | arr[:, 0] 93 | np.sum(arr) 94 | np.sum(arr, axis=0) 95 | np.sum(arr, axis=1) 96 | 97 | # exercise: calculate the number of tokens in the 0th message in train_arr 98 | sum(train_arr[0, :]) 99 | 100 | # exercise: count how many times the 0th token appears across ALL messages in train_arr 101 | sum(train_arr[:, 0]) 102 | 103 | # exercise: count how many times EACH token appears across ALL messages in train_arr 104 | np.sum(train_arr, axis=0) 105 | 106 | # exercise: create a DataFrame of tokens with their counts 107 | train_token_counts = pd.DataFrame({'token':train_features, 'count':np.sum(train_arr, axis=0)}) 108 | train_token_counts.sort('count', ascending=False) 109 | 110 | 111 | ## MODEL BUILDING WITH NAIVE BAYES 112 | ## http://scikit-learn.org/stable/modules/naive_bayes.html 113 | 114 | # train a Naive Bayes model using train_dtm 115 | from sklearn.naive_bayes import MultinomialNB 116 | nb = MultinomialNB() 117 | nb.fit(train_dtm, y_train) 118 | 119 | # make predictions on test data using test_dtm 120 | y_pred = nb.predict(test_dtm) 121 | y_pred 122 | 123 | # compare predictions to true labels 124 | from sklearn import metrics 125 | print metrics.accuracy_score(y_test, y_pred) 126 | print metrics.confusion_matrix(y_test, y_pred) 127 | 128 | # predict (poorly calibrated) probabilities and calculate AUC 129 | y_prob = nb.predict_proba(test_dtm)[:, 1] 130 | y_prob 131 | print metrics.roc_auc_score(y_test, y_prob) 132 | 133 | # exercise: show the message text for the false positives 134 | X_test[y_test < y_pred] 135 | 136 | # exercise: show the message text for the false negatives 137 | X_test[y_test > y_pred] 138 | 139 | 140 | ## COMPARE NAIVE BAYES AND LOGISTIC REGRESSION 141 | ## USING ALL DATA AND CROSS-VALIDATION 142 | 143 | # create a document-term matrix using all data 144 | all_dtm = vect.fit_transform(df.msg) 145 | 146 | # instantiate logistic regression 147 | from 
sklearn.linear_model import LogisticRegression 148 | logreg = LogisticRegression() 149 | 150 | # compare AUC using cross-validation 151 | # note: this is slightly improper cross-validation... can you figure out why? 152 | from sklearn.cross_validation import cross_val_score 153 | cross_val_score(nb, all_dtm, df.label, cv=10, scoring='roc_auc').mean() 154 | cross_val_score(logreg, all_dtm, df.label, cv=10, scoring='roc_auc').mean() 155 | 156 | 157 | ## EXERCISE: CALCULATE THE 'SPAMMINESS' OF EACH TOKEN 158 | 159 | # create separate DataFrames for ham and spam 160 | df_ham = df[df.label==0] 161 | df_spam = df[df.label==1] 162 | 163 | # learn the vocabulary of ALL messages and save it 164 | vect.fit(df.msg) 165 | all_features = vect.get_feature_names() 166 | 167 | # create document-term matrix of ham, then convert to a regular array 168 | ham_dtm = vect.transform(df_ham.msg) 169 | ham_arr = ham_dtm.toarray() 170 | 171 | # create document-term matrix of spam, then convert to a regular array 172 | spam_dtm = vect.transform(df_spam.msg) 173 | spam_arr = spam_dtm.toarray() 174 | 175 | # count how many times EACH token appears across ALL messages in ham_arr 176 | ham_counts = np.sum(ham_arr, axis=0) 177 | 178 | # count how many times EACH token appears across ALL messages in spam_arr 179 | spam_counts = np.sum(spam_arr, axis=0) 180 | 181 | # create a DataFrame of tokens with their separate ham and spam counts 182 | all_token_counts = pd.DataFrame({'token':all_features, 'ham':ham_counts, 'spam':spam_counts}) 183 | 184 | # add one to ham counts and spam counts so that ratio calculations (below) make more sense 185 | all_token_counts['ham'] = all_token_counts.ham + 1 186 | all_token_counts['spam'] = all_token_counts.spam + 1 187 | 188 | # calculate ratio of spam-to-ham for each token 189 | all_token_counts['spam_ratio'] = all_token_counts.spam / all_token_counts.ham 190 | all_token_counts.sort('spam_ratio') 191 | -------------------------------------------------------------------------------- /code/15_kaggle.py: -------------------------------------------------------------------------------- 1 | ''' 2 | CLASS: Kaggle Stack Overflow competition 3 | ''' 4 | 5 | # read in the file and set the first column as the index 6 | import pandas as pd 7 | train = pd.read_csv('train.csv', index_col=0) 8 | train.head() 9 | 10 | 11 | ''' 12 | What are some assumptions and theories to test? 
13 | 14 | PostId: unique within the dataset 15 | OwnerUserId: not unique within the dataset, assigned in order 16 | OwnerCreationDate: users with older accounts have more open questions 17 | ReputationAtPostCreation: higher reputation users have more open questions 18 | OwnerUndeletedAnswerCountAtPostTime: users with more answers have more open questions 19 | Tags: 1 to 5 tags are required, many unique tags 20 | PostClosedDate: should only exist for closed questions 21 | OpenStatus: 1 means open 22 | ''' 23 | 24 | ## OPEN STATUS 25 | 26 | # dataset is perfectly balanced in terms of OpenStatus (not a representative sample) 27 | train.OpenStatus.value_counts() 28 | 29 | 30 | ## USER ID 31 | 32 | # OwnerUserId is not unique within the dataset, let's examine the top 3 users 33 | train.OwnerUserId.value_counts() 34 | 35 | # mostly closed questions, all lowercase, lots of spelling errors 36 | train[train.OwnerUserId==466534] 37 | 38 | # fewer closed questions, better grammar, high reputation but few answers 39 | train[train.OwnerUserId==39677] 40 | 41 | # very few closed questions, lots of answers 42 | train[train.OwnerUserId==34537] 43 | 44 | 45 | ## REPUTATION 46 | 47 | # ReputationAtPostCreation is higher for open questions: possibly use as a feature 48 | train.groupby('OpenStatus').ReputationAtPostCreation.describe() 49 | 50 | # not a useful histogram 51 | train.ReputationAtPostCreation.hist() 52 | 53 | # much more useful histogram 54 | train[train.ReputationAtPostCreation < 1000].ReputationAtPostCreation.hist() 55 | 56 | # grouped histogram 57 | train[train.ReputationAtPostCreation < 1000].ReputationAtPostCreation.hist(by=train.OpenStatus, sharey=True) 58 | 59 | 60 | ## ANSWER COUNT 61 | 62 | # rename column 63 | train.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 64 | 65 | # Answers is higher for open questions: possibly use as a feature 66 | train.groupby('OpenStatus').Answers.describe() 67 | 68 | # grouped histogram 69 | train[train.Answers < 50].Answers.hist(by=train.OpenStatus, sharey=True) 70 | 71 | 72 | ## USER ID 73 | 74 | # OwnerUserId is assigned in numerical order 75 | train.sort('OwnerUserId').OwnerCreationDate 76 | 77 | # OwnerUserId is lower for open questions: possibly use as a feature 78 | train.groupby('OpenStatus').OwnerUserId.describe() 79 | 80 | 81 | ## TITLE 82 | 83 | # create a new feature that represents the length of the title (in characters) 84 | train['TitleLength'] = train.Title.apply(len) 85 | 86 | # Title is longer for open questions: possibly use as a feature 87 | train.TitleLength.hist(by=train.OpenStatus) 88 | 89 | 90 | ## BODY 91 | 92 | # create a new feature that represents the length of the body (in characters) 93 | train['BodyLength'] = train.BodyMarkdown.apply(len) 94 | 95 | # BodyMarkdown is longer for open questions: possibly use as a feature 96 | train.BodyLength.hist(by=train.OpenStatus) 97 | 98 | 99 | ## TAGS 100 | 101 | # Tag1 is required, and the rest are optional 102 | train.isnull().sum() 103 | 104 | # there are over 5000 unique tags 105 | len(train.Tag1.unique()) 106 | 107 | # calculate the percentage of open questions for each tag 108 | train.groupby('Tag1').OpenStatus.mean() 109 | 110 | # percentage of open questions varies widely by tag (among popular tags) 111 | train.groupby('Tag1').OpenStatus.agg(['mean','count']).sort('count') 112 | 113 | # create a new feature that represents the number of tags for each question 114 | train['NumTags'] = train.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 115 | 116 | # 
NumTags is higher for open questions: possibly use as a feature 117 | train.NumTags.hist(by=train.OpenStatus) 118 | 119 | 120 | ''' 121 | Define a function that takes in a raw CSV file and returns a DataFrame that 122 | includes all created features (and any other modifications). That way, we 123 | can apply the same changes to both train.csv and test.csv. 124 | ''' 125 | 126 | # define the function 127 | def make_features(filename): 128 | df = pd.read_csv(filename, index_col=0) 129 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 130 | df['TitleLength'] = df.Title.apply(len) 131 | df['BodyLength'] = df.BodyMarkdown.apply(len) 132 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 133 | return df 134 | 135 | # apply function to both training and testing files 136 | train = make_features('train.csv') 137 | test = make_features('test.csv') 138 | 139 | 140 | ''' 141 | Use train/test split to compare a model that includes 1 feature with a model 142 | that includes 5 features. 143 | ''' 144 | 145 | ## ONE FEATURE 146 | 147 | # define X and y 148 | feature_cols = ['ReputationAtPostCreation'] 149 | X = train[feature_cols] 150 | y = train.OpenStatus 151 | 152 | # split into training and testing sets 153 | from sklearn.cross_validation import train_test_split 154 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 155 | 156 | # fit a logistic regression model 157 | from sklearn.linear_model import LogisticRegression 158 | logreg = LogisticRegression() 159 | logreg.fit(X_train, y_train) 160 | 161 | # examine the coefficient to check that it makes sense 162 | logreg.coef_ 163 | 164 | # predict response classes and predict class probabilities 165 | y_pred = logreg.predict(X_test) 166 | y_prob = logreg.predict_proba(X_test)[:, 1] 167 | 168 | # check how well we did 169 | from sklearn import metrics 170 | metrics.accuracy_score(y_test, y_pred) # 0.538 (better than guessing) 171 | metrics.confusion_matrix(y_test, y_pred) # predicts closed most of the time 172 | metrics.roc_auc_score(y_test, y_prob) # 0.602 (not horrible) 173 | metrics.log_loss(y_test, y_prob) # 0.690 (what is this?) 
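# (Added aside, not from the original notes: for binary labels y and predicted
# probabilities p, log loss is -mean(y*log(p) + (1-y)*log(1-p)). A minimal NumPy
# check of the toy example that follows; y_toy and p_toy are illustration-only names.)
import numpy as np
y_toy = np.array([0, 0, 1, 1])          # true labels for the toy example
p_toy = np.array([0.1, 0.2, 0.8, 0.9])  # predicted probabilities
print -np.mean(y_toy * np.log(p_toy) + (1 - y_toy) * np.log(1 - p_toy))  # ~0.164, matches metrics.log_loss below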
174 | 175 | # log loss is the competition's evaluation metric, so let's get a feel for it 176 | true = [0, 0, 1, 1] 177 | prob = [0.1, 0.2, 0.8, 0.9] 178 | metrics.log_loss(true, prob) # 0.164 (lower is better) 179 | 180 | # let's try a few other predicted probabilities and check the log loss 181 | prob = [0.4, 0.4, 0.6, 0.6] # 0.511 (predictions are right, but less confident) 182 | prob = [0.4, 0.4, 0.4, 0.6] # 0.612 (one wrong prediction that is a bit off) 183 | prob = [0.4, 0.4, 0.1, 0.6] # 0.959 (one wrong prediction that is way off) 184 | prob = [0.5, 0.5, 0.5, 0.5] # 0.693 (you can get this score without a model) 185 | 186 | 187 | ## FIVE FEATURES 188 | 189 | # define X and y 190 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags'] 191 | X = train[feature_cols] 192 | y = train.OpenStatus 193 | 194 | # split into training and testing sets 195 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 196 | 197 | # fit a logistic regression model 198 | logreg.fit(X_train, y_train) 199 | 200 | # examine the coefficients to check that they make sense 201 | logreg.coef_ 202 | 203 | # predict response classes and predict class probabilities 204 | y_pred = logreg.predict(X_test) 205 | y_prob = logreg.predict_proba(X_test)[:, 1] 206 | 207 | # check how well we did 208 | metrics.accuracy_score(y_test, y_pred) # 0.589 (doing better) 209 | metrics.confusion_matrix(y_test, y_pred) # predicts open more often 210 | metrics.roc_auc_score(y_test, y_prob) # 0.625 (tiny bit better) 211 | metrics.log_loss(y_test, y_prob) # 0.677 (a bit better) 212 | 213 | # let's see if cross-validation gives us similar results 214 | from sklearn.cross_validation import cross_val_score 215 | scores = cross_val_score(logreg, X, y, scoring='log_loss', cv=10) 216 | scores.mean() # 0.677 (identical to train/test split) 217 | scores.std() # very small 218 | 219 | 220 | ''' 221 | Use the model with 5 features to make a submission 222 | ''' 223 | 224 | # make sure that X and y are defined properly 225 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags'] 226 | X = train[feature_cols] 227 | y = train.OpenStatus 228 | 229 | # train the model on ALL data (not X_train and y_train) 230 | logreg.fit(X, y) 231 | 232 | # predict class probabilities for the actual testing data (not X_test) 233 | y_prob = logreg.predict_proba(test[feature_cols])[:, 1] 234 | 235 | # sample submission file indicates we need two columns: PostId and predicted probability 236 | test.index # PostId 237 | y_prob # predicted probability 238 | 239 | # create a DataFrame that has 'id' as the index, then export to a CSV file 240 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id') 241 | sub.to_csv('sub1.csv') 242 | 243 | 244 | ''' 245 | Create a few more features from Title 246 | ''' 247 | 248 | # string methods for a Series are accessed via 'str' 249 | train.Title.str.lower() 250 | 251 | # create a new feature that represents whether a Title is all lowercase 252 | train['TitleLowercase'] = (train.Title.str.lower() == train.Title).astype(int) 253 | 254 | # check if there are a meaningful number of ones 255 | train.TitleLowercase.value_counts() 256 | 257 | # percentage of open questions is lower among questions with lowercase titles: possibly use as a feature 258 | train.groupby('TitleLowercase').OpenStatus.mean() 259 | 260 | # create features that represent whether Title contains certain words 261 | train['TitleQuestion'] = 
train.Title.str.contains('question', case=False).astype(int) 262 | train['TitleNeed'] = train.Title.str.contains('need', case=False).astype(int) 263 | train['TitleHelp'] = train.Title.str.contains('help', case=False).astype(int) 264 | 265 | 266 | ''' 267 | Build a document-term matrix from Title using CountVectorizer 268 | ''' 269 | 270 | # define X and y 271 | X = train.Title 272 | y = train.OpenStatus 273 | 274 | # split into training and testing sets 275 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 276 | 277 | # use CountVectorizer with the default settings 278 | from sklearn.feature_extraction.text import CountVectorizer 279 | vect = CountVectorizer() 280 | 281 | # fit and transform on X_train, but only transform on X_test 282 | train_dtm = vect.fit_transform(X_train) 283 | test_dtm = vect.transform(X_test) 284 | 285 | # try a Naive Bayes model 286 | from sklearn.naive_bayes import MultinomialNB 287 | nb = MultinomialNB() 288 | nb.fit(train_dtm, y_train) 289 | y_prob = nb.predict_proba(test_dtm)[:, 1] 290 | metrics.log_loss(y_test, y_prob) # 0.659 (a bit better than our previous model) 291 | 292 | # try tuning CountVectorizer and repeat Naive Bayes 293 | vect = CountVectorizer(stop_words='english') 294 | train_dtm = vect.fit_transform(X_train) 295 | test_dtm = vect.transform(X_test) 296 | nb.fit(train_dtm, y_train) 297 | y_prob = nb.predict_proba(test_dtm)[:, 1] 298 | metrics.log_loss(y_test, y_prob) # 0.637 (even better) 299 | 300 | # try switching to logistic regression 301 | logreg.fit(train_dtm, y_train) 302 | y_prob = logreg.predict_proba(test_dtm)[:, 1] 303 | metrics.log_loss(y_test, y_prob) # 0.573 (much better!) 304 | 305 | 306 | ''' 307 | Create features from BodyMarkdown using TextBlob 308 | ''' 309 | 310 | # examine BodyMarkdown for first question 311 | train.iloc[0].BodyMarkdown 312 | 313 | # calculate the number of sentences in that question using TextBlob 314 | from textblob import TextBlob 315 | len(TextBlob(train.iloc[0].BodyMarkdown).sentences) 316 | 317 | # calculate the number of sentences for all questions (raises an error) 318 | train.BodyMarkdown.apply(lambda x: len(TextBlob(x).sentences)) 319 | 320 | # explicitly decode string to unicode to fix error (WARNING: VERY SLOW) 321 | train['BodySentences'] = train.BodyMarkdown.apply(lambda x: len(TextBlob(x.decode('utf-8')).sentences)) 322 | -------------------------------------------------------------------------------- /code/17_ensembling_exercise.py: -------------------------------------------------------------------------------- 1 | # Helper code for class 17 exercise 2 | 3 | # define the function 4 | def make_features(filename): 5 | df = pd.read_csv(filename, index_col=0) 6 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 7 | df['TitleLength'] = df.Title.apply(len) 8 | df['BodyLength'] = df.BodyMarkdown.apply(len) 9 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 10 | return df 11 | 12 | # apply function to both training and testing files 13 | train = make_features('train.csv') 14 | test = make_features('test.csv') 15 | 16 | # define X and y 17 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags'] 18 | X = train[feature_cols] 19 | y = train.OpenStatus 20 | 21 | ############################################################################### 22 | ##### Create some models with the derived features 23 | ############################################################################### 24 | 25 | 26 
| ############################################################################### 27 | ##### Count vectorizer 28 | ############################################################################### 29 | 30 | 31 | # define X and y 32 | X = train.Title 33 | y = train.OpenStatus 34 | 35 | # split into training and testing sets 36 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 37 | 38 | # use CountVectorizer with the default settings 39 | from sklearn.feature_extraction.text import CountVectorizer 40 | vect = CountVectorizer() 41 | 42 | # fit and transform on X_train, but only transform on X_test 43 | train_dtm = vect.fit_transform(X_train) 44 | test_dtm = vect.transform(X_test) 45 | 46 | 47 | ############################################################################### 48 | ##### Create a model with the text features 49 | ############################################################################### 50 | 51 | -------------------------------------------------------------------------------- /code/18_clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | THE DATA 3 | 4 | We have data about cars: things like MPG, acceleration, weight, etc. However, 5 | we don't have logical groupings for these cars. We can construct these 6 | manually using our domain knowledge (e.g. we could put all of the high mpg cars 7 | together and all of the low mpg cars together), but we want a more automatic 8 | way of grouping these vehicles that can take into account more features. 9 | ''' 10 | 11 | # Imports 12 | from sklearn.cluster import KMeans # K means model 13 | import matplotlib.pyplot as plt 14 | import pandas as pd 15 | import numpy as np 16 | 17 | # Read in data 18 | data = pd.read_table('auto_mpg.txt', sep='|') # All values range from 0 to 1 19 | data.drop('car_name', axis=1, inplace=True) # Drop labels from dataframe 20 | data.head() 21 | 22 | 23 | 24 | ''' 25 | CLUSTER ANALYSIS 26 | How do we implement a k-means clustering algorithm? 
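(Added summary: k-means picks k initial centroids, assigns each observation to
its nearest centroid, recomputes each centroid as the mean of the points assigned
to it, and repeats until the assignments stop changing.)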
27 | 28 | scikit-learn KMeans documentation for reference: 29 | http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html 30 | ''' 31 | 32 | # Standardize our data 33 | from sklearn.preprocessing import StandardScaler 34 | scaler = StandardScaler() 35 | data_scaled = scaler.fit_transform(data) 36 | 37 | 38 | # Set random seed for reproducibility 39 | np.random.seed(0) 40 | 41 | # Run KMeans 42 | est = KMeans(n_clusters=2, init='random') # Instatiate estimator 43 | est.fit(data_scaled) # Fit your data 44 | y_kmeans = est.predict(data_scaled) # Make cluster "predictions" 45 | 46 | # Inspect the data by looking at the means for each cluster 47 | data.groupby(y_kmeans).mean() 48 | 49 | # This can be compared to the overall means for each variable 50 | data.mean() 51 | 52 | # We can get the coordiantes for the center of each cluster 53 | centers = est.cluster_centers_ 54 | 55 | 56 | 57 | ''' 58 | VISUALIZING THE CLUSTERS 59 | ''' 60 | 61 | # We can create a nice plot to visualize this upon two of the dimensions 62 | colors = np.array(['red', 'green', 'blue', 'yellow', 'orange']) 63 | 64 | plt.figure() 65 | plt.scatter(data_scaled[:, 0], data_scaled[:, 5], c=colors[y_kmeans], s=50) 66 | plt.xlabel('MPG') 67 | plt.ylabel('Acceleration') 68 | plt.scatter(centers[:, 0], centers[:, 5], linewidths=3, marker='+', s=300, c='black') 69 | plt.show() 70 | 71 | # We can generate a scatter matrix to see all of the different dimensions paired 72 | pd.scatter_matrix(data, c=colors[y_kmeans], figsize=(15,15), s = 100) 73 | plt.show() 74 | 75 | 76 | 77 | ''' 78 | DETERMINING THE NUMBER OF CLUSTERS 79 | How do you choose k? There isn't a bright line, but we can evaluate 80 | performance metrics such as the silhouette coefficient across values of k. 81 | 82 | Note: You also have to take into account practical limitations of choosing k 83 | also. Ten clusters may give the best value, but it might not make sense in the 84 | context of your data. 
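(Added note: for each point, the silhouette coefficient compares the average
distance to points in its own cluster with the average distance to points in the
nearest other cluster, so values near 1 indicate tight, well-separated clusters
and values near 0 indicate overlapping clusters.)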
85 | 86 | scikit-learn Clustering metrics documentation: 87 | http://scikit-learn.org/stable/modules/classes.html#clustering-metrics 88 | ''' 89 | 90 | # Create a bunch of different models 91 | k_rng = range(2,15) 92 | k_est = [KMeans(n_clusters = k).fit(data) for k in k_rng] 93 | 94 | # Silhouette Coefficient 95 | # Generally want SC to be closer to 1, while also minimizing k 96 | from sklearn import metrics 97 | silhouette_score = [metrics.silhouette_score(data, e.labels_, metric='euclidean') for e in k_est] 98 | 99 | # Plot the results 100 | plt.figure() 101 | plt.title('Silhouette coefficient for various values of k') 102 | plt.plot(k_rng, silhouette_score, 'b*-') 103 | plt.xlim([1,15]) 104 | plt.grid(True) 105 | plt.ylabel('Silhouette Coefficient') 106 | plt.show() -------------------------------------------------------------------------------- /code/18_regularization.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | ##### Regularization with Linear Regression 3 | ############################################################################### 4 | 5 | ## TASK: Regularized regression 6 | ## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV 7 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html 8 | ## DATA: Crime (n=319 non-null, p=122, type=regression) 9 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime 10 | 11 | 12 | ########## Prepare data ########## 13 | # read in data, remove categorical features, remove rows with missing values 14 | import pandas as pd 15 | crime = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?']) 16 | crime = crime.iloc[:, 5:] 17 | crime.dropna(inplace=True) 18 | crime.head() 19 | 20 | # define X and y 21 | X = crime.iloc[:, :-1] 22 | y = crime.iloc[:, -1] 23 | 24 | # split into train/test 25 | from sklearn.cross_validation import train_test_split 26 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 27 | 28 | 29 | ########## Linear Regression Model Without Regularization ########## 30 | # linear regression 31 | from sklearn.linear_model import LinearRegression 32 | lm = LinearRegression() 33 | lm.fit(X_train, y_train) 34 | lm.coef_ 35 | 36 | # make predictions and evaluate 37 | import numpy as np 38 | from sklearn import metrics 39 | preds = lm.predict(X_test) 40 | print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds)) 41 | 42 | 43 | ########## Ridge Regression Model ########## 44 | # ridge regression (alpha must be positive, larger means more regularization) 45 | from sklearn.linear_model import Ridge 46 | rreg = Ridge(alpha=0.1, normalize=True) 47 | rreg.fit(X_train, y_train) 48 | rreg.coef_ 49 | preds = rreg.predict(X_test) 50 | print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)) 51 | 52 | # use RidgeCV to select best alpha 53 | from sklearn.linear_model import RidgeCV 54 | alpha_range = 10.**np.arange(-2, 3) 55 | rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range) 56 | rregcv.fit(X_train, y_train) 57 | rregcv.alpha_ 58 | preds = rregcv.predict(X_test) 59 | print 'RMSE (Ridge CV reg.) 
=', np.sqrt(metrics.mean_squared_error(y_test, preds)) 60 | 61 | ########## Lasso Regression Model ########## 62 | # lasso (alpha must be positive, larger means more regularization) 63 | from sklearn.linear_model import Lasso 64 | las = Lasso(alpha=0.01, normalize=True) 65 | las.fit(X_train, y_train) 66 | las.coef_ 67 | preds = las.predict(X_test) 68 | print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)) 69 | 70 | # try a smaller alpha 71 | las = Lasso(alpha=0.0001, normalize=True) 72 | las.fit(X_train, y_train) 73 | las.coef_ 74 | preds = las.predict(X_test) 75 | print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)) 76 | 77 | # use LassoCV to select best alpha (tries 100 alphas by default) 78 | from sklearn.linear_model import LassoCV 79 | lascv = LassoCV(normalize=True, alphas=alpha_range) 80 | lascv.fit(X_train, y_train) 81 | lascv.alpha_ 82 | lascv.coef_ 83 | preds = lascv.predict(X_test) 84 | print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)) 85 | 86 | ############################################################################### 87 | ##### Regularization with Logistic Regression 88 | ############################################################################### 89 | 90 | ## TASK: Regularized classification 91 | ## FUNCTION: LogisticRegression 92 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html 93 | ## DATA: Titanic (n=891, p=5 selected, type=classification) 94 | ## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data 95 | 96 | 97 | ########## Prepare data ########## 98 | # Get and prepare data 99 | titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/titanic_train.csv') 100 | titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1}) 101 | titanic.Age.fillna(titanic.Age.mean(), inplace=True) 102 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:] 103 | titanic = pd.concat([titanic, embarked_dummies], axis=1) 104 | 105 | # define X and y 106 | feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S'] 107 | X = titanic[feature_cols] 108 | y = titanic.Survived 109 | 110 | # split into train/test 111 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 112 | 113 | # standardize our data 114 | from sklearn.preprocessing import StandardScaler 115 | scaler = StandardScaler() 116 | scaler.fit(X_train) 117 | X_train_scaled = scaler.transform(X_train) 118 | X_test_scaled = scaler.transform(X_test) 119 | 120 | 121 | ########## Logistic Regression Model Without Regularization ########## 122 | # logistic regression 123 | from sklearn.linear_model import LogisticRegression 124 | logreg = LogisticRegression() 125 | logreg.fit(X_train_scaled, y_train) 126 | logreg.coef_ 127 | y_pred = logreg.predict(X_test_scaled) 128 | 129 | # Access accuracy 130 | print 'Accuracy (no penalty) =', metrics.accuracy_score(y_test, y_pred) 131 | 132 | 133 | ########## Logistic Regression With L1 Penalty ########## 134 | # logistic regression with L1 penalty (C must be positive, smaller means more regularization) 135 | logreg_l1 = LogisticRegression(C=0.1, penalty='l1') 136 | logreg_l1.fit(X_train_scaled, y_train) 137 | logreg_l1.coef_ 138 | y_pred_l1 = logreg_l1.predict(X_test_scaled) 139 | 140 | # Access accuracy 141 | print 'Accuracy (L1 penalty) =', metrics.accuracy_score(y_test, y_pred_l1) 142 | 143 | 144 | ########## Logistic Regression With L2 Penalty ########## 145 | # logistic 
regression with L2 penalty (C must be positive, smaller means more regularization) 146 | logreg_l2 = LogisticRegression(C=0.1, penalty='l2') 147 | logreg_l2.fit(X_train_scaled, y_train) 148 | logreg_l2.coef_ 149 | y_pred_l2 = logreg_l2.predict(X_test_scaled) 150 | 151 | # Access accuracy 152 | print 'Accuracy (L2 penalty) =', metrics.accuracy_score(y_test, y_pred_l2) -------------------------------------------------------------------------------- /code/19_advanced_sklearn.py: -------------------------------------------------------------------------------- 1 | ## TASK: Searching for optimal parameters 2 | ## FUNCTION: GridSearchCV 3 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/grid_search.html 4 | ## DATA: Titanic (n=891, p=5 selected, type=classification) 5 | ## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data 6 | 7 | # read in and prepare titanic data 8 | import pandas as pd 9 | titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/titanic_train.csv') 10 | titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1}) 11 | titanic.Age.fillna(titanic.Age.mean(), inplace=True) 12 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:] 13 | titanic = pd.concat([titanic, embarked_dummies], axis=1) 14 | 15 | # define X and y 16 | feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S'] 17 | X = titanic[feature_cols] 18 | y = titanic.Survived 19 | 20 | # use cross-validation to find best max_depth 21 | from sklearn.tree import DecisionTreeClassifier 22 | from sklearn.cross_validation import cross_val_score 23 | 24 | # try max_depth=2 25 | treeclf = DecisionTreeClassifier(max_depth=2, random_state=1) 26 | cross_val_score(treeclf, X, y, cv=10, scoring='roc_auc').mean() 27 | 28 | # try max_depth=3 29 | treeclf = DecisionTreeClassifier(max_depth=3, random_state=1) 30 | cross_val_score(treeclf, X, y, cv=10, scoring='roc_auc').mean() 31 | 32 | # use GridSearchCV to automate the search 33 | from sklearn.grid_search import GridSearchCV 34 | treeclf = DecisionTreeClassifier(random_state=1) 35 | depth_range = range(1, 21) 36 | param_grid = dict(max_depth=depth_range) 37 | grid = GridSearchCV(treeclf, param_grid, cv=10, scoring='roc_auc') 38 | grid.fit(X, y) 39 | 40 | # check the results of the grid search 41 | grid.grid_scores_ 42 | grid_mean_scores = [result[1] for result in grid.grid_scores_] 43 | 44 | # plot the results 45 | import matplotlib.pyplot as plt 46 | plt.plot(depth_range, grid_mean_scores) 47 | 48 | # what was best? 
49 | grid.best_score_ 50 | grid.best_params_ 51 | grid.best_estimator_ 52 | 53 | # search a "grid" of parameters 54 | depth_range = range(1, 21) 55 | leaf_range = range(1, 11) 56 | param_grid = dict(max_depth=depth_range, min_samples_leaf=leaf_range) 57 | grid = GridSearchCV(treeclf, param_grid, cv=10, scoring='roc_auc') 58 | grid.fit(X, y) 59 | grid.grid_scores_ 60 | grid.best_score_ 61 | grid.best_params_ 62 | 63 | 64 | ## TASK: Standardization of features (aka "center and scale" or "z-score normalization") 65 | ## FUNCTION: StandardScaler 66 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 67 | ## EXAMPLE: http://nbviewer.ipython.org/github/rasbt/pattern_classification/blob/master/preprocessing/about_standardization_normalization.ipynb 68 | ## DATA: Wine (n=178, p=2 selected, type=classification) 69 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Wine 70 | 71 | # fake data 72 | train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]}) 73 | oos = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54.9]}) 74 | 75 | # define X and y 76 | X = train[['length','mass','rings']] 77 | y = train.id 78 | 79 | # KNN with k=1 80 | from sklearn.neighbors import KNeighborsClassifier 81 | knn = KNeighborsClassifier(n_neighbors=1) 82 | knn.fit(X, y) 83 | 84 | # what "should" it predict? what does it predict? 85 | knn.predict(oos) 86 | 87 | # standardize the features 88 | from sklearn.preprocessing import StandardScaler 89 | scaler = StandardScaler() 90 | scaler.fit(X) 91 | X_scaled = scaler.transform(X) 92 | 93 | # compare original to standardized 94 | X.values 95 | X_scaled 96 | 97 | # figure out how it standardized 98 | scaler.mean_ 99 | scaler.std_ 100 | (X.values-scaler.mean_) / scaler.std_ 101 | 102 | # try this on real data 103 | wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None, usecols=[0,10,13]) 104 | wine.columns=['label', 'color', 'proline'] 105 | wine.head() 106 | wine.describe() 107 | 108 | # define X and y 109 | X = wine[['color', 'proline']] 110 | y = wine.label 111 | 112 | # split into train/test 113 | from sklearn.cross_validation import train_test_split 114 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 115 | 116 | # standardize X_train 117 | scaler.fit(X_train) 118 | X_train_scaled = scaler.transform(X_train) 119 | 120 | # check that it worked properly 121 | X_train_scaled[:, 0].mean() 122 | X_train_scaled[:, 0].std() 123 | X_train_scaled[:, 1].mean() 124 | X_train_scaled[:, 1].std() 125 | 126 | # standardize X_test 127 | X_test_scaled = scaler.transform(X_test) 128 | 129 | # is this right? 
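# (Added note: because the scaler was fit on X_train only, the test set is
# transformed with the *training* means and standard deviations, so the means and
# stds checked below will be close to, but not exactly, 0 and 1 -- which is the
# correct way to treat out-of-sample data.)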
130 | X_test_scaled[:, 0].mean() 131 | X_test_scaled[:, 0].std() 132 | X_test_scaled[:, 1].mean() 133 | X_test_scaled[:, 1].std() 134 | 135 | # compare KNN accuracy on original vs scaled data 136 | knn = KNeighborsClassifier(n_neighbors=3) 137 | knn.fit(X_train, y_train) 138 | knn.score(X_test, y_test) 139 | knn.fit(X_train_scaled, y_train) 140 | knn.score(X_test_scaled, y_test) 141 | 142 | 143 | ## TASK: Chaining steps 144 | ## FUNCTION: Pipeline 145 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/pipeline.html 146 | ## DATA: Wine (n=178, p=2 selected, type=classification) 147 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Wine 148 | 149 | # here is proper cross-validation on the original (unscaled) data 150 | X = wine[['color', 'proline']] 151 | y = wine.label 152 | knn = KNeighborsClassifier(n_neighbors=3) 153 | cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean() 154 | 155 | # why is this improper cross-validation on the scaled data? 156 | scaler = StandardScaler() 157 | X_scaled = scaler.fit_transform(X) 158 | cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean() 159 | 160 | # fix this using Pipeline 161 | from sklearn.pipeline import make_pipeline 162 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3)) 163 | cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean() 164 | 165 | # using GridSearchCV with Pipeline 166 | neighbors_range = range(1, 21) 167 | param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range) 168 | grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy') 169 | grid.fit(X, y) 170 | grid.best_score_ 171 | grid.best_params_ 172 | -------------------------------------------------------------------------------- /code/19_gridsearchcv_exercise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | EXERCISE: GridSearchCV with Stack Overflow competition data 3 | ''' 4 | 5 | import pandas as pd 6 | 7 | # define a function to create features 8 | def make_features(filename): 9 | df = pd.read_csv(filename, index_col=0) 10 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 11 | df['TitleLength'] = df.Title.apply(len) 12 | df['BodyLength'] = df.BodyMarkdown.apply(len) 13 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 14 | return df 15 | 16 | # apply function to both training and testing files 17 | train = make_features('train.csv') 18 | test = make_features('test.csv') 19 | 20 | # define X and y 21 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags'] 22 | X = train[feature_cols] 23 | y = train.OpenStatus 24 | 25 | 26 | ''' 27 | MAIN TASK: Use GridSearchCV to find optimal parameters for KNeighborsClassifier. 28 | - For "n_neighbors", try 5 different integer values. 29 | - For "weights", try 'uniform' and 'distance'. 30 | - Use 5-fold cross-validation (instead of 10-fold) to save computational time. 31 | - Remember that log loss is your evaluation metric! 32 | 33 | BONUS TASK #1: Once you have found optimal parameters, train your KNN model using 34 | those parameters, make predictions on the test set, and submit those predictions. 35 | 36 | BONUS TASK #2: Read the scikit-learn documentation for GridSearchCV to find the 37 | shortcut for accomplishing bonus task #1. 
38 | ''' 39 | 40 | # MAIN TASK 41 | from sklearn.neighbors import KNeighborsClassifier 42 | knn = KNeighborsClassifier() 43 | from sklearn.grid_search import GridSearchCV 44 | neighbors_range = [20, 40, 60, 80, 100] 45 | weight_options = ['uniform', 'distance'] 46 | param_grid = dict(n_neighbors=neighbors_range, weights=weight_options) 47 | grid = GridSearchCV(knn, param_grid, cv=5, scoring='log_loss') 48 | grid.fit(X, y) 49 | grid.grid_scores_ 50 | grid.best_score_ 51 | grid.best_params_ 52 | 53 | # BONUS TASK #1 54 | knn = KNeighborsClassifier(n_neighbors=100, weights='uniform') 55 | knn.fit(X, y) 56 | y_prob = knn.predict_proba(test[feature_cols])[:, 1] 57 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id') 58 | sub.to_csv('sub.csv') 59 | 60 | # BONUS TASK #2 61 | y_prob = grid.predict_proba(test[feature_cols])[:, 1] 62 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id') 63 | sub.to_csv('sub.csv') 64 | -------------------------------------------------------------------------------- /code/19_regex_exercise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Regular Expressions Exercise 3 | ''' 4 | 5 | # open file and store each line as one row 6 | with open('../data/homicides.txt', 'rU') as f: 7 | raw = [row for row in f] 8 | 9 | 10 | ''' 11 | Create a list of ages 12 | ''' 13 | 14 | import re 15 | 16 | ages = [] 17 | for row in raw: 18 | match = re.search(r'\d+ years old', row) 19 | if match: 20 | ages.append(match.group()) 21 | else: 22 | ages.append('0') 23 | 24 | # split the string on spaces, only keep the first element, and convert to int 25 | ages = [int(element.split()[0]) for element in ages] 26 | 27 | # check that 'raw' and 'ages' are the same length 28 | assert(len(raw)==len(ages)) 29 | 30 | # simplify process using a lookahead 31 | ages = [] 32 | for row in raw: 33 | match = re.search(r'\d+(?= years)', row) 34 | if match: 35 | ages.append(int(match.group())) 36 | else: 37 | ages.append(0) 38 | -------------------------------------------------------------------------------- /code/19_regex_reference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Regular Expressions (regex) Reference Guide 3 | 4 | Sources: 5 | https://developers.google.com/edu/python/regular-expressions 6 | https://docs.python.org/2/library/re.html 7 | ''' 8 | 9 | ''' 10 | Basic Patterns: 11 | 12 | Ordinary characters match themselves exactly 13 | . matches any single character except newline \n 14 | \w matches a word character (letter, digit, underscore) 15 | \W matches any non-word character 16 | \b matches boundary between word and non-word 17 | \s matches single whitespace character (space, newline, return, tab, form) 18 | \S matches single non-whitespace character 19 | \d matches single digit (0 through 9) 20 | \t matches tab 21 | \n matches newline 22 | \r matches return 23 | \ match a special character, such as period: \. 24 | 25 | Rules for Searching: 26 | 27 | Search proceeds through string from start to end, stopping at first match 28 | All of the pattern must be matched 29 | 30 | Basic Search Function: 31 | 32 | match = re.search(r'pattern', string_to_search) 33 | Returns match object 34 | If there is a match, access match using match.group() 35 | If there is no match, match is None 36 | Use 'r' in front of pattern to designate a raw string 37 | ''' 38 | 39 | import re 40 | 41 | s = 'my 1st string!!' 
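# [Aside -- not in the original reference; a quick hedged illustration of why the
# r'' (raw string) prefix recommended above matters when writing patterns.]
len('\t')    # 1: in a normal string, \t is interpreted as a single tab character
len(r'\t')   # 2: in a raw string, the backslash and the 't' are kept as two characters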
42 | 43 | match = re.search(r'my', s) # returns match object 44 | if match: # checks whether match was found 45 | print match.group() # if match was found, then print result 46 | 47 | re.search(r'my', s).group() # single-line version (without error handling) 48 | re.search(r'st', s).group() # 'st' 49 | re.search(r'sta', s).group() # error 50 | re.search(r'\w\w\w', s).group() # '1st' 51 | re.search(r'\W', s).group() # ' ' 52 | re.search(r'\W\W', s).group() # '!!' 53 | re.search(r'\s', s).group() # ' ' 54 | re.search(r'\s\s', s).group() # error 55 | re.search(r'..t', s).group() # '1st' 56 | re.search(r'\s\St', s).group() # ' st' 57 | re.search(r'\bst', s).group() # 'st' 58 | 59 | 60 | ''' 61 | Repetition: 62 | 63 | + 1 or more occurrences of the pattern to its left 64 | * 0 or more occurrences 65 | ? 0 or 1 occurrence 66 | 67 | + and * are 'greedy': they try to use up as much of the string as possible 68 | 69 | Add ? after + or * to make them non-greedy: +? or *? 70 | ''' 71 | 72 | s = 'sid is missing class' 73 | 74 | re.search(r'miss\w+', s).group() # 'missing' 75 | re.search(r'is\w+', s).group() # 'issing' 76 | re.search(r'is\w*', s).group() # 'is' 77 | 78 | s = '
<h1>my heading</h1>' 79 | 80 | re.search(r'<.+>', s).group() # '<h1>my heading</h1>' 81 | re.search(r'<.+?>', s).group() # '<h1>
' 82 | 83 | 84 | ''' 85 | Positions: 86 | 87 | ^ match start of a string 88 | $ match end of a string 89 | ''' 90 | 91 | s = 'sid is missing class' 92 | 93 | re.search(r'^miss', s).group() # error 94 | re.search(r'..ss', s).group() # 'miss' 95 | re.search(r'..ss$', s).group() # 'lass' 96 | 97 | 98 | ''' 99 | Brackets: 100 | 101 | [abc] match a or b or c 102 | \w, \s, etc. work inside brackets, except period just means a literal period 103 | [a-z] match any lowercase letter (dash indicates range unless it's last) 104 | [abc-] match a or b or c or - 105 | [^ab] match anything except a or b 106 | ''' 107 | 108 | s = 'my email is john-doe@gmail.com' 109 | 110 | re.search(r'\w+@\w+', s).group() # 'doe@gmail' 111 | re.search(r'[\w.-]+@[\w.-]+', s).group() # 'john-doe@gmail.com' 112 | 113 | 114 | ''' 115 | Lookarounds: 116 | 117 | Lookahead matches a pattern only if it is followed by another pattern 118 | 100(?= dollars) matches '100' only if it is followed by ' dollars' 119 | 120 | Lookbehind matches a pattern only if it is preceded by another pattern 121 | (?<=\$)100 matches '100' only if it is preceded by '$' 122 | ''' 123 | 124 | s = 'Name: Cindy, 30 years old' 125 | 126 | re.search(r'\d+(?= years? old)', s).group() # '30' 127 | re.search(r'(?<=Name: )\w+', s).group() # 'Cindy' 128 | 129 | 130 | ''' 131 | Match Groups: 132 | 133 | Parentheses create logical groups inside of match text 134 | match.group(1) corresponds to first group 135 | match.group(2) corresponds to second group 136 | match.group() corresponds to entire match text (as usual) 137 | ''' 138 | 139 | s = 'my email is john-doe@gmail.com' 140 | 141 | match = re.search(r'([\w.-]+)@([\w.-]+)', s) 142 | if match: 143 | match.group(1) # 'john-doe' 144 | match.group(2) # 'gmail.com' 145 | match.group() # 'john-doe@gmail.com' 146 | 147 | 148 | ''' 149 | Finding All Matches: 150 | 151 | re.findall() finds all matches and returns them as a list of strings 152 | list_of_strings = re.findall(r'pattern', string_to_search) 153 | 154 | If pattern includes parentheses, a list of tuples is returned 155 | ''' 156 | 157 | s = 'emails: joe@gmail.com, bob@gmail.com' 158 | 159 | re.findall(r'[\w.-]+@[\w.-]+', s) # ['joe@gmail.com', 'bob@gmail.com'] 160 | re.findall(r'([\w.-]+)@([\w.-]+)', s) # [('joe', 'gmail.com'), ('bob', 'gmail.com')] 161 | 162 | 163 | ''' 164 | Option Flags: 165 | 166 | Options flags modify the behavior of the pattern matching 167 | 168 | default: matching is case sensitive 169 | re.IGNORECASE: ignore uppercase/lowercase differences ('a' matches 'a' or 'A') 170 | 171 | default: period matches any character except newline 172 | re.DOTALL: allow period to match newline 173 | 174 | default: within a string of many lines, ^ and $ match start and end of entire string 175 | re.MULTILINE: allow ^ and $ to match start and end of each line 176 | 177 | Option flag is third argument to re.search() or re.findall(): 178 | re.search(r'pattern', string_to_search, re.IGNORECASE) 179 | re.findall(r'pattern', string_to_search, re.IGNORECASE) 180 | ''' 181 | 182 | s = 'emails: nicole@ga.co, joe@gmail.com, PAT@GA.CO' 183 | 184 | re.findall(r'\w+@ga\.co', s) # ['nicole@ga.co'] 185 | re.findall(r'\w+@ga\.co', s, re.IGNORECASE) # ['nicole@ga.co', 'PAT@GA.CO'] 186 | 187 | 188 | ''' 189 | Substitution: 190 | 191 | re.sub() finds all matches and replaces them with a specified string 192 | new_string = re.sub(r'pattern', r'replacement', string_to_search) 193 | 194 | Replacement string can refer to text from matching groups: 195 | \1 refers to group(1) 196 
| \2 refers to group(2) 197 | etc. 198 | ''' 199 | 200 | s = 'sid is missing class' 201 | 202 | re.sub(r'is ', r'was ', s) # 'sid was missing class' 203 | 204 | s = 'emails: joe@gmail.com, bob@gmail.com' 205 | 206 | re.sub(r'([\w.-]+)@([\w.-]+)', r'\1@yahoo.com', s) # 'emails: joe@yahoo.com, bob@yahoo.com' 207 | 208 | 209 | ''' 210 | Useful, But Not Covered: 211 | 212 | re.split() splits a string by the occurrences of a pattern 213 | re.compile() compiles a pattern (for improved performance if it's used many times) 214 | A|B indicates a pattern that can match A or B 215 | ''' 216 | -------------------------------------------------------------------------------- /code/21_ensembles_example.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Imports 3 | ''' 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.cross_validation import cross_val_score 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | from sklearn.pipeline import make_pipeline 10 | 11 | 12 | ''' 13 | Define a function that takes in a raw CSV file and returns a DataFrame that 14 | includes all created features (and any other modifications). That way, we 15 | can apply the same changes to both train.csv and test.csv. 16 | ''' 17 | 18 | # Define the function 19 | def make_features(filename): 20 | # Read in dataframe 21 | df = pd.read_csv(filename, index_col=0) 22 | 23 | #Rename columns 24 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True) 25 | 26 | # Get length of title of post 27 | df['TitleLength'] = df.Title.apply(len) 28 | 29 | # Get length of body of post 30 | df['BodyLength'] = df.BodyMarkdown.apply(len) 31 | 32 | # Number of tags for post 33 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1) 34 | 35 | # Is the title lowercase? 
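    # [Aside -- not in the original script; a hedged sketch of the pattern used on the
    # next line, shown on a throwaway Series: the comparison is True only when the
    # title is already all lowercase, and astype(int) turns that into a 0/1 feature.]
    # pd.Series(['Need help', 'need help']).str.lower() == pd.Series(['Need help', 'need help'])  # -> [False, True]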
36 | df['TitleLowercase'] = (df.Title.str.lower() == df.Title).astype(int) 37 | 38 | # Create features that represent whether Title contains certain words 39 | df['TitleQuestion'] = df.Title.str.contains('question', case=False).astype(int) 40 | df['TitleNeed'] = df.Title.str.contains('need', case=False).astype(int) 41 | df['TitleHelp'] = df.Title.str.contains('help', case=False).astype(int) 42 | 43 | return df 44 | 45 | # Apply function to the training data 46 | train = make_features('train.csv') 47 | X = train.drop('OpenStatus', axis=1) 48 | y = train.OpenStatus 49 | 50 | # Read in test data 51 | test = make_features('test.csv') 52 | 53 | 54 | # Split into training and testing sets 55 | #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 56 | 57 | ''' 58 | Five feature logistic regression model 59 | ''' 60 | # Define feature cols 61 | feature_cols_logreg = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags'] 62 | 63 | # Perform cross validation to get an idea of the performance of the model 64 | logreg = LogisticRegression() 65 | -cross_val_score(logreg, X[feature_cols_logreg], y, scoring="log_loss", cv=5).mean() 66 | 67 | # Predict class probabilities for the actual testing data 68 | logreg.fit(X[feature_cols_logreg], y) 69 | y_prob_logreg = logreg.predict_proba(test[feature_cols_logreg])[:, 1] 70 | 71 | ''' 72 | Five feature random forest model 73 | ''' 74 | # Define feature cols 75 | feature_cols_rf = ['TitleLowercase', 'TitleQuestion', 'TitleNeed', 'TitleHelp'] 76 | 77 | # Perform cross validation to get an idea of the performance of the model 78 | rf = RandomForestClassifier() 79 | -cross_val_score(rf, X[feature_cols_rf], y, scoring="log_loss", cv=5).mean() 80 | 81 | # Predict class probabilities for the actual testing data 82 | rf.fit(X[feature_cols_rf], y) 83 | y_prob_rf = rf.predict_proba(test[feature_cols_rf])[:, 1] 84 | 85 | 86 | 87 | ''' 88 | Text logistic regression model on 'Title' using pipeline 89 | ''' 90 | 91 | # Make pipleline 92 | pipe = make_pipeline(CountVectorizer(stop_words='english'), LogisticRegression()) 93 | 94 | # Perform cross validation to get an idea of the performance of the model 95 | -cross_val_score(pipe, X['Title'], y, scoring="log_loss", cv=5).mean() 96 | 97 | # Predict class probabilities for the actual testing data 98 | pipe.fit(X['Title'], y) 99 | y_prob_pipe = pipe.predict_proba(test['Title'])[:, 1] 100 | 101 | 102 | ''' 103 | Create submission 104 | ''' 105 | # Ensemble predictions 106 | y_prob_combined = (y_prob_logreg + y_prob_rf + 2*y_prob_pipe) / 3 107 | 108 | # Create a DataFrame that has 'id' as the index, then export to a CSV file 109 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob_combined}).set_index('id') 110 | sub.to_csv('sub_ensemble.csv') 111 | -------------------------------------------------------------------------------- /data/airline_safety.csv: -------------------------------------------------------------------------------- 1 | airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14 2 | Aer Lingus,320906734,2,0,0,0,0,0 3 | Aeroflot*,1197672318,76,14,128,6,1,88 4 | Aerolineas Argentinas,385803648,6,0,0,1,0,0 5 | Aeromexico*,596871813,3,1,64,5,0,0 6 | Air Canada,1865253802,2,0,0,2,0,0 7 | Air France,3004002661,14,4,79,6,2,337 8 | Air India*,869253552,2,1,329,4,1,158 9 | Air New Zealand*,710174817,3,0,0,5,1,7 10 | Alaska Airlines*,965346773,5,0,0,5,1,88 11 | Alitalia,698012498,7,2,50,4,0,0 
12 | All Nippon Airways,1841234177,3,1,1,7,0,0 13 | American*,5228357340,21,5,101,17,3,416 14 | Austrian Airlines,358239823,1,0,0,1,0,0 15 | Avianca,396922563,5,3,323,0,0,0 16 | British Airways*,3179760952,4,0,0,6,0,0 17 | Cathay Pacific*,2582459303,0,0,0,2,0,0 18 | China Airlines,813216487,12,6,535,2,1,225 19 | Condor,417982610,2,1,16,0,0,0 20 | COPA,550491507,3,1,47,0,0,0 21 | Delta / Northwest*,6525658894,24,12,407,24,2,51 22 | Egyptair,557699891,8,3,282,4,1,14 23 | El Al,335448023,1,1,4,1,0,0 24 | Ethiopian Airlines,488560643,25,5,167,5,2,92 25 | Finnair,506464950,1,0,0,0,0,0 26 | Garuda Indonesia,613356665,10,3,260,4,2,22 27 | Gulf Air,301379762,1,0,0,3,1,143 28 | Hawaiian Airlines,493877795,0,0,0,1,0,0 29 | Iberia,1173203126,4,1,148,5,0,0 30 | Japan Airlines,1574217531,3,1,520,0,0,0 31 | Kenya Airways,277414794,2,0,0,2,2,283 32 | KLM*,1874561773,7,1,3,1,0,0 33 | Korean Air,1734522605,12,5,425,1,0,0 34 | LAN Airlines,1001965891,3,2,21,0,0,0 35 | Lufthansa*,3426529504,6,1,2,3,0,0 36 | Malaysia Airlines,1039171244,3,1,34,3,2,537 37 | Pakistan International,348563137,8,3,234,10,2,46 38 | Philippine Airlines,413007158,7,4,74,2,1,1 39 | Qantas*,1917428984,1,0,0,5,0,0 40 | Royal Air Maroc,295705339,5,3,51,3,0,0 41 | SAS*,682971852,5,0,0,6,1,110 42 | Saudi Arabian,859673901,7,2,313,11,0,0 43 | Singapore Airlines,2376857805,2,2,6,2,1,83 44 | South African,651502442,2,1,159,1,0,0 45 | Southwest Airlines,3276525770,1,0,0,8,0,0 46 | Sri Lankan / AirLanka,325582976,2,1,14,4,0,0 47 | SWISS*,792601299,2,1,229,3,0,0 48 | TACA,259373346,3,1,3,1,1,3 49 | TAM,1509195646,8,3,98,7,2,188 50 | TAP - Air Portugal,619130754,0,0,0,0,0,0 51 | Thai Airways,1702802250,8,4,308,2,1,1 52 | Turkish Airlines,1946098294,8,3,64,8,2,84 53 | United / Continental*,7139291291,19,8,319,14,2,109 54 | US Airways / America West*,2455687887,16,7,224,11,2,23 55 | Vietnam Airlines,625084918,7,3,171,1,0,0 56 | Virgin Atlantic,1005248585,1,0,0,0,0,0 57 | Xiamen Airlines,430462962,9,1,82,2,0,0 58 | -------------------------------------------------------------------------------- /data/drinks.csv: -------------------------------------------------------------------------------- 1 | country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent Afghanistan,0,0,0,0,AS Albania,89,132,54,4.9,EU Algeria,25,0,14,0.7,AF Andorra,245,138,312,12.4,EU Angola,217,57,45,5.9,AF Antigua & Barbuda,102,128,45,4.9,NAm Argentina,193,25,221,8.3,SA Armenia,21,179,11,3.8,EU Australia,261,72,212,10.4,OC Austria,279,75,191,9.7,EU Azerbaijan,21,46,5,1.3,EU Bahamas,122,176,51,6.3,NAm Bahrain,42,63,7,2,AS Bangladesh,0,0,0,0,AS Barbados,143,173,36,6.3,NAm Belarus,142,373,42,14.4,EU Belgium,295,84,212,10.5,EU Belize,263,114,8,6.8,NAm Benin,34,4,13,1.1,AF Bhutan,23,0,0,0.4,AS Bolivia,167,41,8,3.8,SA Bosnia-Herzegovina,76,173,8,4.6,EU Botswana,173,35,35,5.4,AF Brazil,245,145,16,7.2,SA Brunei,31,2,1,0.6,AS Bulgaria,231,252,94,10.3,EU Burkina Faso,25,7,7,4.3,AF Burundi,88,0,0,6.3,AF Cote d'Ivoire,37,1,7,4,AF Cabo Verde,144,56,16,4,AF Cambodia,57,65,1,2.2,AS Cameroon,147,1,4,5.8,AF Canada,240,122,100,8.2,NAm Central African Republic,17,2,1,1.8,AF Chad,15,1,1,0.4,AF Chile,130,124,172,7.6,SA China,79,192,8,5,AS Colombia,159,76,3,4.2,SA Comoros,1,3,1,0.1,AF Congo,76,1,9,1.7,AF Cook Islands,0,254,74,5.9,OC Costa Rica,149,87,11,4.4,NAm Croatia,230,87,254,10.2,EU Cuba,93,137,5,4.2,NAm Cyprus,192,154,113,8.2,EU Czech Republic,361,170,134,11.8,EU North Korea,0,0,0,0,AS DR Congo,32,3,1,2.3,AF Denmark,224,81,278,10.4,EU Djibouti,15,44,3,1.1,AF 
Dominica,52,286,26,6.6,NAm Dominican Republic,193,147,9,6.2,NAm Ecuador,162,74,3,4.2,SA Egypt,6,4,1,0.2,AF El Salvador,52,69,2,2.2,NAm Equatorial Guinea,92,0,233,5.8,AF Eritrea,18,0,0,0.5,AF Estonia,224,194,59,9.5,EU Ethiopia,20,3,0,0.7,AF Fiji,77,35,1,2,OC Finland,263,133,97,10,EU France,127,151,370,11.8,EU Gabon,347,98,59,8.9,AF Gambia,8,0,1,2.4,AF Georgia,52,100,149,5.4,EU Germany,346,117,175,11.3,EU Ghana,31,3,10,1.8,AF Greece,133,112,218,8.3,EU Grenada,199,438,28,11.9,NAm Guatemala,53,69,2,2.2,NAm Guinea,9,0,2,0.2,AF Guinea-Bissau,28,31,21,2.5,AF Guyana,93,302,1,7.1,SA Haiti,1,326,1,5.9,NAm Honduras,69,98,2,3,NAm Hungary,234,215,185,11.3,EU Iceland,233,61,78,6.6,EU India,9,114,0,2.2,AS Indonesia,5,1,0,0.1,AS Iran,0,0,0,0,AS Iraq,9,3,0,0.2,AS Ireland,313,118,165,11.4,EU Israel,63,69,9,2.5,AS Italy,85,42,237,6.5,EU Jamaica,82,97,9,3.4,NAm Japan,77,202,16,7,AS Jordan,6,21,1,0.5,AS Kazakhstan,124,246,12,6.8,AS Kenya,58,22,2,1.8,AF Kiribati,21,34,1,1,OC Kuwait,0,0,0,0,AS Kyrgyzstan,31,97,6,2.4,AS Laos,62,0,123,6.2,AS Latvia,281,216,62,10.5,EU Lebanon,20,55,31,1.9,AS Lesotho,82,29,0,2.8,AF Liberia,19,152,2,3.1,AF Libya,0,0,0,0,AF Lithuania,343,244,56,12.9,EU Luxembourg,236,133,271,11.4,EU Madagascar,26,15,4,0.8,AF Malawi,8,11,1,1.5,AF Malaysia,13,4,0,0.3,AS Maldives,0,0,0,0,AS Mali,5,1,1,0.6,AF Malta,149,100,120,6.6,EU Marshall Islands,0,0,0,0,OC Mauritania,0,0,0,0,AF Mauritius,98,31,18,2.6,AF Mexico,238,68,5,5.5,NAm Micronesia,62,50,18,2.3,OC Monaco,0,0,0,0,EU Mongolia,77,189,8,4.9,AS Montenegro,31,114,128,4.9,EU Morocco,12,6,10,0.5,AF Mozambique,47,18,5,1.3,AF Myanmar,5,1,0,0.1,AS Namibia,376,3,1,6.8,AF Nauru,49,0,8,1,OC Nepal,5,6,0,0.2,AS Netherlands,251,88,190,9.4,EU New Zealand,203,79,175,9.3,OC Nicaragua,78,118,1,3.5,NAm Niger,3,2,1,0.1,AF Nigeria,42,5,2,9.1,AF Niue,188,200,7,7,OC Norway,169,71,129,6.7,EU Oman,22,16,1,0.7,AS Pakistan,0,0,0,0,AS Palau,306,63,23,6.9,OC Panama,285,104,18,7.2,NAm Papua New Guinea,44,39,1,1.5,OC Paraguay,213,117,74,7.3,SA Peru,163,160,21,6.1,SA Philippines,71,186,1,4.6,AS Poland,343,215,56,10.9,EU Portugal,194,67,339,11,EU Qatar,1,42,7,0.9,AS South Korea,140,16,9,9.8,AS Moldova,109,226,18,6.3,EU Romania,297,122,167,10.4,EU Russian Federation,247,326,73,11.5,AS Rwanda,43,2,0,6.8,AF St. Kitts & Nevis,194,205,32,7.7,NAm St. Lucia,171,315,71,10.1,NAm St. 
Vincent & the Grenadines,120,221,11,6.3,NAm Samoa,105,18,24,2.6,OC San Marino,0,0,0,0,EU Sao Tome & Principe,56,38,140,4.2,AF Saudi Arabia,0,5,0,0.1,AS Senegal,9,1,7,0.3,AF Serbia,283,131,127,9.6,EU Seychelles,157,25,51,4.1,AF Sierra Leone,25,3,2,6.7,AF Singapore,60,12,11,1.5,AS Slovakia,196,293,116,11.4,EU Slovenia,270,51,276,10.6,EU Solomon Islands,56,11,1,1.2,OC Somalia,0,0,0,0,AF South Africa,225,76,81,8.2,AF Spain,284,157,112,10,EU Sri Lanka,16,104,0,2.2,AS Sudan,8,13,0,1.7,AF Suriname,128,178,7,5.6,SA Swaziland,90,2,2,4.7,AF Sweden,152,60,186,7.2,EU Switzerland,185,100,280,10.2,EU Syria,5,35,16,1,AS Tajikistan,2,15,0,0.3,AS Thailand,99,258,1,6.4,AS Macedonia,106,27,86,3.9,EU Timor-Leste,1,1,4,0.1,AS Togo,36,2,19,1.3,AF Tonga,36,21,5,1.1,OC Trinidad & Tobago,197,156,7,6.4,NAm Tunisia,51,3,20,1.3,AF Turkey,51,22,7,1.4,AS Turkmenistan,19,71,32,2.2,AS Tuvalu,6,41,9,1,OC Uganda,45,9,0,8.3,AF Ukraine,206,237,45,8.9,EU United Arab Emirates,16,135,5,2.8,AS United Kingdom,219,126,195,10.4,EU Tanzania,36,6,1,5.7,AF USA,249,158,84,8.7,NAm Uruguay,115,35,220,6.6,SA Uzbekistan,25,101,8,2.4,AS Vanuatu,21,18,11,0.9,OC Venezuela,333,100,3,7.7,SA Vietnam,111,2,1,2,AS Yemen,6,0,0,0.1,AS Zambia,32,19,4,2.5,AF Zimbabwe,64,18,4,4.7,AF -------------------------------------------------------------------------------- /data/imdb_movie_ratings_top_1000.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/imdb_movie_ratings_top_1000.csv -------------------------------------------------------------------------------- /data/imdb_movie_urls.csv: -------------------------------------------------------------------------------- 1 | http://www.imdb.com/title/tt1856010/ http://www.imdb.com/title/tt0816692/ http://www.imdb.com/title/tt1826940/ http://www.imdb.com/title/tt0993846/ http://www.imdb.com/title/tt0285403/ http://www.imdb.com/title/tt2084970/ http://www.imdb.com/title/tt2980516/ http://www.imdb.com/title/tt0386676/ http://www.imdb.com/title/tt1266020/ -------------------------------------------------------------------------------- /data/sales.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/sales.db -------------------------------------------------------------------------------- /data/vehicles.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/vehicles.db -------------------------------------------------------------------------------- /data/vehicles_test.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,type 2 | 3000,2003,130000,4,truck 3 | 6000,2005,82500,4,car 4 | 12000,2010,60000,2,car 5 | -------------------------------------------------------------------------------- /data/vehicles_train.csv: -------------------------------------------------------------------------------- 1 | price,year,miles,doors,type 2 | 22000,2012,13000,2,car 3 | 14000,2010,30000,2,car 4 | 13000,2010,73500,4,car 5 | 9500,2009,78000,4,car 6 | 9000,2007,47000,4,car 7 | 4000,2006,124000,2,car 8 | 3000,2004,177000,4,car 9 | 2000,2004,209000,4,truck 10 | 3000,2003,138000,2,car 11 | 1900,2003,160000,4,car 12 | 2500,2003,190000,2,truck 13 | 5000,2001,62000,4,car 14 | 1800,1999,163000,2,truck 
15 | 1300,1997,138000,4,car 16 | -------------------------------------------------------------------------------- /homework/02_command_line_hw_soln.md: -------------------------------------------------------------------------------- 1 | ## Command Line Homework Solution 2 | #### The following solution assumes you are working from the class "DAT5" directory. 3 | * How many text messages are there? 4 | * Answer: 5574 5 | * Code: `wc data/SMSSpamCollection.txt` gives you the line count, word count, and character count 6 | 7 | * What is the average number of words per text? What is the average number of characters per text? 8 | * Answer: Words per text: 15.6 or 16.6 (see below for explanation); Characters per text: 85.7 or 81.9 (see below) 9 | * Code: 10 | * `wc data/SMSSpamCollection.txt` gives you the line count, word count, and character count. You can divide the word count by the line count (so the number of words in each line which represents one text) to get 92482/5574 = 16.6 words per text. However, if you want to be more technical about it, each line contains an extra word that is not part of the text, the "spam" or "ham" label. You could remove the number of "spam"/"ham" labels (one per line) from the total word count to get (92482 - 5574)/5574 = 15.6. 11 | * Similarly, using the line count and character count from the `wc` command, you can divide the character count by the line count to get 477907/5574 = 85.7. If you remove the characters counted for the "spam" and "ham" labels, you get (477907 - 4*(# of hams) - 5*(# of spams) )/5574 = (477907 - 4*(4827) - 5*(747) )/5574 = 81.9. **Note**: The point of this wasn't to necessarily get the exact numbers but to identify that you can use `wc` to get a quick idea of features and labels in your data without having to open it. 12 | 13 | 14 | * How many messages are spam? How many are ham? 15 | * Answer: Spam: 4827 Ham: 747 16 | * Code: 17 | * `grep -w 'ham' data/SMSSpamCollection.txt | wc` gives you the line count of lines labeled 'ham' in the file. 18 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc` gives you the line count of lines labeled 'spam' in the file. 19 | 20 | * Is there a difference between the number of words per text and characters per text in messages that are spam vs. those that are ham? What are these numbers? 21 | * Answer: Yes, there is a difference. It seems that the "spam" texts have a much higher words per text and characters per text. Using the simplified calculations (i.e. not remove the "spam" and "ham" from the word counts), we get the following numbers. 22 | ``` 23 | Words per Text Char per Text 24 | Ham: 15.3 76.6 25 | Spam: 24.9 145.12 26 | ``` 27 | * Code: 28 | * `grep -w 'ham' data/SMSSpamCollection.txt | wc` gives the line, word, and character count for all of the lines labeled 'ham'. You can divide the word count by line count to get the 'Words per Text' and divide the character count by line count to get the 'Characters per Text'. 29 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc` gives the line, word, and character count for all of the lines labeled 'spam'. You can divide the word count by line count to get the 'Words per Text' and divide the character count by line count to get the 'Characters per Text'. 30 | 31 | * **Bonus**: If you feel that this is too easy, research the `awk` command to learn how to calculate and print out these averages in the console. 
32 | * Answer: See below 33 | * Code: 34 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc | awk '{print "Words per text: "$2/$1}'` will give you the words per text. Notice the format of `awk` here. You are telling it to print something and pass it column number labels. `wc` prints out three columns: lines, words, and characters. For the words per text (i.e. line) we want to divide the second column by the first. 35 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc | awk '{print "Characters per text: "$3/$1}'` will give you the character per text. 36 | 37 | * Separate the spam and ham messages into files "spam_messages.txt" and "ham_messages.txt". 38 | * Answer: The code below accomplishes this. 39 | * Code: 40 | * `grep -w 'ham' data/SMSSpamCollection.txt > ham.txt` takes the output of the `grep`, which is all of the lines that have a label ham, and puts it into a file called `ham.txt` using the `>` operator. 41 | * `grep -w 'spam' data/SMSSpamCollection.txt > spam.txt` takes the output of the `grep`, which is all of the lines that have a label spam, and puts it into a file called `spam.txt` using the `>` operator. 42 | -------------------------------------------------------------------------------- /homework/03_pandas_hw_soln.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Exploratory Data Analysis Homework Solution 3 | ''' 4 | 5 | ''' 6 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.csv) 7 | to complete the following parts. Please turn in your code for each part. 8 | Before each code chunk, give a brief description (one line) of what the code is 9 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If 10 | the code output produces a plot or answers a question, give a brief 11 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for 12 | group A is higher than the mean for group B which means X,Y,Z"). 13 | ''' 14 | 15 | ''' 16 | Part 1 17 | Load the data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt) 18 | into a DataFrame. Try looking at the "head" of the file in the command line 19 | to see how the file is delimited and how to load it. 20 | Note: You do not need to turn in any command line code you may use. 21 | ''' 22 | 23 | # Imports 24 | import pandas as pd 25 | 26 | # Reads text file and uses '|' as separator 27 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|') 28 | auto = pd.read_table('auto_mpg.txt', sep='|') # This is if you are reading from you computer 29 | # Note: This assumes that '.../DAT5/data' is your working directory. 30 | 31 | 32 | ''' 33 | Part 2 34 | Get familiar with the data. Answer the following questions: 35 | - What is the shape of the data? How many rows and columns are there? 36 | - What variables are available? 37 | - What are the ranges for the values in each numeric column? 38 | - What is the average value for each column? Does that differ significantly 39 | from the median? 40 | ''' 41 | 42 | auto.shape # There are 392 rows and 9 columns 43 | 44 | auto.columns # This lists the column names that are available 45 | auto.info() # This lists the column names as well as their data type. 
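# [Aside -- not part of the original solution; two more quick "get familiar" checks
# that pair naturally with shape/columns/info, hedged as optional extras.]
auto.head()            # peek at the first few rows
auto.isnull().sum()    # count missing values in each column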
46 | 47 | # You can infer the range from the information available in describe 48 | auto.describe() # This will give you the five number summary for all numeric variables 49 | auto.min(numeric_only=True) # This will give you all of the minimums for numeric variables 50 | auto.max(numeric_only=True) # This will give you all of the maximums for numeric variables 51 | # You can calculate the range with the above info as shown below. 52 | auto.max(numeric_only=True) - auto.min(numeric_only=True) # Range 53 | 54 | auto.mean() # Means for all numeric variables 55 | auto.median() # Medians for all numeric variables 56 | # How much greater is the mean than the median? 57 | auto.mean() - auto.median() 58 | # The means are somewhat greater than the medians. 59 | 60 | 61 | ''' 62 | Part 3 63 | Use the data to answer the following questions: 64 | - Which 5 cars get the best gas mileage? 65 | - Which 5 cars with more than 4 cylinders get the best gas mileage? 66 | - Which 5 cars get the worst gas mileage? 67 | - Which 5 cars with 4 or fewer cylinders get the worst gas mileage? 68 | ''' 69 | 70 | # 5 cars that get best gas mileage 71 | auto.sort_index(by='mpg', ascending=False)[0:5][['car_name','mpg']] 72 | 73 | # 5 cars with more than 4 cylinders that get the best gas mileage 74 | auto[auto.cylinders > 4].sort_index(by='mpg', ascending=False)[0:5][['car_name','mpg']] 75 | 76 | # 5 cars that get worst gas mileage 77 | auto.sort_index(by='mpg')[0:5][['car_name','mpg']] 78 | 79 | # 5 cars with 4 or fewer cylinders that get the worst gas mileage 80 | auto[auto.cylinders > 4].sort_index(by='mpg')[0:5][['car_name','mpg']] 81 | 82 | 83 | ''' 84 | Part 4 85 | Use groupby and aggregations to explore the relationships 86 | between mpg and the other variables. Which variables seem to have the greatest 87 | effect on mpg? 88 | Some examples of things you might want to look at are: 89 | - What is the mean mpg for cars for each number of cylindres (i.e. 3 cylinders, 90 | 4 cylinders, 5 cylinders, etc)? 91 | - Did mpg rise or fall over the years contained in this dataset? 92 | - What is the mpg for the group of lighter cars vs the group of heaver cars? 93 | Note: Be creative in the ways in which you divide up the data. You are trying 94 | to create segments of the data using logical filters and comparing the mpg 95 | for each segment of the data. 96 | ''' 97 | 98 | # Mean mpg for cars for each number of cylinders 99 | auto.groupby(by='cylinders').mpg.mean() 100 | 101 | # Mpg usually rose over the years contianed in this dataset 102 | auto.groupby(by='model_year').mpg.mean() 103 | 104 | # The mpg for the gorup of lighter cars vs the group of heavier cars 105 | # We can divide the dataset in half by the median (the lower half being the 106 | # lighter cars and the upper half being the heavier cars). 107 | auto[auto.weight <= auto.weight.median()].mpg.mean() # light cars mean mpg 108 | auto[auto.weight > auto.weight.median()].mpg.mean() # heavier cars mean mpg 109 | # It appears that the lighter cars get better gas mileage than the heavier cars 110 | 111 | # This question was pretty open ended, but here are some other things you could have looked at 112 | 113 | # The average mpg for the four quartiles of displacement 114 | # We didn't talk about the 'quantile' function in class, but it's a useful one! 
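# [Aside -- not part of the original solution; in case 'quantile' is unfamiliar, a
# one-line hedged sketch: it returns the value below which the given fraction of the
# data falls.]
pd.Series([1, 2, 3, 4]).quantile(0.25)   # 1.75, the 25th-percentile cut point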
115 | auto[auto.displacement <= auto.displacement.quantile(0.25)].mpg.mean() 116 | auto[(auto.displacement > auto.displacement.quantile(0.25)) & (auto.displacement <= auto.displacement.quantile(0.50))].mpg.mean() 117 | auto[(auto.displacement > auto.displacement.quantile(0.50)) & (auto.displacement <= auto.displacement.quantile(0.75))].mpg.mean() 118 | auto[auto.displacement > auto.displacement.quantile(0.75)].mpg.mean() 119 | # It appears that as engine displacement (size) increases, the average mpg decreases. This makes sense. 120 | 121 | # Instead of using the somewhat complicated logic of the 'quantile', you can easily divide your dataset 122 | # into buckets using the `cut` function. 123 | auto.groupby(pd.cut(auto.horsepower,5)).mpg.mean() 124 | # It appears that as horsepower increases, the average mpg decreases. This makes sense. 125 | 126 | auto.groupby(pd.cut(auto.acceleration, 5)).mpg.mean() 127 | # It appears that as acceleration increases, the average mpg increases. 128 | 129 | 130 | ''' 131 | I'll also include something I found particularly cool from Lloyd's homework. 132 | He wanted to look at how MPG has changed over time, but he also wanted to consider 133 | how specific groups have changed. He wanted to look at low, mid, and high power 134 | cars based upon their horsepower and see how these groups have changed over time. 135 | His code is below. In his data, he called the original dataset 'auto'. 136 | ''' 137 | # Now to look at how efficency has changed over time based on power and weight classes, 138 | # two things that we know play a large role in gas mileage. First, we create a table of 139 | # efficeincy by power class and year. 140 | 141 | horsey = pd.DataFrame() 142 | 143 | # Defines low power as below 100 horsepower 144 | horsey['low_power'] = auto[(auto.horsepower < 100)].groupby('model_year').mpg.mean() 145 | 146 | # Defines mid power as between 100 and 150 (inclusive) horsepower 147 | horsey['mid_power'] = auto[(auto.horsepower >= 100) & (auto.horsepower <= 150)].groupby('model_year').mpg.mean() 148 | 149 | # Defines high power as above 150 horsepower 150 | horsey['high_power'] = auto[auto.horsepower > 150].groupby('model_year').mpg.mean() 151 | ''' 152 | low_power mid_power high_power 153 | model_year 154 | 70 23.300000 18.333333 13.076923 155 | 71 26.357143 17.285714 13.333333 156 | 72 23.500000 15.000000 12.857143 157 | 73 22.166667 16.352941 12.727273 158 | 74 27.312500 15.500000 NaN 159 | 75 22.470588 17.500000 16.000000 160 | 76 25.750000 17.071429 15.500000 161 | 77 28.433333 18.100000 15.666667 162 | 78 28.363158 19.350000 17.700000 163 | 79 29.225000 20.266667 16.900000 164 | 80 34.516667 28.100000 NaN 165 | 81 31.372727 25.833333 NaN 166 | 82 32.607143 23.500000 NaN 167 | 168 | We can see from the data here that low power cars have seen much better gains in efficiency than 169 | mid or high power cars. I then wanted to see how much car weights have changed in that same time. 170 | ''' -------------------------------------------------------------------------------- /homework/04_visualization_hw_soln.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Visualization Homework Solution 3 | ''' 4 | 5 | ''' 6 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt) 7 | to complete the following parts. Please turn in your code for each part. 8 | Before each code chunk, give a brief description (one line) of what the code is 9 | doing (e.g. 
"Loads the data" or "Creates scatter plot of mpg and weight"). If 10 | the code output produces a plot or answers a question, give a brief 11 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for 12 | group A is higher than the mean for group B which means X,Y,Z"). 13 | ''' 14 | # Imports 15 | import pandas as pd 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | 19 | # Reads text file and uses '|' as separator 20 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|') 21 | 22 | ''' 23 | Part 1 24 | Produce a plot that compares the mean mpg for the different numbers of cylinders. 25 | ''' 26 | 27 | # The first part of creating this plot is to generate the appropriate data. 28 | # Since we want mean mpg FOR EACH number of cylinders, we should use a 'groupby'. 29 | auto.groupby('cylinders').mpg.mean() # Give us mean mpg for each cylinder count 30 | 31 | # Now that we have the data we want, we can think about how to plot it. 32 | # The keyword 'compare' indiciated that you probably want to use a bar chart. 33 | auto.groupby('cylinders').mpg.mean().plot(kind='bar') # Create plot from data 34 | plt.title("Comparing Mean MPG for Different Numbers of Cylinders") # Add title 35 | plt.xlabel("Number of Cylinders") # Add x label 36 | plt.ylabel("Average MPG") # Add y label 37 | plt.show() # Show plot 38 | # With the exception of the three cylinder car (of which there are only 4), 39 | # we can see that mean mpg decreases as number of cylinders increases. 40 | 41 | 42 | ''' 43 | Part 2 44 | Use a scatter matrix to explore relationships between different numeric variables. 45 | ''' 46 | pd.scatter_matrix(auto) # Generate scatter matrix 47 | pd.scatter_matrix(auto, c=auto.mpg) # Consider adding color to your scatter matrix too. 48 | plt.show() # Show plot 49 | ''' 50 | There are several things to notice here. First, we can talk about different 51 | variables' relationships with mpg. Looking across the top row, where mpg is on 52 | the y axis, we see that there is a clearly negative relationship between mpg 53 | and number of cylinders, displacement, horsepower, and weight. There is a 54 | clearly positive relationship between mpg and model_year. There is a vaguely 55 | positive relationships between mpg and acceleration, though it's not a very 56 | clear one. There also seems to be a weakly positive relationship between mpg 57 | and origin. 58 | 59 | There are also several other relationships you may notice: 60 | * Dipslacement and horsepower have a positive relationship. This makes sense, 61 | because horsepower should increase as the engine volume (displacement) gets 62 | larger. 63 | * Displacement and weight have a positive relationship. This makes sense, 64 | because heavier cars tend to need bigger engines. 65 | * Horsepower and weight have a positive relationship. This makes sense, 66 | because larger cars tend to have higher horsepower engines. 67 | 68 | There may be other inferences you could draw from this plot as well, but this 69 | demonstrates the usefulness of the scatter matrix in understanding your data 70 | visually. 71 | ''' 72 | 73 | 74 | ''' 75 | Part 3 76 | Use a plot to answer the following questions: 77 | ''' 78 | 79 | ''' 80 | -Do heavier or lighter cars get better mpg? 81 | ''' 82 | # Since we want to look at the relationship between two numeric variables, we 83 | # can use a scatterplot to see how they "move" with each other. 
84 | auto.plot(kind='scatter', x='weight', y='mpg', alpha=0.5) # Create scatter plot 85 | plt.title('Car MPG by Weight') 86 | plt.xlabel('Car weight') 87 | plt.ylabel('MPG') 88 | plt.show() 89 | # From the plot, it appears lighter cars get better mpg. As weight increase, 90 | # mpg decreases. 91 | 92 | ''' 93 | -How are horsepower and displacement related? 94 | ''' 95 | # Once again, since we want to look at the relationship between two numeric 96 | # variables, we can use a scatterplot. 97 | # Notice that I didn't specify whether displacement or horsepower should be on 98 | # the x-axis. However, using my (limited) domain expertise, I would think that 99 | # horsepower is affected by the displacement of the engine. So I put 100 | # displacement on the x-axis and horsepower on the y-axis. 101 | auto.plot(kind='scatter', x='displacement', y='horsepower', alpha=0.5) 102 | plt.title('Horsepower by Engine Displacement') 103 | plt.xlabel('Engine Displacement') 104 | plt.ylabel('Horsepower ') 105 | plt.show() 106 | # This plot shows that displacement and horsepower have a positive relationship. 107 | 108 | ''' 109 | -What does the distribution of acceleration look like? 110 | ''' 111 | # Since I'm interested in the distribution of acceleration, I can use a 112 | # histogram to investigate that. 113 | auto.acceleration.hist() 114 | plt.title('Distribution of Acceleration') 115 | plt.xlabel('Acceleration') 116 | plt.ylabel('Frequency') 117 | plt.show() 118 | # We can see that acceleration has an almost normal distribution. The most 119 | # frequent value of acceleration is around 16. The values of acceleration 120 | # range from 8 to 25. 121 | 122 | ''' 123 | -How is mpg spread for cars with different numbers of cylinders? 124 | ''' 125 | # Since we are interested in the spread (as in the range of different values) 126 | # for each of the different cylinder counts, we should use a boxplot as it 127 | # illustrates the spread of a numeric variable and accepts the "by" parameter, 128 | # which allows us to generate a plot for each value of a variable. 129 | auto.boxplot('mpg', by='cylinders') 130 | plt.title('Car MPG by Number of Cylinders') 131 | plt.xlabel('Number of Cylinders') 132 | plt.ylabel('MPG') 133 | plt.show() 134 | # This plot gives us a lot of information. I'll list a few things to notice: 135 | # * The range for 3 cylinders is pretty small, which might be because there are 136 | # 4 observations. 137 | # * As shown in our earlier plots, mpg decreases as number of cylinders increases. 138 | # * Interestingly, there are 4 cylinder cars that get relatively low gas mileage. 139 | # * Over half of the 4 cylinder cars get better mpg than all of the 8 cylinders cars. 140 | 141 | ''' 142 | -Do cars made before or after 1975 get better average mpg? (Hint: You need to 143 | create a new column that encodes whether a year is before or after 1975.) 144 | ''' 145 | # There are several different ways to do this one. The most straightforward 146 | # way could be to create a new column called 'before_1975' that contains a 147 | # 'Before 1975' or 'After 1975'. We'll include 1975 in 'After 1975'. 148 | auto['before_1975'] = np.where(auto.model_year < 75,'Before 1975', 'After 1975') 149 | # Remember that np.where is like the IF function in Excel: 150 | # np.where(, , ) 151 | 152 | # Now we can get the data we need by use a group by. 
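# [Aside -- not part of the original solution; a hedged one-liner if you want to see
# the two group means as numbers before plotting them.]
auto.groupby('before_1975').mpg.mean()   # mean mpg for 'After 1975' vs 'Before 1975'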
153 | auto.groupby('before_1975').mpg.mean().plot(kind='bar') 154 | plt.title('Average MPG Before and After 1975') 155 | plt.xlabel('') 156 | plt.ylabel('Average MPG') 157 | plt.show() 158 | 159 | # The labels are a little cut off, so you can use some extra matplotlib for 160 | # formatting. 'set_xticklabels' let's you set several things. 161 | auto.groupby('before_1975').mpg.mean().plot(kind='bar').set_xticklabels(['After 1975','Before 1975'], rotation=0) 162 | plt.title('Average MPG Before and After 1975') 163 | plt.xlabel('') 164 | plt.ylabel('Average MPG') 165 | plt.show() 166 | # We can see that the average mpg for cars after 1975 is higher. 167 | 168 | # This could have been done without creating the extra variable. 169 | auto.groupby(auto.model_year < 75).mpg.mean().plot(kind='bar').set_xticklabels(['After 1975','Before 1975'], rotation=0) 170 | plt.title('Average MPG Before and After 1975') 171 | plt.xlabel('') 172 | plt.ylabel('Average MPG') 173 | plt.show() 174 | # We get the same results but without the intermediate step. -------------------------------------------------------------------------------- /homework/06_bias_variance.md: -------------------------------------------------------------------------------- 1 | ## Class 6 Pre-work: Bias-Variance Tradeoff 2 | 3 | Read this excellent article, [Understanding the Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html), and be prepared to **discuss it in class** on Monday. 4 | 5 | **Note:** You can ignore sections 4.2 and 4.3. 6 | 7 | Here are some questions to think about while you read: 8 | * In the Party Registration example, what are the features? What is the response? Is this a regression or classification problem? 9 | * Conceptually, how is KNN being applied to this problem to make a prediction? 10 | * How do the four visualizations in section 3 relate to one another? Change the value of K using the slider, and make sure you understand what changed in the visualizations (and why it changed). 11 | * In figures 4 and 5, what do the lighter colors versus the darker colors mean? How is the darkness calculated? 12 | * What does the black line in figure 5 represent? What predictions would an ideal machine learning model make, with respect to this line? 13 | * Choose a very small value of K, and click the button "Generate New Training Data" a number of times. Do you "see" low variance or high variance? Do you "see" low bias or high bias? 14 | * Repeat this with a very large value of K. Do you "see" low variance or high variance? Do you "see" low bias or high bias? 15 | * Try using other values of K. What value of K do you think is "best"? How do you define "best"? 16 | * Why should we care about variance at all? Shouldn't we just minimize bias and ignore variance? 17 | * Does a high value for K cause "overfitting" or "underfitting"? 18 | -------------------------------------------------------------------------------- /homework/07_glass_identification.md: -------------------------------------------------------------------------------- 1 | ## Class 7 Homework: Glass Identification 2 | 3 | Let's practice what we have learned using the [Glass Identification dataset](http://archive.ics.uci.edu/ml/datasets/Glass+Identification). 4 | 5 | 1. Read the data into a DataFrame. 6 | 2. Briefly explore the data to make sure the DataFrame matches your expectations. 7 | 3. Let's convert this into a binary classification problem. Create a new DataFrame column called "binary": 8 | * If type of glass = 1/2/3/4, binary = 0. 
9 | * If type of glass = 5/6/7, binary = 1. 10 | 4. Create a feature matrix "X". (Think carefully about which columns are actually features!) 11 | 5. Create a response vector "y" from the "binary" column. 12 | 6. Split X and y into training and testing sets. 13 | 7. Fit a KNN model on the training set using K=5. 14 | 8. Make predictions on the testing set and calculate accuracy. 15 | 9. Calculate the "null accuracy", which is the classification accuracy that could be achieved by always predicting the majority class. 16 | 17 | **Bonus:** 18 | * Write a for loop that computes the test set accuracy for a range of K values. 19 | * Plot K versus test set accuracy to help you choose an optimal value for K. 20 | -------------------------------------------------------------------------------- /homework/11_roc_auc.md: -------------------------------------------------------------------------------- 1 | ## Class 11 Pre-work: ROC Curves and Area Under the Curve (AUC) 2 | 3 | Before learning about ROC curves, it's important to be comfortable with the following terms: true positive, true negative, false positive, false negative, sensitivity, and specificity. If you aren't yet comfortable, Rahul Patwari has excellent videos on [Intuitive sensitivity and specificity](https://www.youtube.com/watch?v=U4_3fditnWg) (9 minutes) and [The tradeoff between sensitivity and specificity](https://www.youtube.com/watch?v=vtYDyGGeQyo) (13 minutes). 4 | 5 | Then, watch Kevin's video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Wednesday. (There's a blog post containing the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/), which might serve as a useful reference.) You can also play with the [visualization](http://www.navan.name/roc/) shown in the video. Optionally, you could also watch Rahul Patwari's video on [ROC curves](https://www.youtube.com/watch?v=21Igj5Pr6u4) (12 minutes). 6 | 7 | Here are some questions to think about: 8 | 9 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions? 10 | - What are the methods in scikit-learn that output predicted probabilities and class predictions? 11 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve? 12 | - Could you use an ROC curve for a regression problem? Why or why not? 13 | - What's another term for True Positive Rate? 14 | - If I wanted to increase specificity, how would I change the classification threshold? 15 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not? 16 | - What are the primary benefits of ROC curves over classification accuracy? 17 | - What should you do if your AUC is 0.2? 18 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? (0 = not fraudulent, 1 = fraudulent) 19 | - Let's say your classifier has a sensitivity of 0.95 and a specificity of 0.3, and the classes are balanced. Would it result in more false positives or false negatives? 20 | - What's a real-world scenario in which you would prefer a high specificity (rather than a high sensitivity) for your classifier? 
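A minimal, hedged sketch (not part of the assigned pre-work) of the scikit-learn calls several of these questions refer to. It assumes you already have a fitted classifier `clf` and held-out data `X_test` / `y_test`:

```python
# predicted probabilities of the positive class, then class predictions at a 0.5 threshold
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.5).astype(int)

# points on the ROC curve and the area under it
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
metrics.roc_auc_score(y_test, y_prob)
```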
21 | -------------------------------------------------------------------------------- /homework/11_roc_auc_annotated.md: -------------------------------------------------------------------------------- 1 | ## Class 11 Pre-work: ROC Curves and Area Under the Curve (AUC) 2 | 3 | Before learning about ROC curves, it's important to be comfortable with the following terms: true positive, true negative, false positive, false negative, sensitivity, and specificity. If you aren't yet comfortable, Rahul Patwari has excellent videos on [Intuitive sensitivity and specificity](https://www.youtube.com/watch?v=U4_3fditnWg) (9 minutes) and [The tradeoff between sensitivity and specificity](https://www.youtube.com/watch?v=vtYDyGGeQyo) (13 minutes). 4 | 5 | Then, watch Kevin's video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Wednesday. (There's a blog post containing the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/), which might serve as a useful reference.) You can also play with the [visualization](http://www.navan.name/roc/) shown in the video. Optionally, you could also watch Rahul Patwari's video on [ROC curves](https://www.youtube.com/watch?v=21Igj5Pr6u4) (12 minutes). 6 | 7 | Here are some questions to think about: 8 | 9 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions? 10 | - Set a threshold, and classify everything above the threshold as a 1 and everything below the threshold as a 0. 11 | - What are the methods in scikit-learn that output predicted probabilities and class predictions? 12 | - predict_proba and predict. 13 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve? 14 | - Because an ROC curve is measuring the performance of a classifier at all possible thresholds, and thresholds only make sense in the context of predicted probabilities. 15 | - Could you use an ROC curve for a regression problem? Why or why not? 16 | - No, because ROC is a plot of TPR vs FPR, and those concepts have no meaning in a regression problem. 17 | - What's another term for True Positive Rate? 18 | - Sensitivity or recall. 19 | - If I wanted to increase specificity, how would I change the classification threshold? 20 | - Increase it. 21 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not? 22 | - No, because increasing either of those requires moving the threshold in opposite directions. 23 | - What are the primary benefits of ROC curves over classification accuracy? 24 | - Doesn't require setting a classification threshold, allows you to visualize the performance of your classifier, works well for unbalanced classes. 25 | - What should you do if your AUC is 0.2? 26 | - Reverse your predictions so that your AUC is 0.8. 27 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? (0 = not fraudulent, 1 = fraudulent) 28 | - Blues would be significantly larger, lots of overlap between blues and reds. 29 | - Let's say your classifier has a sensitivity of 0.95 and a specificity of 0.3, and the classes are balanced. Would it result in more false positives or false negatives? 
30 | - False positives, meaning it falsely predicted positive (the true status is negative). 31 | - What's a real-world scenario in which you would prefer a high specificity (rather than a high sensitivity) for your classifier? 32 | - Speed cameras issuing speeding tickets. 33 | -------------------------------------------------------------------------------- /homework/13_spam_filtering.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: Spam Filtering 2 | 3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html) and be prepared to **discuss it in class on Wednesday**. 4 | 5 | Here are some questions to think about while you read: 6 | * Should a spam filter optimize for sensitivity or specificity, in Paul's opinion? 7 | * Before he tried the "statistical approach" to spam filtering, what was his approach? 8 | * What are the key components of his statistical filtering system? In other words, how does it work? 9 | * What did Paul say were some of the benefits of the statistical approach? 10 | * How good was his prediction of the "spam of the future"? 11 | -------------------------------------------------------------------------------- /homework/13_spam_filtering_annotated.md: -------------------------------------------------------------------------------- 1 | ## Class 13 Pre-work: Spam Filtering 2 | 3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html) and be prepared to **discuss it in class on Wednesday**. 4 | 5 | Here are some questions to think about while you read: 6 | * Should a spam filter optimize for sensitivity or specificity, in Paul's opinion? 7 | * specificity to minimize false positives 8 | * Before he tried the "statistical approach" to spam filtering, what was his approach? 9 | * hand engineering features and computing a "score" 10 | * What are the key components of his statistical filtering system? In other words, how does it work? 11 | * scan the entire text (including headers) and tokenize it 12 | * count number of occurrences of each token in ham corpus and spam corpus 13 | * assign each token a spam score based upon its relative frequency 14 | * for new mail, only take 15 most interesting tokens into account 15 | * What did Paul say were some of the benefits of the statistical approach? 16 | * it works better (almost no false positives) 17 | * less work for him because it discovers features automatically 18 | * you know what the "score" means 19 | * can easily be tuned to the individual user 20 | * evolves with the spam 21 | * creates an implicit whitelist/blacklist of email addresses, server names, etc. 22 | * How good was his prediction of the "spam of the future"? 23 | * great! 
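To make the mechanics concrete, here is a toy sketch of the token-scoring idea described above. It is not Paul Graham's actual implementation: the whitespace tokenizer, the neutral 0.5 score for unseen tokens, and the simple averaging of per-token scores are all simplified assumptions (Graham combines the probabilities using Bayes' rule).

```python
def spam_score(token, spam_counts, ham_counts, n_spam, n_ham):
    # score a single token by its relative frequency in the spam and ham corpora
    spam_freq = float(spam_counts.get(token, 0)) / n_spam
    ham_freq = float(ham_counts.get(token, 0)) / n_ham
    if spam_freq + ham_freq == 0:
        return 0.5  # token never seen before: treat it as neutral
    return spam_freq / (spam_freq + ham_freq)

def classify(text, spam_counts, ham_counts, n_spam, n_ham, n_interesting=15):
    # tokenize the new message and score each token
    scores = [spam_score(t, spam_counts, ham_counts, n_spam, n_ham)
              for t in text.lower().split()]
    if not scores:
        return 'ham'  # empty message: default to ham
    # the "most interesting" tokens are the ones furthest from the neutral 0.5
    interesting = sorted(scores, key=lambda p: abs(p - 0.5), reverse=True)[:n_interesting]
    combined = sum(interesting) / len(interesting)
    return 'spam' if combined > 0.5 else 'ham'
```

Here `spam_counts` and `ham_counts` are assumed to be dictionaries mapping each token to its number of occurrences in the spam and ham corpora, and `n_spam` and `n_ham` are the number of messages in each corpus.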
24 | -------------------------------------------------------------------------------- /notebooks/11_titanic_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:557342100eb7ce91ca76e7e4f24737943f3625640543427282904a15759174c8" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Titanic Exercise" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import pandas as pd\n", 23 | "from sklearn.cross_validation import train_test_split, cross_val_score\n", 24 | "from sklearn.linear_model import LogisticRegression\n", 25 | "from sklearn import metrics\n", 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline" 29 | ], 30 | "language": "python", 31 | "metadata": {}, 32 | "outputs": [] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Preparing the data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Read in the data and look at the first 10 rows." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [], 52 | "language": "python", 53 | "metadata": {}, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "Check for missing values." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "collapsed": false, 66 | "input": [], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We are going to focus on Pclass, Sex, Age, and Embarked:\n", 76 | "\n", 77 | "- **Pclass:** leave as-is\n", 78 | "- **Sex:** convert \"male\" to 0 and \"female\" to 1\n", 79 | "- **Age:** fill in missing values using the mean\n", 80 | "- **Embarked:** create dummy variables" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "collapsed": false, 86 | "input": [], 87 | "language": "python", 88 | "metadata": {}, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Create X and y using the features we have chosen." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [], 102 | "language": "python", 103 | "metadata": {}, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Train/Test Split" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Split X and y into training and testing sets." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Logistic Regression" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "Fit a logistic regression model on the training data." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "collapsed": false, 145 | "input": [], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Print the model's intercept." 
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "collapsed": false, 160 | "input": [], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Print the model's coefficients. How do we interpret them?" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [], 176 | "language": "python", 177 | "metadata": {}, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Predict the probability of survival for the first person in X_train using scikit-learn." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "collapsed": false, 190 | "input": [], 191 | "language": "python", 192 | "metadata": {}, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Do this same calculation manually." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "collapsed": false, 205 | "input": [], 206 | "language": "python", 207 | "metadata": {}, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Pretend this person was 10 years older, and calculate their probability of survival (manually)." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "collapsed": false, 220 | "input": [], 221 | "language": "python", 222 | "metadata": {}, 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "Pretend this person was a woman, and calculate their probability of survival (manually)." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "collapsed": false, 235 | "input": [], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Model Evaluation" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Make predictions on the testing data and calculate the accuracy." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "collapsed": false, 257 | "input": [], 258 | "language": "python", 259 | "metadata": {}, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "Compare this to the null accuracy." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "collapsed": false, 272 | "input": [], 273 | "language": "python", 274 | "metadata": {}, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Print the confusion matrix. Does this model tend towards specificity or sensitivity?" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "collapsed": false, 287 | "input": [], 288 | "language": "python", 289 | "metadata": {}, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Calculate the specificity and the sensitivity." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "collapsed": false, 302 | "input": [], 303 | "language": "python", 304 | "metadata": {}, 305 | "outputs": [] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "Change the threshold to make the model more sensitive, then print the new confusion matrix." 
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "collapsed": false, 317 | "input": [], 318 | "language": "python", 319 | "metadata": {}, 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Recalculate the specificity and the sensitivity." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "collapsed": false, 332 | "input": [], 333 | "language": "python", 334 | "metadata": {}, 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Plot the ROC curve. How can we interpret the results?" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "collapsed": false, 347 | "input": [], 348 | "language": "python", 349 | "metadata": {}, 350 | "outputs": [] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "Calculate the AUC." 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "collapsed": false, 362 | "input": [], 363 | "language": "python", 364 | "metadata": {}, 365 | "outputs": [] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "## Cross-Validation" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "Use cross-validation to check the AUC for the current model." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "collapsed": false, 384 | "input": [], 385 | "language": "python", 386 | "metadata": {}, 387 | "outputs": [] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "Remove Embarked from the model and check AUC again using cross-validation." 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "collapsed": false, 399 | "input": [], 400 | "language": "python", 401 | "metadata": {}, 402 | "outputs": [] 403 | } 404 | ], 405 | "metadata": {} 406 | } 407 | ] 408 | } -------------------------------------------------------------------------------- /notebooks/13_naive_bayes_spam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:b7e3a62e1216c53fa3e7d0fa56c5373dfe7f58c3817a1468b0abbc52dfe7b6a7" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Applying Naive Bayes classification to spam email" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Let's pretend we have an email with three words: \"Send money now.\" We want to classify that email as ham or spam.\n", 23 | "\n", 24 | "We'll use Naive Bayes classification:\n", 25 | "\n", 26 | "$$P(spam | \\text{send money now}) = \\frac {P(\\text{send money now} | spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 27 | "\n", 28 | "By assuming that the features (the words) are conditionally independent, we can simplify the likelihood function:\n", 29 | "\n", 30 | "$$P(spam | \\text{send money now}) \\approx \\frac {P(\\text{send} | spam) \\times P(\\text{money} | spam) \\times P(\\text{now} | spam) \\times P(spam)} {P(\\text{send money now})}$$\n", 31 | "\n", 32 | "We could calculate all of the values in the numerator by examining a corpus of spam:\n", 33 | "\n", 34 | "$$P(spam | \\text{send money now}) \\approx \\frac {0.2 \\times 0.1 \\times 0.1 \\times 0.9} {P(\\text{send money now})} = \\frac {0.0018} {P(\\text{send money now})}$$\n", 35 | "\n", 36 
| "We could repeat this process to calculate the probability that the email is ham:\n", 37 | "\n", 38 | "$$P(ham | \\text{send money now}) \\approx \\frac {0.05 \\times 0.01 \\times 0.1 \\times 0.1} {P(\\text{send money now})} = \\frac {0.000005} {P(\\text{send money now})}$$\n", 39 | "\n", 40 | "All we care about is whether spam or ham has the higher probability, and so we predict that the email is spam.\n", 41 | "\n", 42 | "What have we learned from this exercise?\n", 43 | "\n", 44 | "- The \"naive\" assumption of Naive Bayes (that the features are conditionally independent) is critical to making these calculations simple.\n", 45 | "- The normalization constant (the denominator) can be ignored since it's the same for all classes.\n", 46 | "- The prior probability is basically irrelevant once you have a lot of features.\n", 47 | "- The Naive Bayes classifier can handle a lot of irrelevant features." 48 | ] 49 | } 50 | ], 51 | "metadata": {} 52 | } 53 | ] 54 | } -------------------------------------------------------------------------------- /notebooks/18_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:283eafa4edacfbbb8b51d404c8feab98319104a044aaa4138d97957373762033" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Overfitting, revisited\n", 16 | "\n", 17 | "What is overfitting? Here are a few ways of explaining it:\n", 18 | "\n", 19 | "- Building a model that matches the training set too closely.\n", 20 | "- Building a model that does well on the training data, but doesn't generalize to out-of-sample data.\n", 21 | "- Learning from the noise in the data, rather than just the signal." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Overfitting\n", 29 | "\n", 30 | "" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Underfitting vs Overfitting\n", 38 | "" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "**What are some ways to overfit the data?**\n", 46 | "\n", 47 | "- Train and test on the same data\n", 48 | "- Create a model that is overly complex (one that doesn't generalize well)\n", 49 | " - Example: KNN in which K is too low\n", 50 | " - Example: Decision tree that is grown too deep\n", 51 | "\n", 52 | "An overly complex model has **low bias** but **high variance**." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Linear Regression, revisited\n", 60 | "\n", 61 | "**Question:** Are linear regression models high bias/low variance, or low bias/high variance?\n", 62 | "\n", 63 | "**Answer:** High bias/low variance (generally speaking)\n", 64 | "\n", 65 | "Great! So as long as we don't train and test on the same data, we don't have to worry about overfitting, right? Not so fast...." 
66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Overfitting with Linear Regression (part 1)\n", 73 | "\n", 74 | "Linear models can overfit if you include irrelevant features.\n", 75 | "\n", 76 | "**Question:** Why would that be the case?\n", 77 | "\n", 78 | "**Answer:** Because it will learn a coefficient for any feature you feed into the model, regardless of whether that feature has the signal or the noise.\n", 79 | "\n", 80 | "This is especially a problem when **p (number of features) is close to n (number of observations)**, because that model will naturally have high variance." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Overfitting with Linear Regression (part 2)\n", 88 | "\n", 89 | "Linear models can also overfit when the included features are highly correlated. From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares):\n", 90 | "\n", 91 | "> \"...coefficient estimates for Ordinary Least Squares rely on the independence of the model terms. When terms are correlated and the columns of the design matrix X have an approximate linear dependence, the design matrix becomes close to singular and as a result, the least-squares estimate becomes highly sensitive to random errors in the observed response, producing a large variance.\"" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### Overfitting with Linear Regression (part 3)\n", 99 | "\n", 100 | "Linear models can also overfit if the coefficients are too large.\n", 101 | "\n", 102 | "**Question:** Why would that be the case?\n", 103 | "\n", 104 | "**Answer:** Because the larger the absolute value of the coefficient, the more power it has to change the predicted response. Thus it tends toward high variance, which can result in overfitting." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Regularization\n", 112 | "\n", 113 | "Regularization is a method for \"constraining\" or \"regularizing\" the size of the coefficients, thus \"shrinking\" them towards zero. It tends to reduce variance more than it increases bias, and thus minimizes overfitting.\n", 114 | "\n", 115 | "Common regularization techniques for linear models:\n", 116 | "\n", 117 | "- **Ridge regression** (also known as \"L2 regularization\"): shrinks coefficients toward zero (but they never reach zero)\n", 118 | "- **Lasso regularization** (also known as \"L1 regularization\"): shrinks coefficients all the way to zero\n", 119 | "- **ElasticNet regularization**: balance between Ridge and Lasso\n", 120 | "\n", 121 | "Lasso regularization is useful if we believe many features are irrelevant, since a feature with a zero coefficient is essentially removed from the model. Thus, it is a useful technique for feature selection.\n", 122 | "\n", 123 | "How does regularization work?\n", 124 | "\n", 125 | "- A tuning parameter alpha (or sometimes lambda) imposes a penalty on the size of coefficients.\n", 126 | "- Instead of minimizing the \"loss function\" (mean squared error), it minimizes the \"loss plus penalty\".\n", 127 | "- A tiny alpha imposes no penalty on the coefficient size, and is equivalent to a normal linear model.\n", 128 | "- Increasing the alpha penalizes the coefficients and shrinks them toward zero." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Bias-variance trade-off\n", 136 | "\n", 137 | "Our goal is to locate the optimum model complexity, and thus regularization is useful when we believe our model is too complex.\n", 138 | "\n", 139 | "" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### Standardizing features\n", 147 | "\n", 148 | "It's usually recommended to standardize your features when using regularization.\n", 149 | "\n", 150 | "**Question:** Why would that be the case?\n", 151 | "\n", 152 | "**Answer:** If you don't standardize, features would be penalized simply because of their scale. Also, standardizing avoids penalizing the intercept (which wouldn't make intuitive sense)." 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Ridge vs Lasso Coefficient Plots\n", 160 | "\n", 161 | "Below is a visualization of what happens when you apply regularization. The general idea is that you are restricting the \"space\" in which your coefficients can be fit. This means you are shrinking the coefficient space. You still want the coefficients that give the \"best\" model (as determined by your metric, e.g. RMSE, accuracy, AUC, etc.), but you are restricting the area in which you can evaluate coefficients.\n", 162 | "\n", 163 | "In this specific image, we are fitting a model with two predictors, B1 and B2. The x-axis shows B1 and the y-axis shows B2. There is a third dimension here, our evaluation metric. For the sake of example, we can assume this is linear regression, so we are trying to minimize our Root Mean Squared Error (RMSE). B-hat represents the set of coefficients, B1 and B2, where RMSE is minimized. While this is the \"best\" model according to our criterion, we've imposed a penalty that restricts the coefficients to the blue box. So we want to find the point (representing two coefficients B1 and B2) where RMSE is minimized within our blue box. Technically, the RMSE will be higher here, but it will be the lowest within our penalized box.
Due to the shape or space for the regression problem and the shape of our penalty box, many of the \"optimal\" coefficients will be close to zero for Ridge Regression and exactly zero for LASSO Regression.\n", 164 | "\n", 165 | "" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Ridge vs Lasso path diagrams\n", 173 | "\n", 174 | "Larger alpha (on the left here) means more regularization, which means more coefficients close to zero.\n", 175 | "" 176 | ] 177 | } 178 | ], 179 | "metadata": {} 180 | } 181 | ] 182 | } -------------------------------------------------------------------------------- /notebooks/images/18_bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_bias_variance.png -------------------------------------------------------------------------------- /notebooks/images/18_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_overfitting.png -------------------------------------------------------------------------------- /notebooks/images/18_ridge_lasso_path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_ridge_lasso_path.png -------------------------------------------------------------------------------- /notebooks/images/18_ridge_lasso_regression_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_ridge_lasso_regression_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/18_underfitting_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_underfitting_overfitting.png -------------------------------------------------------------------------------- /notebooks/images/cross_validation_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/cross_validation_diagram.png -------------------------------------------------------------------------------- /notebooks/images/cross_validation_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/cross_validation_example.png -------------------------------------------------------------------------------- /notebooks/images/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/estimating_coefficients.png -------------------------------------------------------------------------------- /notebooks/images/obama_clinton_tree.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/obama_clinton_tree.jpg -------------------------------------------------------------------------------- /notebooks/images/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/overfitting.png -------------------------------------------------------------------------------- /notebooks/images/r_squared.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/r_squared.png -------------------------------------------------------------------------------- /notebooks/images/salary_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_color.png -------------------------------------------------------------------------------- /notebooks/images/salary_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_regions.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_tree.png -------------------------------------------------------------------------------- /notebooks/images/salary_tree_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_tree_annotated.png -------------------------------------------------------------------------------- /notebooks/images/salary_unpruned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_unpruned.png -------------------------------------------------------------------------------- /notebooks/images/slope_intercept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/slope_intercept.png -------------------------------------------------------------------------------- /notebooks/images/train_test_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/train_test_split.png -------------------------------------------------------------------------------- /notebooks/images/training_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/training_error.png -------------------------------------------------------------------------------- /notebooks/images/tree_titanic.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_titanic.png -------------------------------------------------------------------------------- /notebooks/images/tree_vehicles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_vehicles.png -------------------------------------------------------------------------------- /notebooks/images/tree_vs_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_vs_linear.png -------------------------------------------------------------------------------- /notebooks/images/underfitting_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/underfitting_overfitting.png -------------------------------------------------------------------------------- /other/peer_review.md: -------------------------------------------------------------------------------- 1 | ## Peer Review Guidelines 2 | 3 | You will be assigned to review the project drafts of two of your peers. You will have one week to provide them with feedback. 4 | 5 | Expectations: 6 | * Read everything they wrote! 7 | * If they provided their data, review it and try to understand it. 8 | * Read their code and try to understand their thought process. 9 | * If their code can be run, try running it. 10 | * Spend at least one hour reviewing their project (including the time it takes to write the feedback). 11 | 12 | Your feedback would ideally consist of: 13 | * Strengths of their project (things you particularly like about it) 14 | * Comments about things you think could be improved 15 | * Questions about things you don't understand 16 | * Comments about their code 17 | * Suggestions for next steps 18 | * Guiding principle: Give feedback that would be helpful to you if it was your project! 19 | 20 | You should take a quick glance through their project as soon as possible, to make sure you understand what they have given you and what files you should be reviewing. If you're unclear, ask them about it! 21 | -------------------------------------------------------------------------------- /other/project.md: -------------------------------------------------------------------------------- 1 | # Course Project 2 | 3 | 4 | ## Overview 5 | 6 | The final project should represent significant original work applying data science techniques to an interesting problem. Final projects are individual attainments, but you should be talking frequently with your instructors and classmates about them. 7 | 8 | Address a data-related problem in your professional field or a field you're interested in. Pick a subject that you're passionate about; if you're strongly interested in the subject matter it'll be more fun for you and you'll produce a better project! 9 | 10 | To stimulate your thinking, here is an excellent list of [public data sources](public_data.md). Using public data is the most common choice. If you have access to private data, that's also an option, though you'll have to be careful about what results you can release. 
You are also welcome to compete in a [Kaggle competition](http://www.kaggle.com/) as your project, in which case the data will be provided to you. 11 | 12 | You should also take a look at [past projects](https://github.com/justmarkham/DAT-project-examples) from other GA Data Science students, to get a sense of the variety and scope of projects. 13 | 14 | 15 | ## Project Milestones 16 | 17 | 18 | ### March 30: Deadline for Discussing Project Ideas with an Instructor 19 | 20 | By March 30, you should talk with one of your instructors about your project idea(s). They can help you to choose between different project ideas, advise you on the appropriate scope for your project, and ensure that your project question might reasonably be answerable using the data science tools and techniques taught in the course. 21 | 22 | 23 | ### April 6: Project Question and Dataset 24 | 25 | Create a GitHub repository for your project. It should include a document that answers these questions: 26 | 27 | What is the question you hope to answer? What data are you planning to use to answer that question? What do you know about the data so far? Why did you choose this topic? 28 | 29 | Example: 30 | * I'm planning to predict passenger survival on the Titanic. 31 | * I have Kaggle's Titanic dataset with 10 passenger characteristics. 32 | * I know that many of the fields have missing values, that some of the text fields are messy and will require cleaning, and that about 38% of the passengers in the training set survive. 33 | * I chose this topic because I'm fascinated by the history of the Titanic. 34 | 35 | 36 | ### April 27: Project Presentation #1: Data Exploration and Analysis Plan 37 | 38 | You'll be giving a presentation to the class about the work you have done so far, as well as your plans for the project going forward. Your presentation should use slides (or a similar format). Your slides, exploratory code, and visualizations should be included in your GitHub repository. Here are some questions that you should address in your presentation: 39 | 40 | What data have you gathered, and how did you gather it? What steps have you taken to explore the data? Which areas of the data have you cleaned, and which areas still need cleaning? What insights have you gained from your exploration? Will you be able to answer your question with this data, or do you need to gather more data (or adjust your question)? How might you use modeling to answer your question? 41 | 42 | Example: 43 | * I've created visualizations and numeric summaries to explore how survivability differs by passenger characteristic, and it appears that gender and class have a large role in determining survivability. 44 | * I estimated missing values for age using the titles provided in the Name column. 45 | * I created features to represent "spouse on board" and "child on board" by further analyzing names. 46 | * I think that the fare and ticket columns might be useful for predicting survival, but I still need to clean those columns. 47 | * I analyzed the differences between the training and testing sets, and found that the average fare was slightly higher in the testing set. 48 | * Since I'm predicting a binary outcome, I plan to use a classification method such as logistic regression to make my predictions. 
49 | 50 | 51 | ### May 18: First Draft Due 52 | 53 | **At a minimum**, your project repository on GitHub should contain: 54 | * A draft of your project paper (in the format specified [below](#june-3-project-presentation-2)) 55 | * Code, with lots of comments 56 | * Visualizations of your data 57 | 58 | **Ideally**, you would also include: 59 | * Draft slides for presentation #2 60 | * Data and data dictionary 61 | 62 | Your peers and instructors will provide feedback by May 25, according to [these guidelines](peer_review.md). 63 | 64 | **Tips for success:** 65 | * The work should stand "on its own", and should not depend upon the reader remembering your first project presentation. 66 | * The better you explain your project, and the easier it is to follow, the more useful feedback you will receive! 67 | * If your reviewers can actually run your code on the provided data, they will be able to give you more useful feedback on your code. (It can be very hard to make useful code suggestions on code that can't be run!) 68 | 69 | 70 | ### June 3: Project Presentation #2 71 | 72 | Your **project paper** should be written with a technical audience in mind. Here are the components you should cover: 73 | 74 | * Problem statement and hypothesis 75 | * Description of your data set and how it was obtained 76 | * Description of any pre-processing steps you took 77 | * What you learned from exploring the data, including visualizations 78 | * How you chose which features to use in your analysis 79 | * Details of your modeling process, including how you selected your models and validated them 80 | * Your challenges and successes 81 | * Possible extensions or business applications of your project 82 | * Conclusions and key learnings 83 | 84 | Your **presentation** should cover these components with less breadth and depth. Focus on creating an engaging, clear, and informative presentation that tells the story of your project and is suitable for a non-technical audience. 85 | 86 | Your project repository on GitHub should contain the following: 87 | 88 | * **Project paper:** any format (PDF, Markdown, etc.) 89 | * **Presentation slides:** any format (PDF, PowerPoint, Google Slides, etc.) 90 | * **Code:** commented Python scripts, and any other code you used in the project 91 | * **Visualizations:** integrated into your paper and/or slides 92 | * **Data:** data files in "raw" or "processed" format 93 | * **Data dictionary (aka "code book"):** description of each variable, including units 94 | 95 | If it's not possible or practical to include your entire dataset, you should link to your data source and provide a sample of the data. (GitHub has a [size limit](https://help.github.com/articles/what-is-my-disk-quota/) of 100 MB per file and 1 GB per repository.) If your data is private, you can either include an "anonymized" version of your data or create a private GitHub repository. 
96 | -------------------------------------------------------------------------------- /other/public_data.md: -------------------------------------------------------------------------------- 1 | ## Public Data Sources 2 | 3 | * Open data catalogs from various governments and NGOs: 4 | * [NYC Open Data](https://nycopendata.socrata.com/) 5 | * [DC Open Data Catalog](http://data.dc.gov/) / [OpenDataDC](http://www.opendatadc.org/) 6 | * [DataLA](https://data.lacity.org/) 7 | * [data.gov](https://www.data.gov/) (see also: [Project Open Data Dashboard](http://data.civicagency.org/)) 8 | * [data.gov.uk](http://data.gov.uk/) 9 | * [US Census Bureau](http://www.census.gov/) 10 | * [World Bank Open Data](http://data.worldbank.org/) 11 | * [Humanitarian Data Exchange](http://docs.hdx.rwlabs.org/) 12 | * [Sunlight Foundation](http://sunlightfoundation.com/api/): government-focused data 13 | * [ProPublica Data Store](https://projects.propublica.org/data-store/) 14 | * Datasets hosted by academic institutions: 15 | * [UC Irvine Machine Learning Repository](http://archive.ics.uci.edu/ml/): datasets specifically designed for machine learning 16 | * [Stanford Large Network Dataset Collection](http://snap.stanford.edu/data/): graph data 17 | * [Inter-university Consortium for Political and Social Research](http://www.icpsr.umich.edu/) 18 | * [Pittsburgh Science of Learning Center's DataShop](http://www.learnlab.org/technologies/datashop/) 19 | * [Academic Torrents](http://academictorrents.com/): distributed network for sharing large research datasets 20 | * [Dataverse Project](http://dataverse.org/): searchable archive of research data 21 | * Datasets hosted by private companies: 22 | * [Quandl](https://www.quandl.com/): over 10 million financial, economic, and social datasets 23 | * [Amazon Web Services Public Data Sets](http://aws.amazon.com/datasets/) 24 | * [Kaggle](http://www.kaggle.com/) provides datasets with their challenges, but each competition has its own rules as to whether the data can be used outside of the scope of the competition. 
25 | * Big lists of datasets: 26 | * [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets): Well-organized and frequently updated 27 | * [Rdatasets](http://vincentarelbundock.github.io/Rdatasets/): collection of 700+ datasets originally distributed with R packages 28 | * [RDataMining.com](http://www.rdatamining.com/resources/data) 29 | * [KDnuggets](http://www.kdnuggets.com/datasets/index.html) 30 | * [inside-R](http://www.inside-r.org/howto/finding-data-internet) 31 | * [100+ Interesting Data Sets for Statistics](http://rs.io/2014/05/29/list-of-data-sets.html) 32 | * [20 Free Big Data Sources](http://smartdatacollective.com/bernardmarr/235366/big-data-20-free-big-data-sources-everyone-should-know) 33 | * [Sebastian Raschka](https://github.com/rasbt/pattern_classification/blob/master/resources/dataset_collections.md): datasets categorized by format and topic 34 | * APIs: 35 | * [Apigee](https://apigee.com/providers): explore dozens of popular APIs 36 | * [Mashape](https://www.mashape.com/): explore hundreds of APIs 37 | * [Python APIs](http://www.pythonforbeginners.com/api/list-of-python-apis): Python wrappers for many APIs 38 | * Other interesting datasets: 39 | * [FiveThirtyEight](https://github.com/fivethirtyeight/data): data and code related to their articles 40 | * [The Upshot](https://github.com/TheUpshot/): data related to their articles 41 | * [Yelp Dataset Challenge](http://www.yelp.com/dataset_challenge): Yelp reviews, business attributes, users, and more from 10 cities 42 | * [Donors Choose](http://data.donorschoose.org/open-data/overview/): data related to their projects 43 | * [200,000+ Jeopardy questions](http://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/) 44 | * [CrowdFlower](http://www.crowdflower.com/data-for-everyone): interesting datasets created or enhanced by their contributors 45 | * [UFO reports](https://github.com/planetsig/ufo-reports): geolocated and time-standardized UFO reports for close to a century 46 | * [Reddit Top 2.5 Million](https://github.com/umbrae/reddit-top-2.5-million): all-time top 1,000 posts from each of the top 2,500 subreddits 47 | * Other resources: 48 | * [Datasets subreddit](http://www.reddit.com/r/datasets/): ask for help finding a specific data set, or post your own 49 | * [Center for Data Innovation](http://www.datainnovation.org/category/publications/data-set-blog/): blog posts about interesting, recently-released data sets. 50 | 51 | This is just the tip of the iceberg; there's a lot of data out there! 
52 | -------------------------------------------------------------------------------- /other/resources.md: -------------------------------------------------------------------------------- 1 | # Resources for Continued Learning 2 | 3 | 4 | ## Blogs 5 | 6 | * [Simply Statistics](http://simplystatistics.org/): Written by the Biostatistics professors at Johns Hopkins University who also run Coursera's [Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1) 7 | * [yhat's blog](http://blog.yhathq.com/): Beginner-friendly content, usually in Python 8 | * [No Free Hunch](http://blog.kaggle.com/) (Kaggle's blog): Mostly interviews with competition winners, or updates on their competitions 9 | * [FastML](http://fastml.com/): Various machine learning content, often with code 10 | * [Edwin Chen](http://blog.echen.me/): Infrequently updated, but long and thoughtful pieces 11 | * [FiveThirtyEight](http://fivethirtyeight.com/): Tons of timely data-related content 12 | * [Machine Learning Mastery](http://machinelearningmastery.com/blog/): Frequent posts on machine learning, very accessible 13 | * [Data School](http://www.dataschool.io/): Kevin Markham's blog! Beginner-focused, with reference guides and videos 14 | * [MLWave](http://mlwave.com/): Detailed posts on Kaggle competitions, by a Kaggle Master 15 | * [Data Science 101](http://101.datascience.community/): Short, frequent content about all aspects of data science 16 | * [ML in the Valley](http://ml.posthaven.com/): Thoughtful pieces by the Director of Analytics at Codecademy 17 | 18 | 19 | ## Aggregators 20 | 21 | * [DataTau](http://www.datatau.com/): Like [Hacker News](https://news.ycombinator.com/), but for data 22 | * [MachineLearning on reddit](http://www.reddit.com/r/MachineLearning/): Very active subreddit 23 | * [Quora's Machine Learning section](http://www.quora.com/Machine-Learning): Lots of interesting Q&A 24 | * [Quora's Data Science topic FAQ](https://www.quora.com/What-is-the-Data-Science-topic-FAQ) 25 | * [KDnuggets](http://www.kdnuggets.com/): Data mining news, jobs, classes and more 26 | 27 | 28 | ## DC Data Groups 29 | 30 | * [Data Community DC](http://www.datacommunitydc.org/): Coordinates six local data-related meetup groups 31 | * [District Data Labs](http://www.districtdatalabs.com/): Offers courses and other projects to local data scientists 32 | 33 | 34 | ## Online Classes 35 | 36 | * [Coursera's Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1): Nine courses (running every month) and a Capstone project, taught in R 37 | * [Stanford's Statistical Learning](http://online.stanford.edu/course/statistical-learning): By the authors of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) and [Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/), taught in R, highly recommended (preview the [lecture videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/)) 38 | * [Coursera's Machine Learning](https://www.coursera.org/learn/machine-learning/): Andrew Ng's acclaimed course, taught in MATLAB/Octave 39 | * [Caltech's Learning from Data](http://work.caltech.edu/telecourse.html): Widely praised, not language-specific 40 | * [Udacity's Data Analyst Nanodegree](https://www.udacity.com/course/nd002): Project-based curriculum using Python, R, MapReduce, MongoDB 41 | * [Coursera's Data Mining Specialization](https://www.coursera.org/specialization/datamining/20): New specialization that began 
February 2015 42 | * [Coursera's Natural Language Processing](https://www.coursera.org/course/nlp): No upcoming sessions, but [lectures](https://class.coursera.org/nlp/lecture) and [slides](http://web.stanford.edu/~jurafsky/NLPCourseraSlides.html) are available 43 | * [SlideRule's Data Analysis Learning Path](https://www.mysliderule.com/learning-paths/data-analysis): Curated content from various online classes 44 | * [Udacity's Intro to Artificial Intelligence](https://www.udacity.com/course/intro-to-artificial-intelligence--cs271): Taught by Peter Norvig and Sebastian Thrun 45 | * [Coursera's Neural Networks for Machine Learning](https://www.coursera.org/course/neuralnets): Taught by Geoffrey Hinton, no upcoming sessions 46 | * [statistics.com](http://www.statistics.com/data-science/): Many online courses in data science 47 | * [CourseTalk](http://www.coursetalk.com/): Read reviews of online courses 48 | 49 | 50 | ## Online Content from Offline Classes 51 | 52 | * [Harvard's CS109 Data Science](http://cs109.github.io/2014/): Similar topics as General Assembly's course 53 | * [Columbia's Data Mining Class](http://www2.research.att.com/~volinsky/DataMining/Columbia2011/Columbia2011.html): Excellent slides 54 | * [Harvard's CS171 Visualization](http://www.cs171.org/2015/index.html): Includes programming in D3 55 | 56 | 57 | ## Face-to-Face Educational Programs 58 | 59 | * [Comparison of data science bootcamps](http://yet-another-data-blog.blogspot.com/2014/04/data-science-bootcamp-landscape-full.html): Up-to-date list maintained by a Zipfian Academy graduate 60 | * [The Complete List of Data Science Bootcamps & Fellowships](http://www.skilledup.com/articles/list-data-science-bootcamps/) 61 | * [Galvanize](http://www.galvanize.com/) (acquired [Zipfian Academy](http://www.zipfianacademy.com/)): Offers Data Science Immersive (Denver, Seattle, San Francisco) 62 | * [GalvanizeU](http://www.galvanizeu.com/): Offers Master of Engineering in Big Data (San Francisco) 63 | * [Data Science Retreat](http://datascienceretreat.com/): Primarily uses R (Berlin) 64 | * [Metis Data Science Bootcamp](http://www.thisismetis.com/data-science): Newer bootcamp (New York) 65 | * [Persontyle](http://www.persontyle.com/): Various course offerings (based in London) 66 | * [Software Carpentry](http://software-carpentry.org/): Two-day workshops, primarily for researchers and hosted by universities (worldwide) 67 | * [Colleges and Universities with Data Science Degrees](http://datascience.community/colleges) 68 | 69 | 70 | ## Conferences 71 | 72 | * [Knowledge Discovery and Data Mining (KDD)](http://www.kdd.org/): Hosted by ACM 73 | * [O'Reilly Strata + Hadoop World](http://strataconf.com/): Big focus on "big data" (San Jose, London, New York) 74 | * [PyData](http://pydata.org/): For developers and users of Python data tools (worldwide) 75 | * [PyCon](https://us.pycon.org/): For developers and users of Python (Portland in 2016) 76 | 77 | 78 | ## Books 79 | 80 | * [An Introduction to Statistical Learning with Applications in R](http://www-bcf.usc.edu/~gareth/ISL/) (free PDF) 81 | * [Elements of Statistical Learning](http://www-stat.stanford.edu/~tibs/ElemStatLearn/) (free PDF) 82 | * [Think Stats](http://www.greenteapress.com/thinkstats/) (free PDF or HTML) 83 | * [Mining of Massive Datasets](http://www.mmds.org/) (free PDF) 84 | * [Python for Informatics](http://www.pythonlearn.com/book.php) (free PDF or HTML) 85 | * [Statistics: Methods and Applications](http://www.statsoft.com/Textbook) (free HTML) 86 | * [Python for 
Data Analysis](http://shop.oreilly.com/product/0636920023784.do) 87 | * [Data Smart: Using Data Science to Transform Information into Insight](http://www.amazon.com/gp/product/111866146X/) 88 | * [Sams Teach Yourself SQL in 10 Minutes](http://www.amazon.com/Sams-Teach-Yourself-Minutes-Edition/dp/0672336073) 89 | 90 | 91 | ## Other Resources 92 | 93 | * [Open Source Data Science Masters](https://github.com/datasciencemasters/go): Huge list of resources 94 | * [Data Science Trello Board](https://trello.com/b/rbpEfMld/data-science): Another list of resources 95 | * [The Hitchhiker's Guide to Python](http://docs.python-guide.org/en/latest/): Online guide to understanding Python and getting good at it 96 | * [Python Reference](https://github.com/rasbt/python_reference): Python tips, tutorials, and more 97 | * [videolectures.net](http://videolectures.net/Top/Computer_Science/): Tons of academic videos 98 | * [Metacademy](http://www.metacademy.org/list): Quick summary of many machine learning terms, with links to resources for learning more 99 | * [Terms in data science defined in one paragraph](https://github.com/rasbt/pattern_classification/blob/master/resources/data_glossary.md) 100 | -------------------------------------------------------------------------------- /slides/01_course_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/01_course_overview.pdf -------------------------------------------------------------------------------- /slides/01_course_overview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/01_course_overview.pptx -------------------------------------------------------------------------------- /slides/02_git_github.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/02_git_github.pdf -------------------------------------------------------------------------------- /slides/02_git_github.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/02_git_github.pptx -------------------------------------------------------------------------------- /slides/04_apis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_apis.pdf -------------------------------------------------------------------------------- /slides/04_apis.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_apis.pptx -------------------------------------------------------------------------------- /slides/04_visualization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_visualization.pdf -------------------------------------------------------------------------------- /slides/04_visualization.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_visualization.pptx -------------------------------------------------------------------------------- /slides/05_intro_to_data_science.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_intro_to_data_science.pdf -------------------------------------------------------------------------------- /slides/05_intro_to_data_science.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_intro_to_data_science.pptx -------------------------------------------------------------------------------- /slides/05_machine_learning_knn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_machine_learning_knn.pdf -------------------------------------------------------------------------------- /slides/05_machine_learning_knn.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_machine_learning_knn.pptx -------------------------------------------------------------------------------- /slides/08_web_scraping.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/08_web_scraping.pdf -------------------------------------------------------------------------------- /slides/08_web_scraping.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/08_web_scraping.pptx -------------------------------------------------------------------------------- /slides/10_logistic_regression_confusion_matrix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/10_logistic_regression_confusion_matrix.pdf -------------------------------------------------------------------------------- /slides/10_logistic_regression_confusion_matrix.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/10_logistic_regression_confusion_matrix.pptx -------------------------------------------------------------------------------- /slides/11_drawing_roc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/11_drawing_roc.pdf -------------------------------------------------------------------------------- /slides/11_drawing_roc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/11_drawing_roc.pptx -------------------------------------------------------------------------------- /slides/13_bayes_theorem.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_bayes_theorem.pdf -------------------------------------------------------------------------------- /slides/13_bayes_theorem.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_bayes_theorem.pptx -------------------------------------------------------------------------------- /slides/13_naive_bayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_naive_bayes.pdf -------------------------------------------------------------------------------- /slides/13_naive_bayes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_naive_bayes.pptx -------------------------------------------------------------------------------- /slides/15_kaggle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/15_kaggle.pdf -------------------------------------------------------------------------------- /slides/15_kaggle.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/15_kaggle.pptx -------------------------------------------------------------------------------- /slides/18_clustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/18_clustering.pdf -------------------------------------------------------------------------------- /slides/18_clustering.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/18_clustering.pptx -------------------------------------------------------------------------------- /slides/20_sales_db_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sales_db_schema.png -------------------------------------------------------------------------------- /slides/20_sql.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sql.pdf -------------------------------------------------------------------------------- /slides/20_sql.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sql.pptx --------------------------------------------------------------------------------