├── .gitignore
├── README.md
├── code
│   ├── 00_python_beginner_workshop.py
│   ├── 00_python_intermediate_workshop.py
│   ├── 01_chipotle_homework_solution.py
│   ├── 01_reading_files.py
│   ├── 03_exploratory_analysis_pandas.py
│   ├── 04_apis.py
│   ├── 04_visualization.py
│   ├── 05_iris_exercise.py
│   ├── 05_sklearn_knn.py
│   ├── 07_glass_id_homework_solution.py
│   ├── 08_web_scraping.py
│   ├── 10_logistic_regression_confusion_matrix.py
│   ├── 13_naive_bayes.py
│   ├── 15_kaggle.py
│   ├── 17_ensembling_exercise.py
│   ├── 18_clustering.py
│   ├── 18_regularization.py
│   ├── 19_advanced_sklearn.py
│   ├── 19_gridsearchcv_exercise.py
│   ├── 19_regex_exercise.py
│   ├── 19_regex_reference.py
│   ├── 20_sql.py
│   └── 21_ensembles_example.py
├── data
│   ├── SMSSpamCollection.txt
│   ├── airline_safety.csv
│   ├── auto_mpg.txt
│   ├── chipotle_orders.tsv
│   ├── default.csv
│   ├── drinks.csv
│   ├── homicides.txt
│   ├── imdb_movie_ratings_top_1000.csv
│   ├── imdb_movie_urls.csv
│   ├── kaggle_tweets.csv
│   ├── sales.db
│   ├── titanic_train.csv
│   ├── vehicles.db
│   ├── vehicles_test.csv
│   └── vehicles_train.csv
├── homework
│   ├── 02_command_line_hw_soln.md
│   ├── 03_pandas_hw_soln.py
│   ├── 04_visualization_hw_soln.py
│   ├── 06_bias_variance.md
│   ├── 07_glass_identification.md
│   ├── 11_roc_auc.md
│   ├── 11_roc_auc_annotated.md
│   ├── 13_spam_filtering.md
│   └── 13_spam_filtering_annotated.md
├── notebooks
│   ├── 06_bias_variance.ipynb
│   ├── 06_model_evaluation_procedures.ipynb
│   ├── 09_linear_regression.ipynb
│   ├── 11_cross_validation.ipynb
│   ├── 11_roc_auc.ipynb
│   ├── 11_titanic_exercise.ipynb
│   ├── 13_bayes_iris.ipynb
│   ├── 13_naive_bayes_spam.ipynb
│   ├── 14_nlp.ipynb
│   ├── 16_decision_trees.ipynb
│   ├── 17_ensembling.ipynb
│   ├── 18_regularization.ipynb
│   └── images
│       ├── 18_bias_variance.png
│       ├── 18_overfitting.png
│       ├── 18_ridge_lasso_path.png
│       ├── 18_ridge_lasso_regression_coefficients.png
│       ├── 18_underfitting_overfitting.png
│       ├── cross_validation_diagram.png
│       ├── cross_validation_example.png
│       ├── estimating_coefficients.png
│       ├── obama_clinton_tree.jpg
│       ├── overfitting.png
│       ├── r_squared.png
│       ├── salary_color.png
│       ├── salary_regions.png
│       ├── salary_tree.png
│       ├── salary_tree_annotated.png
│       ├── salary_unpruned.png
│       ├── slope_intercept.png
│       ├── train_test_split.png
│       ├── training_error.png
│       ├── tree_titanic.png
│       ├── tree_vehicles.png
│       ├── tree_vs_linear.png
│       └── underfitting_overfitting.png
├── other
│   ├── peer_review.md
│   ├── project.md
│   ├── public_data.md
│   └── resources.md
└── slides
    ├── 01_course_overview.pdf
    ├── 01_course_overview.pptx
    ├── 02_Introduction_to_the_Command_Line.md
    ├── 02_git_github.pdf
    ├── 02_git_github.pptx
    ├── 04_apis.pdf
    ├── 04_apis.pptx
    ├── 04_visualization.pdf
    ├── 04_visualization.pptx
    ├── 05_intro_to_data_science.pdf
    ├── 05_intro_to_data_science.pptx
    ├── 05_machine_learning_knn.pdf
    ├── 05_machine_learning_knn.pptx
    ├── 08_web_scraping.pdf
    ├── 08_web_scraping.pptx
    ├── 10_logistic_regression_confusion_matrix.pdf
    ├── 10_logistic_regression_confusion_matrix.pptx
    ├── 11_drawing_roc.pdf
    ├── 11_drawing_roc.pptx
    ├── 13_bayes_theorem.pdf
    ├── 13_bayes_theorem.pptx
    ├── 13_naive_bayes.pdf
    ├── 13_naive_bayes.pptx
    ├── 15_kaggle.pdf
    ├── 15_kaggle.pptx
    ├── 18_clustering.pdf
    ├── 18_clustering.pptx
    ├── 20_sales_db_schema.png
    ├── 20_sql.pdf
    └── 20_sql.pptx
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .DS_Store
3 | *.pyc
4 |
--------------------------------------------------------------------------------
/code/00_python_beginner_workshop.py:
--------------------------------------------------------------------------------
1 | '''
2 | Multi-line comments go between 3 quotation marks.
3 | You can use single or double quotes.
4 | '''
5 |
6 | # One-line comments are preceded by the pound symbol
7 |
8 |
9 | # BASIC DATA TYPES
10 |
11 | x = 5 # creates an object
12 | print type(x) # check the type: int (not declared explicitly)
13 | type(x) # automatically prints
14 | type(5) # assigning it to a variable is not required
15 |
16 | type(5.0) # float
17 | type('five') # str
18 | type(True) # bool
19 |
20 |
21 | # LISTS
22 |
23 | nums = [5, 5.0, 'five'] # multiple data types
24 | nums # print the list
25 | type(nums) # check the type: list
26 | len(nums) # check the length: 3
27 | nums[0] # print first element
28 | nums[0] = 6 # replace a list element
29 |
30 | nums.append(7) # list 'method' that modifies the list
31 | help(nums.append) # help on this method
32 | help(nums) # help on a list object
33 | nums.remove('five') # another list method
34 |
35 | sorted(nums) # 'function' that does not modify the list
36 | nums # it was not affected
37 | nums = sorted(nums) # overwrite the original list
38 | sorted(nums, reverse=True) # optional argument
39 |
40 | # list slicing [start:end:stride]
41 | weekdays = ['mon','tues','wed','thurs','fri']
42 | weekdays[0] # element 0
43 | weekdays[0:3] # elements 0, 1, 2
44 | weekdays[:3] # elements 0, 1, 2
45 | weekdays[3:] # elements 3, 4
46 | weekdays[-1] # last element (element 4)
47 | weekdays[::2] # every 2nd element (0, 2, 4)
48 | weekdays[::-1] # backwards (4, 3, 2, 1, 0)
49 |
50 | days = weekdays + ['sat','sun'] # concatenate lists
51 |
52 |
53 | # FUNCTIONS
54 |
55 | def give_me_five(): # function definition ends with colon
56 | return 5 # indentation required for function body
57 |
58 | give_me_five() # prints the return value (5)
59 | num = give_me_five() # assigns return value to a variable, doesn't print it
60 |
61 | def calc(x, y, op): # three parameters (without any defaults)
62 | if op == 'add': # conditional statement
63 | return x + y
64 | elif op == 'subtract':
65 | return x - y
66 | else:
67 | print 'Valid operations: add, subtract'
68 |
69 | calc(5, 3, 'add')
70 | calc(5, 3, 'subtract')
71 | calc(5, 3, 'multiply')
72 | calc(5, 3)
73 |
74 |
75 | # EXERCISE: Write a function that takes two parameters (hours and rate), and
76 | # returns the total pay.
77 |
78 | def compute_pay(hours, rate):
79 | return hours * rate
80 |
81 | compute_pay(40, 10.50)
82 |
83 |
84 | # EXERCISE: Update your function to give the employee 1.5 times the hourly rate
85 | # for hours worked above 40 hours.
86 |
87 | def compute_more_pay(hours, rate):
88 | if hours <= 40:
89 | return hours * rate
90 | else:
91 | return 40*rate + (hours-40)*(rate*1.5)
92 |
93 | compute_more_pay(30, 10)
94 | compute_more_pay(45, 10)
95 |
96 |
97 | # STRINGS
98 |
99 | # create a string
100 | s = str(42) # convert another data type into a string
101 | s = 'I like you'
102 |
103 | # examine a string
104 | s[0] # returns 'I'
105 | len(s) # returns 10
106 |
107 | # string slicing like lists
108 | s[:6] # returns 'I like'
109 | s[7:] # returns 'you'
110 | s[-1] # returns 'u'
111 |
112 | # split a string into a list of substrings separated by a delimiter
113 | s.split(' ') # returns ['I','like','you']
114 | s.split() # same thing
115 |
116 | # concatenate strings
117 | s3 = 'The meaning of life is'
118 | s4 = '42'
119 | s3 + ' ' + s4 # returns 'The meaning of life is 42'
120 | s3 + ' ' + str(42) # same thing
121 |
122 |
123 | # EXERCISE: Given a string s, return a string made of the first 2 and last 2
124 | # characters of the original string, so 'spring' yields 'spng'. However, if the
125 | # string length is less than 2, instead return the empty string.
126 |
127 | def both_ends(s):
128 | if len(s) < 2:
129 | return ''
130 | else:
131 | return s[:2] + s[-2:]
132 |
133 | both_ends('spring')
134 | both_ends('cat')
135 | both_ends('a')
136 |
137 |
138 | # FOR LOOPS
139 |
140 | # range returns a list of integers
141 | range(0, 3) # returns [0, 1, 2]: includes first value but excludes second value
142 | range(3) # same thing: starting at zero is the default
143 |
144 | # simple for loop
145 | for i in range(5):
146 | print i
147 |
148 | # print each list element in uppercase
149 | fruits = ['apple', 'banana', 'cherry']
150 | for i in range(len(fruits)):
151 | print fruits[i].upper()
152 |
153 | # better for loop
154 | for fruit in fruits:
155 | print fruit.upper()
156 |
157 |
158 | # EXERCISE: Write a program that prints the numbers from 1 to 100. But for
159 | # multiples of 3 print 'fizz' instead of the number, and for the multiples of
160 | # 5 print 'buzz'. For numbers which are multiples of both 3 and 5 print 'fizzbuzz'.
161 |
162 | def fizz_buzz():
163 | nums = range(1, 101)
164 | for num in nums:
165 | if num % 15 == 0:
166 | print 'fizzbuzz'
167 | elif num % 3 == 0:
168 | print 'fizz'
169 | elif num % 5 == 0:
170 | print 'buzz'
171 | else:
172 | print num
173 |
174 | fizz_buzz()
175 |
176 |
177 | # EXERCISE: Given a list of strings, return a list with the strings
178 | # in sorted order, except group all the strings that begin with 'x' first.
179 | # e.g. ['mix', 'xyz', 'apple', 'xanadu', 'aardvark'] returns
180 | # ['xanadu', 'xyz', 'aardvark', 'apple', 'mix']
181 | # Hint: this can be done by making 2 lists and sorting each of them
182 | # before combining them.
183 |
184 | def front_x(words):
185 | lista=[]
186 | listb=[]
187 | for word in words:
188 | if word[0]=='x':
189 | lista.append(word)
190 | else:
191 | listb.append(word)
192 | return sorted(lista) + sorted(listb)
193 |
194 | front_x(['mix', 'xyz', 'apple', 'xanadu', 'aardvark'])
195 |
--------------------------------------------------------------------------------
/code/00_python_intermediate_workshop.py:
--------------------------------------------------------------------------------
1 | ## QUIZ TO REVIEW BEGINNER WORKSHOP
2 |
3 | a = 5
4 | b = 5.0
5 | c = a/2
6 | d = b/2
7 |
8 | '''
9 | What is type(a)?
10 | int
11 | What is type(b)?
12 | float
13 | What is c?
14 | 2
15 | What is d?
16 | 2.5
17 | '''
18 |
19 | e = [a, b]
20 | f = range(10)
21 |
22 | '''
23 | What is type(e)?
24 | list
25 | What is len(e)?
26 | 2
27 | What is type(f)?
28 | list
29 | What are the contents of f?
30 | integers 0 through 9
31 | What is 'range' called?
32 | a function
33 | How do I get help on 'range'?
34 | help(range)
35 | '''
36 |
37 | g = ['mon','tues','wed','thurs','fri']
38 |
39 | '''
40 | How do I slice out 'mon'?
41 | g[0]
42 | How do I slice out 'mon' through 'wed'?
43 | g[0:3]
44 | What are two ways to slice out 'fri'?
45 | g[4] or g[-1]
46 | How do I check the type of 'mon'?
47 | type(g[0])
48 | '''
49 |
50 | g.remove('wed')
51 | sorted(g)
52 | h = sorted(g, reverse=True)
53 |
54 | '''
55 | What are the contents of g?
56 | ['mon','tues','thurs','fri']
57 | What are the contents of h?
58 | ['tues','thurs','mon','fri']
59 | What is 'remove' called?
60 | a list method
61 | How do I get help on 'remove'?
62 | help(g.remove)
63 | What is 'reverse=True' called?
64 | an optional argument
65 | '''
66 |
67 | i = 'Hello'
68 | j = 'friend'
69 | k = i + j
70 | l = i + 3
71 | m = i[0]
72 |
73 | '''
74 | What is 'k'?
75 | 'Hellofriend'
76 | What is 'l'?
77 | undefined (due to error)
78 | What is 'm'?
79 | 'H'
80 | '''
81 |
82 |
83 |
84 | ## FOR LOOPS AND BASIC LIST COMPREHENSIONS
85 |
86 | # print 1 through 5
87 | nums = range(1, 6)
88 | for num in nums:
89 | print num
90 |
91 | # for loop to create a list of cubes
92 | cubes = []
93 | for num in nums:
94 | cubes.append(num**3)
95 |
96 | # equivalent list comprehension
97 | cubes = [num**3 for num in nums] # [1, 8, 27, 64, 125]
98 |
99 | '''
100 | EXERCISE:
101 | Given that: letters = ['a','b','c']
102 | Write a list comprehension that returns: ['A','B','C']
103 | Hint: 'hello'.upper() returns 'HELLO'
104 |
105 | [letter.upper() for letter in letters]
106 |
107 | BONUS EXERCISE:
108 | Given that: word = 'abc'
109 | Write a list comprehension that returns: ['A','B','C']
110 |
111 | [letter.upper() for letter in word]
112 | '''
113 |
114 |
115 |
116 | ## LIST COMPREHENSIONS WITH CONDITIONS
117 |
118 | nums = range(1, 6)
119 |
120 | # for loop to create a list of cubes of even numbers
121 | cubes_of_even = []
122 | for num in nums:
123 | if num % 2 == 0:
124 | cubes_of_even.append(num**3)
125 |
126 | # equivalent list comprehension
127 | # syntax: [expression for variable in iterable if condition]
128 | cubes_of_even = [num**3 for num in nums if num % 2 == 0] # [8, 64]
129 |
130 |
131 |
132 | ## DICTIONARIES
133 |
134 | # dictionaries are similar to lists:
135 | # - both can contain multiple data types
136 | # - both are iterable
137 | # - both are mutable
138 |
139 | # dictionaries are different from lists:
140 | # - dictionaries are unordered
141 | # - dictionary lookup time is constant regardless of dictionary size
142 |
143 | # dictionaries are like real dictionaries:
144 | # - dictionaries are made of key-value pairs (word and definition)
145 | # - dictionary keys must be unique (each word is only defined once)
146 | # - you can use the key to look up the value, but not the other way around
147 |
148 | # create a dictionary (and open Variable Explorer in Spyder)
149 | family = {'dad':'homer', 'mom':'marge', 'size':6}
150 |
151 | # examine a dictionary
152 | family[0] # throws an error (there is no ordering)
153 | family['dad'] # returns 'homer'
154 | len(family) # returns 3
155 | family.keys() # returns list: ['dad', 'mom', 'size']
156 | family.values() # returns list: ['homer', 'marge', 6]
157 | family.items() # returns list of tuples:
158 | # [('dad', 'homer'), ('mom', 'marge'), ('size', 6)]
159 |
160 | # modify a dictionary
161 | family['cat'] = 'snowball' # add a new entry
162 | family['cat'] = 'snowball ii' # edit an existing entry
163 | del family['cat'] # delete an entry
164 | family['kids'] = ['bart', 'lisa'] # value can be a list
165 |
166 | # accessing a list element within a dictionary
167 | family['kids'][0] # returns 'bart'
168 |
169 | '''
170 | EXERCISE:
171 | Given that: d = {'a':10, 'b':20, 'c':[30, 40]}
172 | First, print the value for 'a'
173 | Then, change the value for 'b' to be 25
174 | Then, change the 30 to be 35
175 | Finally, append 45 to the end of the list that contains 35 and 40
176 |
177 | d['a']
178 | d['b'] = 25
179 | d['c'][0] = 35
180 | d['c'].append(45)
181 |
182 | BONUS EXERCISE:
183 | Write a list comprehension that returns a list of the keys in uppercase
184 |
185 | [key.upper() for key in d.keys()]
186 | '''
187 |
188 |
189 |
190 | ## APIs
191 |
192 | # API Providers: https://apigee.com/providers
193 | # Echo Nest API Console: https://apigee.com/console/echonest
194 | # Echo Nest Developer Center: http://developer.echonest.com/
195 |
196 | import requests # import module (make its functions available)
197 |
198 | # use requests to talk to the web
199 | r = requests.get('http://www.google.com')
200 | r.text
201 | type(r.text)
202 |
203 | # request data from the Echo Nest API
204 | r = requests.get('http://developer.echonest.com/api/v4/artist/top_hottt?api_key=KBGUPZPJZS9PHWNIN&format=json')
205 | r.text
206 | r.json() # decode JSON
207 | type(r.json())
208 | top = r.json()
209 |
210 | # pretty print for easier readability
211 | import pprint
212 | pprint.pprint(top)
213 |
214 | # pull out the artist data
215 | artists = top['response']['artists'] # list of 15 dictionaries
216 |
217 | # reformat data into a table structure
218 | artists_data = [artist.values() for artist in artists] # list of 15 lists
219 | artists_header = artists[0].keys() # list of 2 strings
220 |
221 |
222 |
223 | ## WORKING WITH PUBLIC DATA
224 |
225 | # List of data sources: https://github.com/justmarkham/DAT5/blob/master/other/public_data.md
226 | # FiveThirtyEight: http://fivethirtyeight.com/
227 | # FiveThirtyEight data: https://github.com/fivethirtyeight/data
228 | # NFL ticket prices data: https://github.com/fivethirtyeight/data/tree/master/nfl-ticket-prices
229 |
230 | # Question: What is the average ticket price for Ravens' home vs away games?
231 |
232 | # open a CSV file from a URL
233 | import csv
234 | r = requests.get('https://raw.githubusercontent.com/fivethirtyeight/data/master/nfl-ticket-prices/2014-average-ticket-price.csv')
235 | data = [row for row in csv.reader(r.iter_lines())] # list of lists
236 |
237 | # open a downloaded CSV file from your working directory
238 | with open('2014-average-ticket-price.csv', 'rU') as f:
239 | data = [row for row in csv.reader(f)] # list of lists
240 |
241 | # examine the data
242 | type(data)
243 | len(data)
244 | data[0]
245 | data[1]
246 |
247 | # save the data we want
248 | data = data[1:97]
249 |
250 | # step 1: create a list that only contains events
251 | data[0][0]
252 | data[1][0]
253 | data[2][0]
254 | events = [row[0] for row in data]
255 |
256 | # EXERCISE
257 | # step 2: create a list that only contains prices (stored as integers)
258 | prices = [int(row[2]) for row in data]
259 |
260 | # step 3: figure out how to locate the away teams
261 | events[0]
262 | events[0].find(' at ')
263 | stop = events[0].find(' at ')
264 | events[0][:stop]
265 |
266 | # step 4: use a for loop to make a list of the away teams
267 | away_teams = []
268 | for event in events:
269 | stop = event.find(' at ')
270 | away_teams.append(event[:stop])
271 |
272 | # EXERCISE
273 | # step 5: use a for loop to make a list of the home teams
274 | home_teams = []
275 | for event in events:
276 | start = event.find(' at ') + 4
277 | stop = event.find(' Tickets ')
278 | home_teams.append(event[start:stop])
279 |
280 | # step 6: figure out how to get prices only for Ravens home games
281 | zip(home_teams, prices) # list of tuples
282 | [pair[1] for pair in zip(home_teams, prices)] # iterate through tuples and get price
283 | [price for team, price in zip(home_teams, prices)] # better way to get price
284 | [price for team, price in zip(home_teams, prices) if team == 'Baltimore Ravens'] # add a condition
285 |
286 | # step 7: create lists of the Ravens home and away game prices
287 | ravens_home = [price for team, price in zip(home_teams, prices) if team == 'Baltimore Ravens']
288 | ravens_away = [price for team, price in zip(away_teams, prices) if team == 'Baltimore Ravens']
289 |
290 | # EXERCISE
291 | # step 8: calculate the average of each list
292 | float(sum(ravens_home)) / len(ravens_home)
293 | float(sum(ravens_away)) / len(ravens_away)
294 |
--------------------------------------------------------------------------------
/code/01_chipotle_homework_solution.py:
--------------------------------------------------------------------------------
1 | '''
2 | SOLUTION FILE: Homework with Chipotle data
3 | https://github.com/TheUpshot/chipotle
4 | '''
5 |
6 |
7 | '''
8 | PART 1: read in the data, parse it, and store it in a list of lists called 'data'
9 | Hint: this is a tsv file, and csv.reader() needs to be told how to handle it
10 | '''
11 |
12 | import csv
13 |
14 | # specify that the delimiter is a tab character
15 | with open('chipotle_orders.tsv', 'rU') as f:
16 | data = [row for row in csv.reader(f, delimiter='\t')]
17 |
18 |
19 | '''
20 | PART 2: separate the header and data into two different lists
21 | '''
22 |
23 | header = data[0]
24 | data = data[1:]
25 |
26 |
27 | '''
28 | PART 3: calculate the average price of an order
29 | Hint: examine the data to see if the 'quantity' column is relevant to this calculation
30 | Hint: work smarter, not harder! (this can be done in a few lines of code)
31 | '''
32 |
33 | # count the number of unique order_id's
34 | # note: you could assume this is 1834 because that's the maximum order_id, but it's best to check
35 | num_orders = len(set([row[0] for row in data])) # 1834
36 |
37 | # create a list of prices
38 | # note: ignore the 'quantity' column because the 'item_price' takes quantity into account
39 | prices = [float(row[4][1:-1]) for row in data] # strip the dollar sign and trailing space
40 |
41 | # calculate the average price of an order and round to 2 digits
42 | round(sum(prices) / num_orders, 2) # $18.81
43 |
44 |
45 | '''
46 | PART 4: create a list (or set) of all unique sodas and soft drinks that they sell
47 | Note: just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze'
48 | '''
49 |
50 | # if 'item_name' includes 'Canned', append 'choice_description' to 'sodas' list
51 | sodas = []
52 | for row in data:
53 | if 'Canned' in row[2]:
54 | sodas.append(row[3][1:-1]) # strip the brackets
55 |
56 | # create a set of unique sodas
57 | unique_sodas = set(sodas)
58 |
59 |
60 | '''
61 | PART 5: calculate the average number of toppings per burrito
62 | Note: let's ignore the 'quantity' column to simplify this task
63 | Hint: think carefully about the easiest way to count the number of toppings
64 | Hint: 'hello there'.count('e')
65 | '''
66 |
67 | # keep a running total of burritos and toppings
68 | burrito_count = 0
69 | topping_count = 0
70 |
71 | # calculate number of toppings by counting the commas and adding 1
72 | # note: x += 1 is equivalent to x = x + 1
73 | for row in data:
74 | if 'Burrito' in row[2]:
75 | burrito_count += 1
76 | topping_count += (row[3].count(',') + 1)
77 |
78 | # calculate the average topping count and round to 2 digits
79 | round(topping_count / float(burrito_count), 2) # 5.40
80 |
81 |
82 | '''
83 | PART 6: create a dictionary in which the keys represent chip orders and
84 | the values represent the total number of orders
85 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... }
86 | Note: please take the 'quantity' column into account!
87 | Advanced: learn how to use 'defaultdict' to simplify your code
88 | '''
89 |
90 | # start with an empty dictionary
91 | chips = {}
92 |
93 | # if chip order is not in dictionary, then add a new key/value pair
94 | # if chip order is already in dictionary, then update the value for that key
95 | for row in data:
96 | if 'Chips' in row[2]:
97 | if row[2] not in chips:
98 | chips[row[2]] = int(row[1]) # this is a new key, so create key/value pair
99 | else:
100 | chips[row[2]] += int(row[1]) # this is an existing key, so add to the value
101 |
102 | # defaultdict saves you the trouble of checking whether a key already exists
103 | from collections import defaultdict
104 | dchips = defaultdict(int)
105 | for row in data:
106 | if 'Chips' in row[2]:
107 | dchips[row[2]] += int(row[1])
108 |
109 |
110 | '''
111 | BONUS: think of a question about this data that interests you, and then answer it!
112 | '''
113 |
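114 | # One possible bonus answer (a minimal sketch, not part of the original assignment):
115 | # which item is ordered most often? This assumes the same column layout as above
116 | # (row[1] is 'quantity', row[2] is 'item_name') and reuses 'defaultdict' imported earlier.
117 | item_counts = defaultdict(int)
118 | for row in data:
119 |     item_counts[row[2]] += int(row[1])
120 |
121 | # sort the (item, count) pairs by count in descending order and show the top 5
122 | sorted(item_counts.items(), key=lambda pair: pair[1], reverse=True)[:5]
123 |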
--------------------------------------------------------------------------------
/code/01_reading_files.py:
--------------------------------------------------------------------------------
1 | '''
2 | Lesson on file reading using Airline Safety Data
3 | https://github.com/fivethirtyeight/data/tree/master/airline-safety
4 | '''
5 |
6 | # read the whole file at once, return a single string (including newlines)
7 | # 'rU' mode (read universal) converts different line endings into '\n'
8 | f = open('airline_safety.csv', 'rU')
9 | data = f.read()
10 | f.close()
11 |
12 | # use a context manager to automatically close your file
13 | with open('airline_safety.csv', 'rU') as f:
14 | data = f.read()
15 |
16 | # read the whole file at once, return a list of lines
17 | with open('airline_safety.csv', 'rU') as f:
18 | data = f.readlines()
19 |
20 | # use list comprehension to duplicate readlines
21 | with open('airline_safety.csv', 'rU') as f:
22 | data = [row for row in f]
23 |
24 | # use the csv module to create a list of lists
25 | import csv
26 | with open('airline_safety.csv', 'rU') as f:
27 | data = [row for row in csv.reader(f)]
28 |
29 | # alternative method that doesn't require downloading the file
30 | import requests
31 | r = requests.get('https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv')
32 | data = [row for row in csv.reader(r.iter_lines())]
33 |
34 | # separate the header and data
35 | header = data[0]
36 | data = data[1:]
37 |
38 | # EXERCISE:
39 | # create a list of airline names (without the star)
40 | # create a list of the same length that contains 1 if there's a star and 0 if not
41 | airlines = []
42 | starred = []
43 | for row in data:
44 | if row[0][-1] == '*':
45 | starred.append(1)
46 | airlines.append(row[0][:-1])
47 | else:
48 | starred.append(0)
49 | airlines.append(row[0])
50 |
51 | # EXERCISE:
52 | # create a list that contains the average number of incidents per distance
53 | [(int(row[2]) + int(row[5])) / float(row[1]) for row in data]
54 |
55 |
56 | '''
57 | A few extra things that will help you with the homework
58 | '''
59 |
60 | # 'in' statement is useful for lists
61 | my_list = [1, 2, 1]
62 | 1 in my_list # True
63 | 3 in my_list # False
64 |
65 | # 'in' is useful for strings (checks for substrings)
66 | my_string = 'hello there'
67 | 'the' in my_string # True
68 | 'then' in my_string # False
69 |
70 | # 'in' is useful for dictionaries (checks keys but not values)
71 | my_dict = {'name':'Kevin', 'title':'instructor'}
72 | 'name' in my_dict # True
73 | 'Kevin' in my_dict # False
74 |
75 | # 'set' data structure is useful for gathering unique elements
76 | set(my_list) # returns a set of 1, 2
77 | len(set(my_list)) # count of unique elements
78 |
79 |
80 | '''
81 | Homework with Chipotle data
82 | https://github.com/TheUpshot/chipotle
83 | '''
84 |
85 | '''
86 | PART 1: read in the data, parse it, and store it in a list of lists called 'data'
87 | Hint: this is a tsv file, and csv.reader() needs to be told how to handle it
88 | '''
89 |
90 | '''
91 | PART 2: separate the header and data into two different lists
92 | '''
93 |
94 | '''
95 | PART 3: calculate the average price of an order
96 | Hint: examine the data to see if the 'quantity' column is relevant to this calculation
97 | Hint: work smarter, not harder! (this can be done in a few lines of code)
98 | '''
99 |
100 | '''
101 | PART 4: create a list (or set) of all unique sodas and soft drinks that they sell
102 | Note: just look for 'Canned Soda' and 'Canned Soft Drink', and ignore other drinks like 'Izze'
103 | '''
104 |
105 | '''
106 | PART 5: calculate the average number of toppings per burrito
107 | Note: let's ignore the 'quantity' column to simplify this task
108 | Hint: think carefully about the easiest way to count the number of toppings
109 | Hint: 'hello there'.count('e')
110 | '''
111 |
112 | '''
113 | PART 6: create a dictionary in which the keys represent chip orders and
114 | the values represent the total number of orders
115 | Expected output: {'Chips and Roasted Chili-Corn Salsa': 18, ... }
116 | Note: please take the 'quantity' column into account!
117 | Advanced: learn how to use 'defaultdict' to simplify your code
118 | '''
119 |
120 | '''
121 | BONUS: think of a question about this data that interests you, and then answer it!
122 | '''
123 |
--------------------------------------------------------------------------------
/code/03_exploratory_analysis_pandas.py:
--------------------------------------------------------------------------------
1 | """
2 | CLASS: Pandas for Data Exploration, Analysis, and Visualization
3 |
4 | About the data:
5 | WHO alcohol consumption data:
6 | article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/
7 | original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption
8 | files: drinks.csv (with additional 'continent' column)
9 | """
10 |
11 | """
12 | First, we need to import Pandas into Python. Pandas is a Python package that
13 | allows for easy manipulation of DataFrames. You'll also need to import
14 | matplotlib for plotting.
15 | """
16 |
17 | #imports
18 | import pandas as pd
19 | import matplotlib.pyplot as plt
20 | import numpy as np
21 |
22 |
23 | '''
24 | Reading Files, Summarizing, Selecting, Filtering, Sorting
25 | '''
26 | # Can read a file from a local file on your computer or from a URL
27 | drinks = pd.read_table('drinks.csv', sep=',') # read_table is more general
28 | drinks = pd.read_csv('drinks.csv') # read_csv is specific to CSV and implies sep=","
29 | # Can also read from URLs
30 | drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv')
31 |
32 |
33 | '''
34 | Key Concept: Dot notation
35 | In Python, you can think of an object as an entity that can have both attributes
36 | and methods. A dot following an object indicates that you are about to access
37 | something within the object, an attribute or a method. Attributes contain
38 | information about the object. They are usually a single "word" following the
39 | dot. A method is something the object can do. They are usually a "word" with
40 | parentheses following the dot.
41 | '''
42 |
43 | # examine the drinks data
44 | drinks # print the first 30 and last 30 rows
45 | type(drinks) # DataFrame
46 | drinks.head() # print the first 5 rows
47 | drinks.head(10) # print the first 10 rows
48 | drinks.tail() # print the last 5 rows
49 | drinks.describe() # summarize all numeric columns
50 | drinks.describe(include='all') # includes non numeric columns; new in pandas 0.15.0
51 | drinks.index # "the index" (aka "the labels")
52 | drinks.columns # column names (which is "an index")
53 | drinks.dtypes # data types of each column
54 | drinks.shape # number of rows and columns
55 | drinks.values # underlying numpy array
56 | drinks.info() # concise summary (includes memory usage as of pandas 0.15.0)
57 |
58 | # Print the 'beer_servings' Series (a single column)
59 | drinks.beer_servings
60 | drinks['beer_servings']
61 | type(drinks.beer_servings)
62 |
63 | # Print two columns
64 | drinks[['beer_servings','wine_servings']]
65 | cols = ['beer_servings','wine_servings']
66 | drinks[cols]
67 |
68 | # Calculate the average 'beer_servings' for the entire dataset
69 | drinks.describe() # summarize all numeric columns
70 | drinks.beer_servings.describe() # summarize only the 'beer_servings' Series
71 | drinks.beer_servings.mean() # only calculate the mean
72 | drinks.beer_servings.max() # only calculate the max
73 | drinks.beer_servings.min() # only calculate the min
74 |
75 | # Other aggregation functions
76 | drinks.beer_servings.sum()
77 | drinks.beer_servings.count()
78 | float(drinks.beer_servings.sum())/drinks.beer_servings.count()
79 |
80 | # Count the number of occurrences of each 'continent' value
81 | drinks.continent.value_counts()
82 |
83 | # Simple logical filters
84 | # Print all columns, but only show rows where the country is in Europe
85 | # Let's look at each piece of this.
86 | drinks.continent # Returns all of the continent values
87 | drinks.continent=='EU' # Returns a boolean Series (True/False for each row)
88 | drinks[drinks.continent=='EU'] # Returns all rows where True
89 |
90 | # Other logical filters
91 | drinks[drinks.beer_servings > 158]
92 | drinks[drinks.beer_servings <= 10]
93 | type(drinks[drinks.beer_servings <= 10]) # DataFrame
94 | drinks[drinks.beer_servings <= 10][['country','beer_servings']]
95 |
96 | # Calculate the average 'beer_servings' for all of Europe
97 | drinks[drinks.continent=='EU'].beer_servings.mean()
98 |
99 | # More complex logical filtering
100 | # Only show European countries with 'wine_servings' greater than 300
101 | # Note: parentheses are required for each condition, and you can't use 'and' or 'or' keywords
102 | drinks[(drinks.continent=='EU') & (drinks.wine_servings > 300)]
103 |
104 | # Show European countries or countries with 'wine_servings' greater than 300
105 | drinks[(drinks.continent=='EU') | (drinks.wine_servings > 300)]
106 |
107 | # Show countries who have more than the mean beer_servings
108 | drinks[drinks.beer_servings > drinks.beer_servings.mean()]
109 |
110 | ##########################################
111 | ############ Exercise 1 ############
112 | ##########################################
113 |
114 | # Using the 'drinks' data, answer the following questions:
115 | # 1. What is the maximum number of total litres of pure alcohol?
116 | drinks.total_litres_of_pure_alcohol.max()
117 |
118 | # 2. Which country has the maximum number of total litres of pure alcohol?
119 | drinks[drinks.total_litres_of_pure_alcohol == drinks.total_litres_of_pure_alcohol.max()]['country']
120 |
121 | # 3. Does Haiti or Belarus consume more servings of spirits?
122 | drinks.spirit_servings[drinks.country=='Haiti'] > drinks.spirit_servings[drinks.country=='Belarus']
123 |
124 | # 4. How many countries have more than 300 wine servings OR more than 300
125 | # beer servings OR more than 300 spirit servings?
126 | drinks[(drinks.wine_servings > 300) | (drinks.beer_servings > 300) | (drinks.spirit_servings > 300)].country.count()
127 |
128 | # 5. For the countries in the previous question, what is the average total litres
129 | # of pure alcohol?
130 | drinks[(drinks.wine_servings > 300) | (drinks.beer_servings > 300) | (drinks.spirit_servings > 300)].total_litres_of_pure_alcohol.mean()
131 |
132 |
133 | # sorting
134 | drinks.beer_servings.order() # only works for a Series
135 | drinks.sort_index() # sort rows by label
136 | drinks.sort_index(by='beer_servings') # sort rows by a specific column
137 | drinks.sort_index(by='beer_servings', ascending=False) # use descending order instead
138 | drinks.sort_index(by=['beer_servings', 'wine_servings']) # sort by multiple columns
139 |
140 | # Determine which 10 countries have the highest 'total_litres_of_pure_alcohol'
141 | drinks.sort_index(by='total_litres_of_pure_alcohol').tail(10)
142 |
143 | # Determine which country has the highest value for 'beer_servings'
144 | drinks[drinks.beer_servings==drinks.beer_servings.max()].country
145 |
146 | # Use dot notation to string together commands
147 | # How many countries in each continent have beer_servings greater than 182?
148 | # i.e. a beer every two days
149 | drinks[drinks.beer_servings > 182].continent.value_counts()
150 |
151 | # add a new column as a function of existing columns
152 | # note: can't (usually) assign to an attribute (e.g., 'drinks.total_servings')
153 | drinks['total_servings'] = drinks.beer_servings + drinks.spirit_servings + drinks.wine_servings
154 | drinks['alcohol_mL'] = drinks.total_litres_of_pure_alcohol * 1000
155 | drinks.head()
156 |
157 | '''
158 | Split-Apply-Combine
159 | '''
160 |
161 | # for each continent, calculate mean beer servings
162 | drinks.groupby('continent').beer_servings.mean()
163 |
164 | # for each continent, calculate mean of all numeric columns
165 | drinks.groupby('continent').mean()
166 |
167 | # for each continent, count number of occurrences
168 | drinks.groupby('continent').continent.count()
169 | drinks.continent.value_counts()
170 |
171 |
172 | '''
173 | A little numpy
174 | '''
175 | probs = np.array([0.51, 0.50, 0.02, 0.49, 0.78])
176 | # np.where functions like an IF statement in Excel
177 | # np.where(condition, value if true, value if false)
178 | np.where(probs >= 0.5, 1, 0)
179 | drinks['lots_of_beer'] = np.where(drinks.beer_servings > 300, 1, 0)
180 |
181 |
182 |
183 | ##########################################
184 | ############ Exercise 2 ############
185 | ##########################################
186 |
187 | # 1. What is the average number of total litres of pure alcohol for each
188 | # continent?
189 | drinks.groupby('continent').total_litres_of_pure_alcohol.mean()
190 |
191 |
192 | # 2. For each continent, calculate the mean wine_servings for all countries who
193 | # have a spirit_servings greater than the overall spirit_servings mean.
194 | drinks[drinks.spirit_servings > drinks.spirit_servings.mean()].groupby('continent').wine_servings.mean()
195 |
196 |
197 | # 3. Per continent, for all of the countries that drink more beer servings than
198 | # the average number of beer servings, what is the average number of wine
199 | # servings?
200 | drinks[drinks.beer_servings > drinks.beer_servings.mean()].groupby('continent').wine_servings.mean()
201 |
202 |
203 | '''
204 | Advanced Filtering (of rows) and Selecting (of columns)
205 | '''
206 |
207 | # loc: filter rows by LABEL, and select columns by LABEL
208 | drinks.loc[0] # row with label 0
209 | drinks.loc[0:3] # rows with labels 0 through 3
210 | drinks.loc[0:3, 'beer_servings':'wine_servings'] # rows 0-3, columns 'beer_servings' through 'wine_servings'
211 | drinks.loc[:, 'beer_servings':'wine_servings'] # all rows, columns 'beer_servings' through 'wine_servings'
212 | drinks.loc[[0,3], ['beer_servings','spirit_servings']] # rows with labels 0 and 3, columns 'beer_servings' and 'spirit_servings'
213 |
214 | # iloc: filter rows by POSITION, and select columns by POSITION
215 | drinks.iloc[0] # row with 0th position (first row)
216 | drinks.iloc[0:3] # rows with positions 0 through 2 (not 3)
217 | drinks.iloc[0:3, 0:3] # rows and columns with positions 0 through 2
218 | drinks.iloc[:, 0:3] # all rows, columns with positions 0 through 2
219 | drinks.iloc[[0,2], [0,1]] # 1st and 3rd row, 1st and 2nd column
220 |
221 | # mixing: select columns by LABEL, then filter rows by POSITION
222 | drinks.wine_servings[0:3]
223 | drinks[['beer_servings', 'spirit_servings', 'wine_servings']][0:3]
224 |
225 |
226 | ##########################################
227 | ############# Homework #############
228 | ##########################################
229 | '''
230 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
231 | to complete the following parts. Please turn in your code for each part.
232 | Before each code chunk, give a brief description (one line) of what the code is
233 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If
234 | the code output produces a plot or answers a question, give a brief
235 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for
236 | group A is higher than the mean for group B which means X,Y,Z").
237 | '''
238 |
239 | '''
240 | Part 1
241 | Load the data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
242 | into a DataFrame. Try looking at the "head" of the file in the command line
243 | to see how the file is delimited and how to load it.
244 | Note: You do not need to turn in any command line code you may use.
245 | '''
246 |
247 | '''
248 | Part 2
249 | Get familiar with the data. Answer the following questions:
250 | - What is the shape of the data? How many rows and columns are there?
251 | - What variables are available?
252 | - What are the ranges for the values in each numeric column?
253 | - What is the average value for each column? Does that differ significantly
254 | from the median?
255 | '''
256 |
257 |
258 | '''
259 | Part 3
260 | Use the data to answer the following questions:
261 | - Which 5 cars get the best gas mileage?
262 | - Which 5 cars with more than 4 cylinders get the best gas mileage?
263 | - Which 5 cars get the worst gas mileage?
264 | - Which 5 cars with 4 or fewer cylinders get the worst gas mileage?
265 | '''
266 |
267 | '''
268 | Part 4
269 | Use groupby and aggregations to explore the relationships
270 | between mpg and the other variables. Which variables seem to have the greatest
271 | effect on mpg?
272 | Some examples of things you might want to look at are:
273 | - What is the mean mpg for cars for each number of cylinders (i.e. 3 cylinders,
274 | 4 cylinders, 5 cylinders, etc)?
275 | - Did mpg rise or fall over the years contained in this dataset?
276 | - What is the mpg for the group of lighter cars vs the group of heavier cars?
277 | Note: Be creative in the ways in which you divide up the data. You are trying
278 | to create segments of the data using logical filters and comparing the mpg
279 | for each segment of the data.
280 | '''
281 |
282 |
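283 | # A minimal starter sketch for the homework above (not a full solution).
284 | # It assumes the file is pipe-delimited and has columns named as in the questions
285 | # (e.g. 'mpg'); check the head of the file first and adjust 'sep' if needed.
286 |
287 | # Part 1: load the data into a DataFrame
288 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|')
289 |
290 | # Part 2: get familiar with the data
291 | auto.shape
292 | auto.describe()
293 |
294 | # Part 3 (first question): the 5 cars with the best gas mileage
295 | auto.sort_index(by='mpg', ascending=False).head(5)
296 |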
--------------------------------------------------------------------------------
/code/04_apis.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: APIs
3 |
4 | Data Science Toolkit text2sentiment API
5 | '''
6 |
7 | '''
8 | APIs without wrappers (i.e. there is no nicely formatted function)
9 | '''
10 | # Import the necessary modules
11 | import requests # Helps construct the request to send to the API
12 | import json # JSON helper functions
13 |
14 | # We have a sentence we want the sentiment of
15 | sample_sentence = 'A couple hundred hours & several thousand lines of code later... thank you @GA_DC!! #DataScience #GAGradNight'
16 |
17 | # We know the URL endpoint to send it to
18 | url = 'http://www.datasciencetoolkit.org/text2sentiment/'
19 |
20 | # First we specify the header
21 | header = {'content-type': 'application/json'}
22 |
23 | # Next we specify the body (the information we want the API to work on)
24 | body = sample_sentence
25 |
26 | # Now we make the request
27 | response = requests.post(url, data=body, headers=header)
28 | # Notice that this is a POST request
29 |
30 | # Let's look at the response
31 | response.status_code
32 | response.ok
33 | response.text
34 |
35 | # Let's turn that text back into JSON
36 | r_json = json.loads(response.text)
37 | r_json
38 | r_json['score'] # 2.0
39 |
40 | ##########################################
41 | ############ Exercise 1 ############
42 | ##########################################
43 | # Turn the above code into a function
44 | # The function should take in one argument, some text, and return a number,
45 | # the sentiment. Call your function "get_sentiment".
46 | def get_sentiment(text):
47 | url = 'http://www.datasciencetoolkit.org/text2sentiment/'
48 |
49 | #specify header
50 | header = {'content-type': 'application/json'}
51 |
52 | # Next we specify the body (the information we want the API to work on)
53 | body = text
54 |
55 | # Now we make the request
56 | response = requests.post(url, data=body, headers=header)
57 | # Notice that this is a POST request
58 | r_json = json.loads(response.text)
59 | sentiment = r_json['score'] # 2.0
60 | return sentiment
61 |
62 |
63 |
64 |
65 | # Now that we've created our own wrapper, we can use it throughout our code.
66 | # We now have multiple sentences
67 | sentences = ['I love pizza!', 'I hate pizza!', 'I feel nothing about pizza!']
68 |
69 | # Loop through the sentences
70 | for sentence in sentences:
71 | sentiment = get_sentiment(sentence)
72 | print sentence, sentiment # Print the results
73 |
74 |
75 | '''
76 | APIs with wrappers (i.e. there is a nicely formatted function)
77 | '''
78 | # Import the API library
79 | import dstk
80 |
81 | # Remember our sample sentence?
82 | sample_sentence
83 |
84 | # Let's try our new API library
85 | # Instantiate DSTK object
86 | dstk = dstk.DSTK()
87 | dstk.text2sentiment(sample_sentence) # 2.0
88 |
89 | # We can once again loop through our sentences
90 | for sentence in sentences:
91 | sentiment = dstk.text2sentiment(sentence)
92 | print sentence, sentiment['score']
93 |
--------------------------------------------------------------------------------
/code/04_visualization.py:
--------------------------------------------------------------------------------
1 | """
2 | CLASS: Visualization
3 | """
4 |
5 | # imports
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | import numpy as np  # needed below for np.where
9 | # import the data available at https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv
10 | drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/drinks.csv')
11 |
12 | '''
13 | Visualization
14 | '''
15 |
16 | # bar plot of number of countries in each continent
17 | drinks.continent.value_counts().plot(kind='bar', title='Countries per Continent')
18 | plt.xlabel('Continent')
19 | plt.ylabel('Count')
20 | plt.show() # show plot window (if it doesn't automatically appear)
21 | plt.savefig('countries_per_continent.png') # save plot to file
22 |
23 | # bar plot of average number of beer servings (per adult per year) by continent
24 | drinks.groupby('continent').beer_servings.mean().plot(kind='bar', title='Average Number of Beer Servings By Continent')
25 | plt.ylabel('Average Number of Beer Servings Per Year')
26 | plt.show()
27 |
28 | # histogram of beer servings (shows the distribution of a numeric column)
29 | drinks.beer_servings.hist(bins=20)
30 | plt.title("Distribution of Beer Servings")
31 | plt.xlabel('Beer Servings')
32 | plt.ylabel('Frequency')
33 | plt.show()
34 |
35 | # density plot of beer servings (smooth version of a histogram)
36 | drinks.beer_servings.plot(kind='density', xlim=(0,500))
37 | plt.title("Distribution of Beer Servings")
38 | plt.xlabel('Beer Servings')
39 | plt.show()
40 |
41 | # grouped histogram of beer servings (shows the distribution for each group)
42 | drinks.beer_servings.hist(by=drinks.continent)
43 | plt.show()
44 |
45 | drinks.beer_servings.hist(by=drinks.continent, sharex=True)
46 | plt.show()
47 |
48 | drinks.beer_servings.hist(by=drinks.continent, sharex=True, sharey=True)
49 | plt.show()
50 |
51 | drinks.beer_servings.hist(by=drinks.continent, sharey=True, layout=(2, 3)) # change layout (new in pandas 0.15.0)
52 | plt.show()
53 |
54 | # boxplot of beer servings by continent (shows five-number summary and outliers)
55 | drinks.boxplot(column='beer_servings', by='continent')
56 | plt.show()
57 |
58 | # scatterplot of beer servings versus wine servings
59 | drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3)
60 | plt.show()
61 |
62 | # same scatterplot, except point color varies by 'spirit_servings'
63 | # note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
64 | drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', c='spirit_servings', colormap='Blues')
65 | plt.show()
66 |
67 | # same scatterplot, except all European countries are colored red
68 | colors = np.where(drinks.continent=='EU', 'r', 'b')
69 | drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)
70 | plt.show()
71 |
72 | # Scatter matrix
73 | pd.scatter_matrix(drinks)
74 | plt.show()
75 |
76 |
77 | ##########################################
78 | ############ Exercise 1 ############
79 | ##########################################
80 |
81 | # 1. Generate a plot showing the average number of total litres of pure alcohol
82 | # by continent.
83 | drinks.groupby('continent').total_litres_of_pure_alcohol.mean().plot(kind='bar')
84 | plt.show()
85 |
86 | # 2. Illustrate the relationship between spirit servings and total litres of
87 | # pure alcohol. What kind of relationship is there?
88 | drinks.plot(kind='scatter', x='spirit_servings', y='total_litres_of_pure_alcohol', alpha=0.4)
89 | plt.show()
90 |
91 | # 3. Generate one plot that shows the distribution of spirit servings for each
92 | # continent.
93 | drinks.spirit_servings.hist(by=drinks.continent, sharex=True, sharey=True)
94 | plt.show()
95 |
96 |
97 | ##########################################
98 | ############# Homework #############
99 | ##########################################
100 | '''
101 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
102 | to complete the following parts. Please turn in your code for each part.
103 | Before each code chunk, give a brief description (one line) of what the code is
104 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If
105 | the code output produces a plot or answers a question, give a brief
106 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for
107 | group A is higher than the mean for group B which means X,Y,Z").
108 | '''
109 |
110 | '''
111 | Part 1
112 | Produce a plot that compares the mean mpg for the different numbers of cylinders.
113 | '''
114 |
115 | '''
116 | Part 2
117 | Use a scatter matrix to explore relationships between different numeric variables.
118 | '''
119 |
120 | '''
121 | Part 3
122 | Use a plot to answer the following questions:
123 | -Do heavier or lighter cars get better mpg?
124 | -How are horsepower and displacement related?
125 | -What does the distribution of acceleration look like?
126 | -How is mpg spread for cars with different numbers of cylinders?
127 | -Do cars made before or after 1975 get better average mpg? (Hint: You need to
128 | create a new column that encodes whether a year is before or after 1975.)
129 | '''
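130 |
131 | # A minimal starter sketch for Part 1 above (not a full solution). It assumes the
132 | # file is pipe-delimited and has 'cylinders' and 'mpg' columns; inspect the file
133 | # and adjust 'sep' and the column names if needed.
134 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|')
135 | auto.groupby('cylinders').mpg.mean().plot(kind='bar', title='Mean MPG by Number of Cylinders')
136 | plt.xlabel('Cylinders')
137 | plt.ylabel('Mean MPG')
138 | plt.show()
139 |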
--------------------------------------------------------------------------------
/code/05_iris_exercise.py:
--------------------------------------------------------------------------------
1 | '''
2 | EXERCISE: "Human Learning" with iris data
3 |
4 | Can you predict the species of an iris using petal and sepal measurements?
5 |
6 | TASKS:
7 | 1. Read iris data into a pandas DataFrame, including column names.
8 | 2. Gather some basic information about the data.
9 | 3. Use groupby, sorting, and/or plotting to look for differences between species.
10 | 4. Come up with a set of rules that could be used to predict species based upon measurements.
11 |
12 | BONUS: Define a function that accepts a row of data and returns a predicted species.
13 | Then, use that function to make predictions for all existing rows of data.
14 | '''
15 |
16 | import pandas as pd
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 |
20 |
21 | ## TASK 1
22 |
23 | # read the iris data into a pandas DataFrame, including column names
24 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
25 | iris = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
26 | names=col_names)
27 |
28 |
29 | ## TASK 2
30 |
31 | # gather basic information
32 | iris.shape
33 | iris.head()
34 | iris.describe()
35 | iris.species.value_counts()
36 | iris.dtypes
37 | iris.isnull().sum()
38 |
39 |
40 | ## TASK 3
41 |
42 | # use groupby to look for differences between the species
43 | iris.groupby('species').sepal_length.mean()
44 | iris.groupby('species').mean()
45 | iris.groupby('species').describe()
46 |
47 | # use sorting to look for differences between the species
48 | iris.sort_index(by='sepal_length').values
49 | iris.sort_index(by='sepal_width').values
50 | iris.sort_index(by='petal_length').values
51 | iris.sort_index(by='petal_width').values
52 |
53 | # use plotting to look for differences between the species
54 | iris.petal_width.hist(by=iris.species, sharex=True)
55 | iris.boxplot(column='petal_width', by='species')
56 | iris.boxplot(by='species')
57 |
58 | # map species to a numeric value so that plots can be colored by category
59 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})
60 | iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap='Blues')
61 | pd.scatter_matrix(iris, c=iris.species_num)
62 |
63 |
64 | ## TASK 4
65 |
66 | # If petal length is less than 3, predict setosa.
67 | # Else if petal width is less than 1.8, predict versicolor.
68 | # Otherwise predict virginica.
69 |
70 |
71 | ## BONUS
72 |
73 | # define function that accepts a row of data and returns a predicted species
74 | def classify_iris(row):
75 | if row[2] < 3: # petal_length
76 | return 0 # setosa
77 | elif row[3] < 1.8: # petal_width
78 | return 1 # versicolor
79 | else:
80 | return 2 # virginica
81 |
82 | # predict for a single row
83 | classify_iris(iris.iloc[0, :]) # first row
84 | classify_iris(iris.iloc[149, :]) # last row
85 |
86 | # store predictions for all rows
87 | predictions = [classify_iris(row) for row in iris.values]
88 |
89 | # calculate the percentage of correct predictions
90 | np.mean(iris.species_num == predictions) # 0.96
91 |
--------------------------------------------------------------------------------
/code/05_sklearn_knn.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: Introduction to scikit-learn with iris data
3 | '''
4 |
5 | # read in iris data
6 | import pandas as pd
7 | col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
8 | iris = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
9 | names=col_names)
10 |
11 | # create numeric column for the response
12 | # note: features and response must both be entirely numeric!
13 | iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})
14 |
15 | # create X (features) three different ways
16 | X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
17 | X = iris.loc[:, 'sepal_length':'petal_width']
18 | X = iris.iloc[:, 0:4]
19 |
20 | # create y (response)
21 | y = iris.species_num
22 |
23 | # check the shape of X and y
24 | X.shape # 150 by 4 (n=150, p=4)
25 | y.shape # 150 (must match first dimension of X)
26 |
27 | # scikit-learn 4-step modeling pattern:
28 |
29 | # Step 1: import the class you plan to use
30 | from sklearn.neighbors import KNeighborsClassifier
31 |
32 | # Step 2: instantiate the "estimator" (aka the model)
33 | # note: all unspecified parameters are set to the defaults
34 | knn = KNeighborsClassifier(n_neighbors=1)
35 |
36 | # Step 3: fit the model with data (learn the relationship between X and y)
37 | knn.fit(X, y)
38 |
39 | # Step 4: use the "fitted model" to predict the response for a new observation
40 | knn.predict([3, 5, 4, 2])
41 |
42 | # predict for multiple observations at once
43 | X_new = [[3, 5, 4, 2], [3, 5, 2, 2]]
44 | knn.predict(X_new)
45 |
46 | # try a different value of K ("tuning parameter")
47 | knn = KNeighborsClassifier(n_neighbors=5)
48 | knn.fit(X, y)
49 | knn.predict(X_new) # predicted classes
50 | knn.predict_proba(X_new) # predicted probabilities of class membership
51 | knn.kneighbors([3, 5, 4, 2]) # distances to nearest neighbors (and identities)
52 |
53 | # calculate Euclidian distance manually for nearest neighbor
54 | import numpy as np
55 | np.sqrt(((X.iloc[106, :] - [3, 5, 4, 2])**2).sum())
56 |
--------------------------------------------------------------------------------
/code/07_glass_id_homework_solution.py:
--------------------------------------------------------------------------------
1 | '''
2 | HOMEWORK: Glass Identification (aka "Glassification")
3 | '''
4 |
5 | # TASK 1: read data into a DataFrame
6 | import pandas as pd
7 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data',
8 | names=['id','ri','na','mg','al','si','k','ca','ba','fe','glass_type'],
9 | index_col='id')
10 |
11 | # TASK 2: briefly explore the data
12 | df.shape
13 | df.head()
14 | df.tail()
15 | df.glass_type.value_counts()
16 | df.isnull().sum()
17 |
18 | # TASK 3: convert to binary classification problem (1/2/3/4 maps to 0, 5/6/7 maps to 1)
19 | import numpy as np
20 | df['binary'] = np.where(df.glass_type < 5, 0, 1) # method 1
21 | df['binary'] = df.glass_type.map({1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1}) # method 2
22 | df.binary.value_counts()
23 |
24 | # TASK 4: create a feature matrix (X)
25 | features = ['ri','na','mg','al','si','k','ca','ba','fe'] # create a list of features
26 | features = df.columns[:-2] # alternative way: slice 'columns' attribute like a list
27 | X = df[features] # create DataFrame X by only selecting features
28 |
29 | # TASK 5: create a response vector (y)
30 | y = df.binary
31 |
32 | # TASK 6: split X and y into training and testing sets
33 | from sklearn.cross_validation import train_test_split
34 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)
35 |
36 | # TASK 7: fit a KNN model on the training set using K=5
37 | from sklearn.neighbors import KNeighborsClassifier
38 | knn = KNeighborsClassifier(n_neighbors=5)
39 | knn.fit(X_train, y_train)
40 |
41 | # TASK 8: make predictions on the testing set and calculate accuracy
42 | y_pred = knn.predict(X_test)
43 | from sklearn import metrics
44 | print metrics.accuracy_score(y_test, y_pred) # 90.7% accuracy
45 |
46 | # TASK 9: calculate null accuracy
47 | 1 - y.mean() # 76.2% null accuracy
48 |
49 | # BONUS: write a for loop that computes test set accuracy for a range of K values
50 | k_range = range(1, 30, 2)
51 | scores = []
52 | for k in k_range:
53 | knn = KNeighborsClassifier(n_neighbors=k)
54 | knn.fit(X_train, y_train)
55 | y_pred = knn.predict(X_test)
56 | scores.append(metrics.accuracy_score(y_test, y_pred))
57 |
58 | # BONUS: plot K versus test set accuracy to choose an optimal value for K
59 | import matplotlib.pyplot as plt
60 | plt.plot(k_range, scores) # optimal value is K=1
61 |
--------------------------------------------------------------------------------
/code/08_web_scraping.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: Web Scraping
3 |
4 | We will be using two packages in particular: requests and Beautiful Soup 4.
5 | '''
6 |
7 | '''
8 | Introduction to Beautiful Soup
9 | '''
10 |
11 | # imports
12 | import requests # How Python gets the webpages
13 | from bs4 import BeautifulSoup # Creates structured, searchable object
14 | import pandas as pd
15 | import matplotlib.pyplot as plt
16 |
17 | # First, let's play with beautiful soup on a "toy" webpage
18 | html_doc = """
19 |
20 |
21 |
22 |
23 | Brandon's Homepage!
24 |
25 |
26 |
27 | Brandon's Homepage
28 | My name is Brandon. I'm love web scraping!
29 | I'm originally from Louisiana. I went to undergrad at Louisiana Tech and grad school at UNC.
30 | I currently work as a Product Manager of Linguistics and Analytics at Clarabridge.
31 |
32 | My Hobbies
33 |
34 | - Data Science
35 | - Backcountry Camping
36 | - Rock Climbing
37 | - Cycling
38 | - The Internet
39 |
40 |
41 |
42 | """
43 | type(html_doc)
44 |
45 | # Beautiful soup allows us to create a structured object out of this string
46 | b = BeautifulSoup(html_doc)
47 | type(b)
48 |
49 | # Let's look at "b"
50 | b
51 |
52 | # The most useful methods in a Beautiful Soup object are "find" and "findAll".
53 | # "find" takes several parameters, the most important are "name" and "attrs".
54 | # Let's talk about "name".
55 | b.find(name='body') # Finds the 'body' tag and everything inside of it.
56 | body = b.find(name='body')
57 | type(body) #tag
58 |
59 | # You can search tags also
60 | h1 = body.find(name='h1') # Find the 'h1' tag inside of the 'body' tag
61 | h1
62 | h1.text # Print out just the text inside of the h1 tag
63 |
64 | # Now let's find the 'p' tags
65 | p = b.find(name='p')
66 | # This only finds one. This is where 'findAll' comes in.
67 | all_p = b.findAll(name='p')
68 | all_p
69 | type(all_p) # Result sets are a lot like Python lists
70 | all_p[0] # Access specific element with index
71 | all_p[1]
72 | # Iterable like list
73 | for one_p in all_p:
74 | print one_p.text # Print text
75 |
76 | # Access specific attribute of a tag
77 | all_p[0] # Specific tag
78 | all_p[0]['id'] # Specific attribute of a specific tag
79 |
80 | # Now let's talk about 'attrs'
81 | # Beautiful soup also allows us to choose tags with specific attributes
82 | b.find(name='p', attrs={"id":"intro"})
83 | b.find(name='p', attrs={"id":"background"})
84 | b.find(name='p', attrs={"id":"current"})
85 |
86 | ##########################################
87 | ############ Exercise 1 ############
88 | ##########################################
89 |
90 | # 1. Extract the 'h3' element from Brandon's webpage.
91 | b.find(name='h3')
92 |
93 | # 2. Extract Brandon's hobbies from the html_doc. Print out the text of each hobby.
94 | hobbies = b.findAll(name='li')
95 | for hobby in hobbies:
96 | print hobby.text
97 |
98 | # 3. Extract Brandon's hobby that has the id "my favorite".
99 | b.find(name='li', attrs={'id':'my favorite'})
100 |
101 |
102 | '''
103 | Beautiful Soup from the web
104 | '''
105 |
106 | # We see data on a web page that we want to get. First we need the HTML.
107 | # This downloads the HTML and puts it into the variable r
108 | r = requests.get('http://www.imdb.com/title/tt1856010/')
109 | # But when we look at it, it's just one giant string.
110 | type(r.text) # Unicode string
111 | r.text[0:200]
112 |
113 | # Beautiful soup allows us to create a structured object out of this string
114 | b = BeautifulSoup(r.text)
115 | type(b)
116 |
117 |
118 | '''
119 | "find" and "findAll" with the 'name' parameter in Beautiful Soup
120 | '''
121 | b.find(name='body') # Find a specific HTML tag
122 | body = b.find(name='body') # Store the output of your "find"
123 | type(body) # Let's look at the type
124 |
125 | # Can we still run another "find" command on the output?
126 | img = body.find('img') # Find the image tags
127 | img
128 | type(img)
129 | # Yes, but it only finds one of the "img" tags. We want them all.
130 | imgs = body.findAll(name='img')
131 | imgs # Now we get them all.
132 | type(imgs) # Resultsets are a lot like Python lists
133 |
134 | # Let's look at each individual image
135 | imgs[0]
136 | imgs[1]
137 |
138 | # We're really interested in the 'src' attribute, the actual image location.
139 | # How do we access attributes in a Python object? Using dot notation or
140 | # brackets. With Beautiful Soup, we must use the brackets.
141 | imgs[0]['src']
142 |
143 | # Now we can look through each image and print the 'src' attribute.
144 | for img in imgs:
145 | print img['src']
146 |
147 | # Or maybe we want to create a list of all of the 'src' attributes
148 | src_list = []
149 | for img in imgs:
150 | src_list.append(img['src'])
151 |
152 | len(src_list)
153 |
154 |
155 | '''
156 | "find" and "findAll" with the 'attrs' parameter in Beautiful Soup
157 | '''
158 | # Now let's talk about 'attrs'
159 | # Beautiful soup also allows us to choose tags with specific attributes
160 | title = b.find(name="span", attrs={"class":"itemprop", "itemprop":"name"})
161 | title # Prints HTML matching that tag, but we want the actual name
162 | title.text # The "text" attribute gives you the text between two HTML tags
163 |
164 | star_rating = b.find(name="div", attrs={"class":"titlePageSprite star-box-giga-star"})
165 | # How do I get the actual star_rating number?
166 | star_rating.text
167 |
168 | # How do I make this star_rating a number instead of a string?
169 | float(star_rating.text)
170 |
171 | ##########################################
172 | ############ Exercise 2 ############
173 | ##########################################
174 | '''
175 | We've retrieved the title of the show, but now we want the show's content rating,
176 | duration, and genre. Using "find" and "findAll", write code that retrieves
177 | each of these things.
178 | Hint: Everything can be found in the "infobar". Try finding that first and
179 | searching within it.
180 | '''
181 |
182 | infobar = b.find(name="div", attrs={"class":"infobar"})
183 | # Retrieve the show's content rating
184 | content_rating = infobar.find(name='meta', attrs={"itemprop":"contentRating"})['content']
185 |
186 | # Retrieve the show's duration
187 | duration = infobar.find(name='time', attrs={"itemprop":"duration"}).text
188 |
189 | # Retrieve the show's genre
190 | genre = infobar.find(name='span', attrs={"itemprop":"genre"}).text
191 |
192 |
193 | '''
194 | Looping through 'findAll' results
195 | '''
196 | # Now we want to get the list of actors and actresses
197 | # First let's get the "div" block with all of the actor info
198 | actors_raw = b.find(name='div', attrs={"class":"txt-block", "itemprop":"actors", "itemscope":"", "itemtype":"http://schema.org/Person"})
199 |
200 | # Now let's find all of the occurrences of the "span" with "itemprop" "name",
201 | # meaning the tags with actors' and actresses' names.
202 | actors = actors_raw.findAll(name="span", attrs={"itemprop":"name"})
203 |
204 | # Now we want to loop through each one and get the text inside the tags
205 | actors_list = [actor.text for actor in actors]
206 |
207 | '''
208 | Creating a "Web Scraping" Function
209 | The code we've written above is useful, but we don't want to have to type it
210 | every time. We want to create a function that takes a URL and outputs the
211 | pieces we want.
212 | '''
213 |
214 | def getIMDBInfo(url):
215 | r = requests.get(url) # Get HTML
216 | b = BeautifulSoup(r.text) # Create Beautiful Soup object
217 | # Get various attributes and put them in dictionary
218 | results = {} # Initialize empty dictionary
219 |
220 | # Get the title
221 | results['title'] = b.find(name="span", attrs={"class":"itemprop", "itemprop":"name"}).text
222 |
223 | # Rating
224 | results['star_rating'] = float(b.find(name="div", attrs={"class":"titlePageSprite"}).text)
225 |
226 | # Actors/actresses
227 | actors_raw = b.find(name='div', attrs={"class":"txt-block", "itemprop":"actors", "itemscope":"", "itemtype":"http://schema.org/Person"})
228 | actors = actors_raw.findAll(name="span", attrs={"class":"itemprop", "itemprop":"name"})
229 | results['actors_list'] = [actor.text for actor in actors]
230 |
231 | # Content Rating
232 | infobar = b.find(name="div", attrs={"class":"infobar"})
233 | results['content_rating'] = infobar.find(name='meta', attrs={"itemprop":"contentRating"})['content']
234 |
235 | # Show duration
236 | results['duration'] = int(infobar.find(name='time', attrs={"itemprop":"duration"}).text.strip()[:-4]) # drop the last 4 characters (the units) and convert to int
237 |
238 | # Genre
239 | results['genre'] = infobar.find(name='span', attrs={"itemprop":"genre"}).text
240 |
241 | # Return dictionary
242 | return results
243 |
244 | # Let's see if it worked
245 | # We can look at the results of our previous web page, "House of Cards"
246 | getIMDBInfo('http://www.imdb.com/title/tt1856010/')
247 | # Now let's try another one: Interstellar
248 | getIMDBInfo('http://www.imdb.com/title/tt0816692/')
249 |
250 | # Now let's show the true functionality
251 | list_of_title_urls = []
252 | with open('imdb_movie_urls.csv', 'rU') as f:
253 | list_of_title_urls = f.read().split('\n')
254 |
255 | # Let's get the data for each title in the list
256 | data = []
257 | for title_url in list_of_title_urls:
258 | imdb_data = getIMDBInfo(title_url)
259 | data.append(imdb_data)
260 |
261 | column_names = ['star_rating', 'title', 'content_rating', 'genre', 'duration', 'actors_list']
262 | movieRatings = pd.DataFrame(data, columns = column_names)
263 | movieRatings
264 | # Now we have some data we can begin exploring, aggregating, etc.
265 |
266 |
267 | '''
268 | Bonus material: Getting movie data for the top 1000 movies on IMDB
269 | '''
270 |
271 | # Or let's build another webscraper to get the IMDB top 1000
272 | movie_links = [] # Create empty list
273 | # Notice that we are creating a list [1,101,201,...] and changing the URL slightly each time.
274 | for i in range(1,1000,100):
275 | # Get url
276 | r = requests.get('http://www.imdb.com/search/title?groups=top_1000&sort=user_rating&start=' + str(i) + '&view=simple') # Get HTML
277 | b = BeautifulSoup(r.text) # Create Beautiful Soup object
278 | links = b.findAll(name='td', attrs={'class':'title'}) # Find all 'td's with 'class'='title'
279 | for link in links:
280 | a_link = link.find('a') # Find links
281 | movie_links.append('http://www.imdb.com' + str(a_link['href'])) # Add link to list
282 |
283 | # Create dataframe of the top 1000 movies on IMDB
284 | # NOTE: This could take 5-10 minutes. You can skip this part as I've already
285 | # pulled all of this data and saved it to a file.
286 | data = []
287 | j=0
288 | # Loop through every movie title
289 | for movie_link in movie_links:
290 | try:
291 | imdb_data = getIMDBInfo(movie_link) # Get movie data
292 | data.append(imdb_data) # Put movie data in list
293 | except:
294 | pass
295 | j += 1
296 | if j%50 == 0:
297 | print 'Completed ' + str(j) + ' titles!' # Print progress
298 |
299 | # Create data frame with movies
300 | column_names = ['star_rating', 'title', 'content_rating', 'genre', 'duration', 'actors_list']
301 | movieRatingsTop1000 = pd.DataFrame(data, columns = column_names)
302 |
303 | # Read in the previously created dataframe
304 | movieRatingsTop1000 = pd.read_csv('imdb_movie_ratings_top_1000.csv')
305 |
306 | # Now you're ready to do some analysis
307 | movieRatingsTop1000.describe()
308 | movieRatingsTop1000.groupby('genre').star_rating.mean()
309 | movieRatingsTop1000.groupby('content_rating').star_rating.mean()
310 | movieRatingsTop1000.plot(kind='scatter', x='duration', y='star_rating')
311 | plt.show()
--------------------------------------------------------------------------------
/code/10_logistic_regression_confusion_matrix.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: Logistic Regression and Confusion Matrix
3 | '''
4 |
5 | ###############################################################################
6 | ### Logistic Regression
7 | ###############################################################################
8 |
9 | # Imports
10 | import pandas as pd
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.cross_validation import train_test_split
13 | from sklearn import metrics
14 | from math import exp
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 |
18 | # Read in data
19 | data = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/default.csv')
20 | data.head()
21 | # Change column to number
22 | data['student_bin'] = data.student.map({'No':0, 'Yes':1})
23 |
24 | # Let's do some cursory analysis.
25 | data.groupby('default').balance.mean()
26 | data.groupby('default').income.mean()
27 |
28 | # Set X and y
29 | feature_cols = ['balance', 'income','student_bin']
30 | X = data[feature_cols]
31 | y = data.default
32 |
33 | # Train test split
34 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)
35 |
36 | # Fit model
37 | logreg = LogisticRegression()
38 | logreg.fit(X_train, y_train)
39 | y_pred = logreg.predict(X_test) # Predict
40 |
41 | # Assess accuracy
42 | print metrics.accuracy_score(y_test, y_pred)
43 |
44 |
45 | ###############################################################################
46 | ### Null Accuracy Rate
47 | ###############################################################################
48 |
49 | # Compare to null accuracy rate. The null accuracy rate is the accuracy if I
50 | # predict all the majority class. If there are more 1's, I predict all 1's.
51 | # If there are more 0's, I predict all 0's. There are several ways to do this.
52 |
53 | # 1. Create a vector of majority class and use the accuracy_score.
54 | # "If I predicted all 0's, how accurate would I be?
55 | print metrics.accuracy_score(y_test, [0]*len(y_test))
56 |
57 | # 2. Calculate the mean of y_test (AKA the percentage of 1's)
58 | y_test.mean()
59 | # One minus that number will be the percentage of 0's. This means that if you
60 | # predict all 0's, your accuracy will be 1 - y_test.mean().
61 | 1 - y_test.mean()
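# More generally, the null accuracy is max(y_test.mean(), 1 - y_test.mean()),
# i.e. the proportion of whichever class is the majority in y_test.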
62 |
63 | # This puts our accuracy score into context a bit. We can now see that we
64 | # actually didn't do so great!
65 |
66 |
67 | ###############################################################################
68 | ### Interpreting Logistic Regression Coefficients
69 | ###############################################################################
70 |
71 | # Let's look at the coefficients
72 | for col in zip(feature_cols, logreg.coef_[0]):
73 | print col[0], col[1]
74 |
75 | # Let's interpret those.
76 | for col in zip(feature_cols, logreg.coef_[0]):
77 | print 'A unit increase in', col[0], 'multiplies the odds of default by', exp(col[1])
78 |
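# A worked sketch of that interpretation: for a feature x with coefficient b,
# odds(x) = exp(intercept + b*x), so odds(x+1) / odds(x) = exp(b). In other words,
# a one-unit increase in x multiplies the odds of default by exp(b), holding the
# other features constant.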
79 | ###############################################################################
80 | ### Confusion Matrix
81 | ###############################################################################
82 |
83 | # Let's look at the confusion matrix
84 | con_mat = metrics.confusion_matrix(y_test, y_pred)
85 | print con_mat
86 |
87 | # Let's define our true positives, false positives, true negatives, and false negatives
88 | true_neg = con_mat[0][0]
89 | false_neg = con_mat[1][0]
90 | true_pos = con_mat[1][1]
91 | false_pos = con_mat[0][1]
92 |
93 | # Sensitivity: percent of correct predictions when reference value is 'default'
94 | sensitivity = float(true_pos)/(false_neg + true_pos)
95 | print sensitivity
96 | print metrics.recall_score(y_test, y_pred)
97 |
98 | # Specificity: percent of correct predictions when reference value is 'not default'
99 | specificity = float(true_neg) / (true_neg + false_pos)
100 | print specificity
101 |
102 | ###############################################################################
103 | ### Logistic Regression Thresholds
104 | ###############################################################################
105 |
106 | # Logistic regression is actually predicting the underlying probability.
107 | # However, when you call the "predict" function, it returns class labels. You
108 | # can still predict the actual probability and set your own threshold if you'd
109 | # like. This can be useful in cases where the "signal" from the model isn't
110 | # strong.
111 |
112 | # Predict probabilities
113 | logreg.predict_proba(X_test).shape
114 | probs = logreg.predict_proba(X_test)[:, 1]
115 |
116 | # The natural threshold for probability is 0.5, but you don't have to use
117 | # that.
118 |
119 | # Use 0.5 threshold for predicting 'default' and confirm we get the same results
120 | preds_05 = np.where(probs >= 0.5, 1, 0)
121 | print metrics.accuracy_score(y_test, preds_05)
122 | con_mat_05 = metrics.confusion_matrix(y_test, preds_05)
123 | print con_mat_05
124 |
125 | # Let's look at a histogram of these probabilities.
126 | plt.hist(probs, bins=20)
127 | plt.title('Distribution of Probabilities')
128 | plt.xlabel('Probability')
129 | plt.ylabel('Frequency')
130 | plt.show()
131 |
132 | # Change cutoff for predicting default to 0.2
133 | preds_02 = np.where(probs > 0.2, 1, 0)
134 | delta = float((preds_02 != preds_05).sum())/len(X_test)*100
135 | print 'Changing the threshold from 0.5 to 0.2 changed %.2f percent of the predictions.' % delta
136 |
137 | # Check the new accuracy, sensitivity, specificity
138 | print metrics.accuracy_score(y_test, preds_02)
139 | con_mat_02 = metrics.confusion_matrix(y_test, preds_02)
140 | print con_mat_02
141 |
142 | # Let's define our true positives, false positives, true negatives, and false negatives
143 | true_neg = con_mat_02[0][0]
144 | false_neg = con_mat_02[1][0]
145 | true_pos = con_mat_02[1][1]
146 | false_pos = con_mat_02[0][1]
147 |
148 | # Sensitivity: percent of correct predictions when reference value is 'default'
149 | sensitivity = float(true_pos)/(false_neg + true_pos)
150 | print sensitivity
151 | print metrics.recall_score(y_test, preds_02)
152 |
153 | # Specificity: percent of correct predictions when reference value is 'not default'
154 | specificity = float(true_neg) / (true_neg + false_pos)
155 | print specificity
156 |
157 |
158 | ###############################################################################
159 | ### Exercise/Possibly Homework
160 | ###############################################################################
161 |
162 | '''
163 | Let's use the glass identification dataset again. We've previously run knn
164 | on this dataset. Now, let's try logistic regression. Access the dataset at
165 | http://archive.ics.uci.edu/ml/datasets/Glass+Identification. Complete the
166 | following tasks or answer the following questions.
167 | '''
168 | '''
169 | 1. Read the data into a pandas dataframe.
170 | '''
171 | # Taken from Kevin's 07 HW solution
172 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data',
173 | names=['id','ri','na','mg','al','si','k','ca','ba','fe','glass_type'],
174 | index_col='id')
175 |
176 | '''
177 | 2. Explore the data and look at what columns are available.
178 | '''
179 | # Taken from Kevin's 07 HW solution
180 | df.shape # 214 x 10
181 | df.head()
182 | df.tail()
183 | df.glass_type.value_counts()
184 | df.isnull().sum() # No nulls in our data
185 |
186 | '''
187 | 3. Convert the 'glass type' column into a binary response.
188 | * If type of glass = 1/2/3/4, binary=0.
189 | * If type of glass = 5/6/7, binary=1.
190 | '''
191 | # Taken from Kevin's 07 HW solution
192 | df['binary'] = np.where(df.glass_type < 5, 0, 1) # method 1
193 | df['binary'] = df.glass_type.map({1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1}) # method 2
194 | df.binary.value_counts()
195 |
196 | '''
197 | 4. Create a feature matrix and a response vector.
198 | '''
199 | # Taken from Kevin's 07 HW solution
200 | features = ['ri','na','mg','al','si','k','ca','ba','fe'] # create a list of features
201 | features = df.columns[:-2] # alternative way: slice 'columns' attribute like a list
202 | X = df[features] # create DataFrame X by only selecting features
203 | y = df.binary
204 |
205 | '''
206 | 5. Split the data into the appropriate training and testing sets.
207 | '''
208 | # Taken from Kevin's 07 HW solution
209 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)
210 |
211 | '''
212 | 6. Create and fit a logistic regression model.
213 | '''
214 | logreg = LogisticRegression() # Instantiate estimator
215 | logreg.fit(X_train, y_train) # Fit data
216 |
217 | '''
218 | 7. Make predictions with your new model.
219 | '''
220 | y_pred = logreg.predict(X_test) # Create predictions
221 |
222 | '''
223 | 8. Calculate the accuracy rate of your model and compare it to the null accuracy.
224 | '''
225 | # Calculate accuracy of model
226 | metrics.accuracy_score(y_test, y_pred)
227 |
228 | # Calculate null accuracy
229 | metrics.accuracy_score(y_test, [0]*len(y_test))
230 |
231 | '''
232 | 9. Generate a confusion matrix for your predictions. Use this to calculate the
233 | sensitivity and specificity of your model.
234 | '''
235 | # Let's look at the confusion matrix
236 | con_mat = metrics.confusion_matrix(y_test, y_pred)
237 | print con_mat
238 |
239 | # Let's define our true positives, false positives, true negatives, and false negatives
240 | true_neg = con_mat[0][0]
241 | false_neg = con_mat[1][0]
242 | true_pos = con_mat[1][1]
243 | false_pos = con_mat[0][1]
244 |
245 | # Sensitivity: percent of correct predictions when the true value is 1 (glass types 5/6/7)
246 | sensitivity = float(true_pos)/(false_neg + true_pos)
247 | print sensitivity
248 |
249 | # Specificity: percent of correct predictions when the true value is 0 (glass types 1/2/3/4)
250 | specificity = float(true_neg) / (true_neg + false_pos)
251 | print specificity
--------------------------------------------------------------------------------
/code/13_naive_bayes.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: Naive Bayes SMS spam classifier
3 | DATA SOURCE: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
4 | '''
5 |
6 | ## READING IN THE DATA
7 |
8 | # read tab-separated file using pandas
9 | import pandas as pd
10 | df = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/SMSSpamCollection.txt',
11 | sep='\t', header=None, names=['label', 'msg'])
12 |
13 | # examine the data
14 | df.head(20)
15 | df.label.value_counts()
16 | df.msg.describe()
17 |
18 | # convert label to a binary variable
19 | df['label'] = df.label.map({'ham':0, 'spam':1})
20 | df.head()
21 |
22 | # split into training and testing sets
23 | from sklearn.cross_validation import train_test_split
24 | X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1)
25 | X_train.shape
26 | X_test.shape
27 |
28 |
29 | ## COUNTVECTORIZER: 'convert text into a matrix of token counts'
30 | ## http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
31 |
32 | from sklearn.feature_extraction.text import CountVectorizer
33 |
34 | # start with a simple example
35 | train_simple = ['call you tonight',
36 | 'Call me a cab',
37 | 'please call me... PLEASE!']
38 |
39 | # learn the 'vocabulary' of the training data
40 | vect = CountVectorizer()
41 | vect.fit(train_simple)
42 | vect.get_feature_names()
43 |
44 | # transform training data into a 'document-term matrix'
45 | train_simple_dtm = vect.transform(train_simple)
46 | train_simple_dtm
47 | train_simple_dtm.toarray()
48 |
49 | # examine the vocabulary and document-term matrix together
50 | pd.DataFrame(train_simple_dtm.toarray(), columns=vect.get_feature_names())
51 |
52 | # transform testing data into a document-term matrix (using existing vocabulary)
53 | test_simple = ["please don't call me"]
54 | test_simple_dtm = vect.transform(test_simple)
55 | test_simple_dtm.toarray()
56 | pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names())
57 |
58 |
59 | ## REPEAT PATTERN WITH SMS DATA
60 |
61 | # instantiate the vectorizer
62 | vect = CountVectorizer()
63 |
64 | # learn vocabulary and create document-term matrix in a single step
65 | train_dtm = vect.fit_transform(X_train)
66 | train_dtm
67 |
68 | # transform testing data into a document-term matrix
69 | test_dtm = vect.transform(X_test)
70 | test_dtm
71 |
72 | # store feature names and examine them
73 | train_features = vect.get_feature_names()
74 | len(train_features)
75 | train_features[:50]
76 | train_features[-50:]
77 |
78 | # convert train_dtm to a regular array
79 | train_arr = train_dtm.toarray()
80 | train_arr
81 |
82 |
83 | ## SIMPLE SUMMARIES OF THE TRAINING DATA
84 |
85 | # refresher on NumPy
86 | import numpy as np
87 | arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
88 | arr
89 | arr[0, 0]
90 | arr[1, 3]
91 | arr[0, :]
92 | arr[:, 0]
93 | np.sum(arr)
94 | np.sum(arr, axis=0)
95 | np.sum(arr, axis=1)
96 |
97 | # exercise: calculate the number of tokens in the 0th message in train_arr
98 | sum(train_arr[0, :])
99 |
100 | # exercise: count how many times the 0th token appears across ALL messages in train_arr
101 | sum(train_arr[:, 0])
102 |
103 | # exercise: count how many times EACH token appears across ALL messages in train_arr
104 | np.sum(train_arr, axis=0)
105 |
106 | # exercise: create a DataFrame of tokens with their counts
107 | train_token_counts = pd.DataFrame({'token':train_features, 'count':np.sum(train_arr, axis=0)})
108 | train_token_counts.sort('count', ascending=False)
109 |
110 |
111 | ## MODEL BUILDING WITH NAIVE BAYES
112 | ## http://scikit-learn.org/stable/modules/naive_bayes.html
113 |
114 | # train a Naive Bayes model using train_dtm
115 | from sklearn.naive_bayes import MultinomialNB
116 | nb = MultinomialNB()
117 | nb.fit(train_dtm, y_train)
118 |
119 | # make predictions on test data using test_dtm
120 | y_pred = nb.predict(test_dtm)
121 | y_pred
122 |
123 | # compare predictions to true labels
124 | from sklearn import metrics
125 | print metrics.accuracy_score(y_test, y_pred)
126 | print metrics.confusion_matrix(y_test, y_pred)
127 |
128 | # predict (poorly calibrated) probabilities and calculate AUC
129 | y_prob = nb.predict_proba(test_dtm)[:, 1]
130 | y_prob
131 | print metrics.roc_auc_score(y_test, y_prob)
132 |
133 | # exercise: show the message text for the false positives
134 | X_test[y_test < y_pred]
135 |
136 | # exercise: show the message text for the false negatives
137 | X_test[y_test > y_pred]
138 |
139 |
140 | ## COMPARE NAIVE BAYES AND LOGISTIC REGRESSION
141 | ## USING ALL DATA AND CROSS-VALIDATION
142 |
143 | # create a document-term matrix using all data
144 | all_dtm = vect.fit_transform(df.msg)
145 |
146 | # instantiate logistic regression
147 | from sklearn.linear_model import LogisticRegression
148 | logreg = LogisticRegression()
149 |
150 | # compare AUC using cross-validation
151 | # note: this is slightly improper cross-validation... can you figure out why?
152 | from sklearn.cross_validation import cross_val_score
153 | cross_val_score(nb, all_dtm, df.label, cv=10, scoring='roc_auc').mean()
154 | cross_val_score(logreg, all_dtm, df.label, cv=10, scoring='roc_auc').mean()
155 |
156 |
157 | ## EXERCISE: CALCULATE THE 'SPAMMINESS' OF EACH TOKEN
158 |
159 | # create separate DataFrames for ham and spam
160 | df_ham = df[df.label==0]
161 | df_spam = df[df.label==1]
162 |
163 | # learn the vocabulary of ALL messages and save it
164 | vect.fit(df.msg)
165 | all_features = vect.get_feature_names()
166 |
167 | # create document-term matrix of ham, then convert to a regular array
168 | ham_dtm = vect.transform(df_ham.msg)
169 | ham_arr = ham_dtm.toarray()
170 |
171 | # create document-term matrix of spam, then convert to a regular array
172 | spam_dtm = vect.transform(df_spam.msg)
173 | spam_arr = spam_dtm.toarray()
174 |
175 | # count how many times EACH token appears across ALL messages in ham_arr
176 | ham_counts = np.sum(ham_arr, axis=0)
177 |
178 | # count how many times EACH token appears across ALL messages in spam_arr
179 | spam_counts = np.sum(spam_arr, axis=0)
180 |
181 | # create a DataFrame of tokens with their separate ham and spam counts
182 | all_token_counts = pd.DataFrame({'token':all_features, 'ham':ham_counts, 'spam':spam_counts})
183 |
184 | # add one to ham counts and spam counts so that ratio calculations (below) make more sense
185 | all_token_counts['ham'] = all_token_counts.ham + 1
186 | all_token_counts['spam'] = all_token_counts.spam + 1
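# note: this add-one adjustment mirrors the Laplace smoothing that MultinomialNB
# applies by default through its alpha=1.0 parameter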
187 |
188 | # calculate ratio of spam-to-ham for each token
189 | all_token_counts['spam_ratio'] = all_token_counts.spam / all_token_counts.ham
190 | all_token_counts.sort('spam_ratio')
191 |
--------------------------------------------------------------------------------
/code/15_kaggle.py:
--------------------------------------------------------------------------------
1 | '''
2 | CLASS: Kaggle Stack Overflow competition
3 | '''
4 |
5 | # read in the file and set the first column as the index
6 | import pandas as pd
7 | train = pd.read_csv('train.csv', index_col=0)
8 | train.head()
9 |
10 |
11 | '''
12 | What are some assumptions and theories to test?
13 |
14 | PostId: unique within the dataset
15 | OwnerUserId: not unique within the dataset, assigned in order
16 | OwnerCreationDate: users with older accounts have more open questions
17 | ReputationAtPostCreation: higher reputation users have more open questions
18 | OwnerUndeletedAnswerCountAtPostTime: users with more answers have more open questions
19 | Tags: 1 to 5 tags are required, many unique tags
20 | PostClosedDate: should only exist for closed questions
21 | OpenStatus: 1 means open
22 | '''
23 |
24 | ## OPEN STATUS
25 |
26 | # dataset is perfectly balanced in terms of OpenStatus (not a representative sample)
27 | train.OpenStatus.value_counts()
28 |
29 |
30 | ## USER ID
31 |
32 | # OwnerUserId is not unique within the dataset, let's examine the top 3 users
33 | train.OwnerUserId.value_counts()
34 |
35 | # mostly closed questions, all lowercase, lots of spelling errors
36 | train[train.OwnerUserId==466534]
37 |
38 | # fewer closed questions, better grammar, high reputation but few answers
39 | train[train.OwnerUserId==39677]
40 |
41 | # very few closed questions, lots of answers
42 | train[train.OwnerUserId==34537]
43 |
44 |
45 | ## REPUTATION
46 |
47 | # ReputationAtPostCreation is higher for open questions: possibly use as a feature
48 | train.groupby('OpenStatus').ReputationAtPostCreation.describe()
49 |
50 | # not a useful histogram
51 | train.ReputationAtPostCreation.hist()
52 |
53 | # much more useful histogram
54 | train[train.ReputationAtPostCreation < 1000].ReputationAtPostCreation.hist()
55 |
56 | # grouped histogram
57 | train[train.ReputationAtPostCreation < 1000].ReputationAtPostCreation.hist(by=train.OpenStatus, sharey=True)
58 |
59 |
60 | ## ANSWER COUNT
61 |
62 | # rename column
63 | train.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True)
64 |
65 | # Answers is higher for open questions: possibly use as a feature
66 | train.groupby('OpenStatus').Answers.describe()
67 |
68 | # grouped histogram
69 | train[train.Answers < 50].Answers.hist(by=train.OpenStatus, sharey=True)
70 |
71 |
72 | ## USER ID
73 |
74 | # OwnerUserId is assigned in numerical order
75 | train.sort('OwnerUserId').OwnerCreationDate
76 |
77 | # OwnerUserId is lower for open questions: possibly use as a feature
78 | train.groupby('OpenStatus').OwnerUserId.describe()
79 |
80 |
81 | ## TITLE
82 |
83 | # create a new feature that represents the length of the title (in characters)
84 | train['TitleLength'] = train.Title.apply(len)
85 |
86 | # Title is longer for open questions: possibly use as a feature
87 | train.TitleLength.hist(by=train.OpenStatus)
88 |
89 |
90 | ## BODY
91 |
92 | # create a new feature that represents the length of the body (in characters)
93 | train['BodyLength'] = train.BodyMarkdown.apply(len)
94 |
95 | # BodyMarkdown is longer for open questions: possibly use as a feature
96 | train.BodyLength.hist(by=train.OpenStatus)
97 |
98 |
99 | ## TAGS
100 |
101 | # Tag1 is required, and the rest are optional
102 | train.isnull().sum()
103 |
104 | # there are over 5000 unique tags
105 | len(train.Tag1.unique())
106 |
107 | # calculate the percentage of open questions for each tag
108 | train.groupby('Tag1').OpenStatus.mean()
109 |
110 | # percentage of open questions varies widely by tag (among popular tags)
111 | train.groupby('Tag1').OpenStatus.agg(['mean','count']).sort('count')
112 |
113 | # create a new feature that represents the number of tags for each question
114 | train['NumTags'] = train.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)
115 |
116 | # NumTags is higher for open questions: possibly use as a feature
117 | train.NumTags.hist(by=train.OpenStatus)
118 |
119 |
120 | '''
121 | Define a function that takes in a raw CSV file and returns a DataFrame that
122 | includes all created features (and any other modifications). That way, we
123 | can apply the same changes to both train.csv and test.csv.
124 | '''
125 |
126 | # define the function
127 | def make_features(filename):
128 | df = pd.read_csv(filename, index_col=0)
129 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True)
130 | df['TitleLength'] = df.Title.apply(len)
131 | df['BodyLength'] = df.BodyMarkdown.apply(len)
132 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)
133 | return df
134 |
135 | # apply function to both training and testing files
136 | train = make_features('train.csv')
137 | test = make_features('test.csv')
138 |
139 |
140 | '''
141 | Use train/test split to compare a model that includes 1 feature with a model
142 | that includes 5 features.
143 | '''
144 |
145 | ## ONE FEATURE
146 |
147 | # define X and y
148 | feature_cols = ['ReputationAtPostCreation']
149 | X = train[feature_cols]
150 | y = train.OpenStatus
151 |
152 | # split into training and testing sets
153 | from sklearn.cross_validation import train_test_split
154 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
155 |
156 | # fit a logistic regression model
157 | from sklearn.linear_model import LogisticRegression
158 | logreg = LogisticRegression()
159 | logreg.fit(X_train, y_train)
160 |
161 | # examine the coefficient to check that it makes sense
162 | logreg.coef_
163 |
164 | # predict response classes and predict class probabilities
165 | y_pred = logreg.predict(X_test)
166 | y_prob = logreg.predict_proba(X_test)[:, 1]
167 |
168 | # check how well we did
169 | from sklearn import metrics
170 | metrics.accuracy_score(y_test, y_pred) # 0.538 (better than guessing)
171 | metrics.confusion_matrix(y_test, y_pred) # predicts closed most of the time
172 | metrics.roc_auc_score(y_test, y_prob) # 0.602 (not horrible)
173 | metrics.log_loss(y_test, y_prob) # 0.690 (what is this?)
174 |
175 | # log loss is the competition's evaluation metric, so let's get a feel for it
176 | true = [0, 0, 1, 1]
177 | prob = [0.1, 0.2, 0.8, 0.9]
178 | metrics.log_loss(true, prob) # 0.164 (lower is better)
179 |
180 | # let's try a few other predicted probabilities and check the log loss
181 | prob = [0.4, 0.4, 0.6, 0.6] # 0.511 (predictions are right, but less confident)
182 | prob = [0.4, 0.4, 0.4, 0.6] # 0.612 (one wrong prediction that is a bit off)
183 | prob = [0.4, 0.4, 0.1, 0.6] # 0.959 (one wrong prediction that is way off)
184 | prob = [0.5, 0.5, 0.5, 0.5] # 0.693 (you can get this score without a model)
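# For reference, a quick sketch of the formula: log loss for binary labels is
# -mean(y*log(p) + (1-y)*log(1-p)), which is why constant 0.5 predictions score
# -log(0.5) = 0.693 regardless of the true labels (array names here are illustrative only)
import numpy as np
true_arr = np.array(true)
prob_arr = np.array(prob)
-np.mean(true_arr * np.log(prob_arr) + (1 - true_arr) * np.log(1 - prob_arr))  # 0.693 for the constant 0.5 predictions above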
185 |
186 |
187 | ## FIVE FEATURES
188 |
189 | # define X and y
190 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
191 | X = train[feature_cols]
192 | y = train.OpenStatus
193 |
194 | # split into training and testing sets
195 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
196 |
197 | # fit a logistic regression model
198 | logreg.fit(X_train, y_train)
199 |
200 | # examine the coefficients to check that they make sense
201 | logreg.coef_
202 |
203 | # predict response classes and predict class probabilities
204 | y_pred = logreg.predict(X_test)
205 | y_prob = logreg.predict_proba(X_test)[:, 1]
206 |
207 | # check how well we did
208 | metrics.accuracy_score(y_test, y_pred) # 0.589 (doing better)
209 | metrics.confusion_matrix(y_test, y_pred) # predicts open more often
210 | metrics.roc_auc_score(y_test, y_prob) # 0.625 (tiny bit better)
211 | metrics.log_loss(y_test, y_prob) # 0.677 (a bit better)
212 |
213 | # let's see if cross-validation gives us similar results
214 | from sklearn.cross_validation import cross_val_score
215 | scores = cross_val_score(logreg, X, y, scoring='log_loss', cv=10)
216 | scores.mean() # 0.677 (identical to train/test split)
217 | scores.std() # very small
218 |
219 |
220 | '''
221 | Use the model with 5 features to make a submission
222 | '''
223 |
224 | # make sure that X and y are defined properly
225 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
226 | X = train[feature_cols]
227 | y = train.OpenStatus
228 |
229 | # train the model on ALL data (not X_train and y_train)
230 | logreg.fit(X, y)
231 |
232 | # predict class probabilities for the actual testing data (not X_test)
233 | y_prob = logreg.predict_proba(test[feature_cols])[:, 1]
234 |
235 | # sample submission file indicates we need two columns: PostId and predicted probability
236 | test.index # PostId
237 | y_prob # predicted probability
238 |
239 | # create a DataFrame that has 'id' as the index, then export to a CSV file
240 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id')
241 | sub.to_csv('sub1.csv')
242 |
243 |
244 | '''
245 | Create a few more features from Title
246 | '''
247 |
248 | # string methods for a Series are accessed via 'str'
249 | train.Title.str.lower()
250 |
251 | # create a new feature that represents whether a Title is all lowercase
252 | train['TitleLowercase'] = (train.Title.str.lower() == train.Title).astype(int)
253 |
254 | # check if there are a meaningful number of ones
255 | train.TitleLowercase.value_counts()
256 |
257 | # percentage of open questions is lower among questions with lowercase titles: possibly use as a feature
258 | train.groupby('TitleLowercase').OpenStatus.mean()
259 |
260 | # create features that represent whether Title contains certain words
261 | train['TitleQuestion'] = train.Title.str.contains('question', case=False).astype(int)
262 | train['TitleNeed'] = train.Title.str.contains('need', case=False).astype(int)
263 | train['TitleHelp'] = train.Title.str.contains('help', case=False).astype(int)
264 |
265 |
266 | '''
267 | Build a document-term matrix from Title using CountVectorizer
268 | '''
269 |
270 | # define X and y
271 | X = train.Title
272 | y = train.OpenStatus
273 |
274 | # split into training and testing sets
275 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
276 |
277 | # use CountVectorizer with the default settings
278 | from sklearn.feature_extraction.text import CountVectorizer
279 | vect = CountVectorizer()
280 |
281 | # fit and transform on X_train, but only transform on X_test
282 | train_dtm = vect.fit_transform(X_train)
283 | test_dtm = vect.transform(X_test)
284 |
285 | # try a Naive Bayes model
286 | from sklearn.naive_bayes import MultinomialNB
287 | nb = MultinomialNB()
288 | nb.fit(train_dtm, y_train)
289 | y_prob = nb.predict_proba(test_dtm)[:, 1]
290 | metrics.log_loss(y_test, y_prob) # 0.659 (a bit better than our previous model)
291 |
292 | # try tuning CountVectorizer and repeat Naive Bayes
293 | vect = CountVectorizer(stop_words='english')
294 | train_dtm = vect.fit_transform(X_train)
295 | test_dtm = vect.transform(X_test)
296 | nb.fit(train_dtm, y_train)
297 | y_prob = nb.predict_proba(test_dtm)[:, 1]
298 | metrics.log_loss(y_test, y_prob) # 0.637 (even better)
299 |
300 | # try switching to logistic regression
301 | logreg.fit(train_dtm, y_train)
302 | y_prob = logreg.predict_proba(test_dtm)[:, 1]
303 | metrics.log_loss(y_test, y_prob) # 0.573 (much better!)
304 |
305 |
306 | '''
307 | Create features from BodyMarkdown using TextBlob
308 | '''
309 |
310 | # examine BodyMarkdown for first question
311 | train.iloc[0].BodyMarkdown
312 |
313 | # calculate the number of sentences in that question using TextBlob
314 | from textblob import TextBlob
315 | len(TextBlob(train.iloc[0].BodyMarkdown).sentences)
316 |
317 | # calculate the number of sentences for all questions (raises an error)
318 | train.BodyMarkdown.apply(lambda x: len(TextBlob(x).sentences))
319 |
320 | # explicitly decode string to unicode to fix error (WARNING: VERY SLOW)
321 | train['BodySentences'] = train.BodyMarkdown.apply(lambda x: len(TextBlob(x.decode('utf-8')).sentences))
322 |
--------------------------------------------------------------------------------
/code/17_ensembling_exercise.py:
--------------------------------------------------------------------------------
1 | # Helper code for class 17 exercise
2 |
3 | # define the function
4 | def make_features(filename):
5 | df = pd.read_csv(filename, index_col=0)
6 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True)
7 | df['TitleLength'] = df.Title.apply(len)
8 | df['BodyLength'] = df.BodyMarkdown.apply(len)
9 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)
10 | return df
11 |
12 | # apply function to both training and testing files
13 | train = make_features('train.csv')
14 | test = make_features('test.csv')
15 |
16 | # define X and y
17 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
18 | X = train[feature_cols]
19 | y = train.OpenStatus
20 |
21 | ###############################################################################
22 | ##### Create some models with the derived features
23 | ###############################################################################
24 |
25 |
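# One possible starting point (a sketch, not the official solution): a random forest
# on the five derived features, evaluated with log loss since that is the competition
# metric. n_estimators=100 is an arbitrary choice; tune it along with max_depth.
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
rf = RandomForestClassifier(n_estimators=100, random_state=1)
cross_val_score(rf, X, y, cv=5, scoring='log_loss').mean()
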
26 | ###############################################################################
27 | ##### Count vectorizer
28 | ###############################################################################
29 |
30 |
31 | # define X and y
32 | X = train.Title
33 | y = train.OpenStatus
34 |
35 | # split into training and testing sets
36 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
37 |
38 | # use CountVectorizer with the default settings
39 | from sklearn.feature_extraction.text import CountVectorizer
40 | vect = CountVectorizer()
41 |
42 | # fit and transform on X_train, but only transform on X_test
43 | train_dtm = vect.fit_transform(X_train)
44 | test_dtm = vect.transform(X_test)
45 |
46 |
47 | ###############################################################################
48 | ##### Create a model with the text features
49 | ###############################################################################
50 |
51 |
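# One possible model for the text features (a sketch, not the official solution):
# the Naive Bayes approach from class 15, applied to the Title document-term matrix
# created above, again evaluated with log loss.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_prob = nb.predict_proba(test_dtm)[:, 1]
metrics.log_loss(y_test, y_prob)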
--------------------------------------------------------------------------------
/code/18_clustering.py:
--------------------------------------------------------------------------------
1 | '''
2 | THE DATA
3 |
4 | We have data about cars: things like MPG, acceleration, weight, etc. However,
5 | we don't have logical groupings for these cars. We can construct these
6 | manually using our domain knowledge (e.g. we could put all of the high mpg cars
7 | together and all of the low mpg cars together), but we want a more automatic
8 | way of grouping these vehicles that can take into account more features.
9 | '''
10 |
11 | # Imports
12 | from sklearn.cluster import KMeans # K means model
13 | import matplotlib.pyplot as plt
14 | import pandas as pd
15 | import numpy as np
16 |
17 | # Read in data
18 | data = pd.read_table('auto_mpg.txt', sep='|') # pipe-delimited data file
19 | data.drop('car_name', axis=1, inplace=True) # Drop labels from dataframe
20 | data.head()
21 |
22 |
23 |
24 | '''
25 | CLUSTER ANALYSIS
26 | How do we implement a k-means clustering algorithm?
27 |
28 | scikit-learn KMeans documentation for reference:
29 | http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
30 | '''
31 |
32 | # Standardize our data
33 | from sklearn.preprocessing import StandardScaler
34 | scaler = StandardScaler()
35 | data_scaled = scaler.fit_transform(data)
36 |
37 |
38 | # Set random seed for reproducibility
39 | np.random.seed(0)
40 |
41 | # Run KMeans
42 | est = KMeans(n_clusters=2, init='random') # Instantiate estimator
43 | est.fit(data_scaled) # Fit your data
44 | y_kmeans = est.predict(data_scaled) # Make cluster "predictions"
45 |
46 | # Inspect the data by looking at the means for each cluster
47 | data.groupby(y_kmeans).mean()
48 |
49 | # This can be compared to the overall means for each variable
50 | data.mean()
51 |
52 | # We can get the coordinates for the center of each cluster
53 | centers = est.cluster_centers_
54 |
55 |
56 |
57 | '''
58 | VISUALIZING THE CLUSTERS
59 | '''
60 |
61 | # We can create a nice plot to visualize this upon two of the dimensions
62 | colors = np.array(['red', 'green', 'blue', 'yellow', 'orange'])
63 |
64 | plt.figure()
65 | plt.scatter(data_scaled[:, 0], data_scaled[:, 5], c=colors[y_kmeans], s=50)
66 | plt.xlabel('MPG')
67 | plt.ylabel('Acceleration')
68 | plt.scatter(centers[:, 0], centers[:, 5], linewidths=3, marker='+', s=300, c='black')
69 | plt.show()
70 |
71 | # We can generate a scatter matrix to see all of the different dimensions paired
72 | pd.scatter_matrix(data, c=colors[y_kmeans], figsize=(15,15), s = 100)
73 | plt.show()
74 |
75 |
76 |
77 | '''
78 | DETERMINING THE NUMBER OF CLUSTERS
79 | How do you choose k? There isn't a bright line, but we can evaluate
80 | performance metrics such as the silhouette coefficient across values of k.
81 |
82 | Note: You also have to take into account the practical limitations of choosing
83 | k. Ten clusters may give the best value, but it might not make sense in the
84 | context of your data.
85 |
86 | scikit-learn Clustering metrics documentation:
87 | http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
88 | '''
89 |
90 | # Create a bunch of different models
91 | k_rng = range(2,15)
92 | k_est = [KMeans(n_clusters = k).fit(data) for k in k_rng]
93 |
94 | # Silhouette Coefficient
95 | # Generally want SC to be closer to 1, while also minimizing k
96 | from sklearn import metrics
97 | silhouette_score = [metrics.silhouette_score(data, e.labels_, metric='euclidean') for e in k_est]
98 |
99 | # Plot the results
100 | plt.figure()
101 | plt.title('Silhouette coefficient for various values of k')
102 | plt.plot(k_rng, silhouette_score, 'b*-')
103 | plt.xlim([1,15])
104 | plt.grid(True)
105 | plt.ylabel('Silhouette Coefficient')
106 | plt.show()
--------------------------------------------------------------------------------
/code/18_regularization.py:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | ##### Regularization with Linear Regression
3 | ###############################################################################
4 |
5 | ## TASK: Regularized regression
6 | ## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
7 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
8 | ## DATA: Crime (n=319 non-null, p=122, type=regression)
9 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime
10 |
11 |
12 | ########## Prepare data ##########
13 | # read in data, remove categorical features, remove rows with missing values
14 | import pandas as pd
15 | crime = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
16 | crime = crime.iloc[:, 5:]
17 | crime.dropna(inplace=True)
18 | crime.head()
19 |
20 | # define X and y
21 | X = crime.iloc[:, :-1]
22 | y = crime.iloc[:, -1]
23 |
24 | # split into train/test
25 | from sklearn.cross_validation import train_test_split
26 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
27 |
28 |
29 | ########## Linear Regression Model Without Regularization ##########
30 | # linear regression
31 | from sklearn.linear_model import LinearRegression
32 | lm = LinearRegression()
33 | lm.fit(X_train, y_train)
34 | lm.coef_
35 |
36 | # make predictions and evaluate
37 | import numpy as np
38 | from sklearn import metrics
39 | preds = lm.predict(X_test)
40 | print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
41 |
42 |
43 | ########## Ridge Regression Model ##########
44 | # ridge regression (alpha must be positive, larger means more regularization)
45 | from sklearn.linear_model import Ridge
46 | rreg = Ridge(alpha=0.1, normalize=True)
47 | rreg.fit(X_train, y_train)
48 | rreg.coef_
49 | preds = rreg.predict(X_test)
50 | print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
51 |
52 | # use RidgeCV to select best alpha
53 | from sklearn.linear_model import RidgeCV
54 | alpha_range = 10.**np.arange(-2, 3)
55 | rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range)
56 | rregcv.fit(X_train, y_train)
57 | rregcv.alpha_
58 | preds = rregcv.predict(X_test)
59 | print 'RMSE (Ridge CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
60 |
61 | ########## Lasso Regression Model ##########
62 | # lasso (alpha must be positive, larger means more regularization)
63 | from sklearn.linear_model import Lasso
64 | las = Lasso(alpha=0.01, normalize=True)
65 | las.fit(X_train, y_train)
66 | las.coef_
67 | preds = las.predict(X_test)
68 | print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
69 |
70 | # try a smaller alpha
71 | las = Lasso(alpha=0.0001, normalize=True)
72 | las.fit(X_train, y_train)
73 | las.coef_
74 | preds = las.predict(X_test)
75 | print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
76 |
77 | # use LassoCV to select the best alpha (it tries 100 alphas by default; here we supply our own range)
78 | from sklearn.linear_model import LassoCV
79 | lascv = LassoCV(normalize=True, alphas=alpha_range)
80 | lascv.fit(X_train, y_train)
81 | lascv.alpha_
82 | lascv.coef_
83 | preds = lascv.predict(X_test)
84 | print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
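# A hallmark of the lasso is that it zeroes out coefficients entirely (effectively
# performing feature selection); this quick check counts how many features survive
# at the chosen alpha.
print 'Nonzero coefficients (Lasso CV) =', (lascv.coef_ != 0).sum()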
85 |
86 | ###############################################################################
87 | ##### Regularization with Logistic Regression
88 | ###############################################################################
89 |
90 | ## TASK: Regularized classification
91 | ## FUNCTION: LogisticRegression
92 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
93 | ## DATA: Titanic (n=891, p=5 selected, type=classification)
94 | ## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data
95 |
96 |
97 | ########## Prepare data ##########
98 | # Get and prepare data
99 | titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/titanic_train.csv')
100 | titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})
101 | titanic.Age.fillna(titanic.Age.mean(), inplace=True)
102 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:]
103 | titanic = pd.concat([titanic, embarked_dummies], axis=1)
104 |
105 | # define X and y
106 | feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
107 | X = titanic[feature_cols]
108 | y = titanic.Survived
109 |
110 | # split into train/test
111 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
112 |
113 | # standardize our data
114 | from sklearn.preprocessing import StandardScaler
115 | scaler = StandardScaler()
116 | scaler.fit(X_train)
117 | X_train_scaled = scaler.transform(X_train)
118 | X_test_scaled = scaler.transform(X_test)
119 |
120 |
121 | ########## Logistic Regression Model Without Regularization ##########
122 | # logistic regression
123 | from sklearn.linear_model import LogisticRegression
124 | logreg = LogisticRegression()
125 | logreg.fit(X_train_scaled, y_train)
126 | logreg.coef_
127 | y_pred = logreg.predict(X_test_scaled)
128 |
129 | # Assess accuracy
130 | print 'Accuracy (no penalty) =', metrics.accuracy_score(y_test, y_pred)
131 |
132 |
133 | ########## Logistic Regression With L1 Penalty ##########
134 | # logistic regression with L1 penalty (C must be positive, smaller means more regularization)
135 | logreg_l1 = LogisticRegression(C=0.1, penalty='l1')
136 | logreg_l1.fit(X_train_scaled, y_train)
137 | logreg_l1.coef_
138 | y_pred_l1 = logreg_l1.predict(X_test_scaled)
139 |
140 | # Assess accuracy
141 | print 'Accuracy (L1 penalty) =', metrics.accuracy_score(y_test, y_pred_l1)
142 |
143 |
144 | ########## Logistic Regression With L2 Penalty ##########
145 | # logistic regression with L2 penalty (C must be positive, smaller means more regularization)
146 | logreg_l2 = LogisticRegression(C=0.1, penalty='l2')
147 | logreg_l2.fit(X_train_scaled, y_train)
148 | logreg_l2.coef_
149 | y_pred_l2 = logreg_l2.predict(X_test_scaled)
150 |
151 | # Assess accuracy
152 | print 'Accuracy (L2 penalty) =', metrics.accuracy_score(y_test, y_pred_l2)
--------------------------------------------------------------------------------
/code/19_advanced_sklearn.py:
--------------------------------------------------------------------------------
1 | ## TASK: Searching for optimal parameters
2 | ## FUNCTION: GridSearchCV
3 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/grid_search.html
4 | ## DATA: Titanic (n=891, p=5 selected, type=classification)
5 | ## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data
6 |
7 | # read in and prepare titanic data
8 | import pandas as pd
9 | titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/titanic_train.csv')
10 | titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})
11 | titanic.Age.fillna(titanic.Age.mean(), inplace=True)
12 | embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:]
13 | titanic = pd.concat([titanic, embarked_dummies], axis=1)
14 |
15 | # define X and y
16 | feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
17 | X = titanic[feature_cols]
18 | y = titanic.Survived
19 |
20 | # use cross-validation to find best max_depth
21 | from sklearn.tree import DecisionTreeClassifier
22 | from sklearn.cross_validation import cross_val_score
23 |
24 | # try max_depth=2
25 | treeclf = DecisionTreeClassifier(max_depth=2, random_state=1)
26 | cross_val_score(treeclf, X, y, cv=10, scoring='roc_auc').mean()
27 |
28 | # try max_depth=3
29 | treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
30 | cross_val_score(treeclf, X, y, cv=10, scoring='roc_auc').mean()
31 |
32 | # use GridSearchCV to automate the search
33 | from sklearn.grid_search import GridSearchCV
34 | treeclf = DecisionTreeClassifier(random_state=1)
35 | depth_range = range(1, 21)
36 | param_grid = dict(max_depth=depth_range)
37 | grid = GridSearchCV(treeclf, param_grid, cv=10, scoring='roc_auc')
38 | grid.fit(X, y)
39 |
40 | # check the results of the grid search
41 | grid.grid_scores_
42 | grid_mean_scores = [result[1] for result in grid.grid_scores_]
43 |
44 | # plot the results
45 | import matplotlib.pyplot as plt
46 | plt.plot(depth_range, grid_mean_scores)
47 |
48 | # what was best?
49 | grid.best_score_
50 | grid.best_params_
51 | grid.best_estimator_
52 |
53 | # search a "grid" of parameters
54 | depth_range = range(1, 21)
55 | leaf_range = range(1, 11)
56 | param_grid = dict(max_depth=depth_range, min_samples_leaf=leaf_range)
57 | grid = GridSearchCV(treeclf, param_grid, cv=10, scoring='roc_auc')
58 | grid.fit(X, y)
59 | grid.grid_scores_
60 | grid.best_score_
61 | grid.best_params_
62 |
63 |
64 | ## TASK: Standardization of features (aka "center and scale" or "z-score normalization")
65 | ## FUNCTION: StandardScaler
66 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
67 | ## EXAMPLE: http://nbviewer.ipython.org/github/rasbt/pattern_classification/blob/master/preprocessing/about_standardization_normalization.ipynb
68 | ## DATA: Wine (n=178, p=2 selected, type=classification)
69 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Wine
70 |
71 | # fake data
72 | train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]})
73 | oos = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54.9]})
74 |
75 | # define X and y
76 | X = train[['length','mass','rings']]
77 | y = train.id
78 |
79 | # KNN with k=1
80 | from sklearn.neighbors import KNeighborsClassifier
81 | knn = KNeighborsClassifier(n_neighbors=1)
82 | knn.fit(X, y)
83 |
84 | # what "should" it predict? what does it predict?
85 | knn.predict(oos)
86 |
87 | # standardize the features
88 | from sklearn.preprocessing import StandardScaler
89 | scaler = StandardScaler()
90 | scaler.fit(X)
91 | X_scaled = scaler.transform(X)
92 |
93 | # compare original to standardized
94 | X.values
95 | X_scaled
96 |
97 | # figure out how it standardized
98 | scaler.mean_
99 | scaler.std_
100 | (X.values-scaler.mean_) / scaler.std_
101 |
102 | # try this on real data
103 | wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None, usecols=[0,10,13])
104 | wine.columns=['label', 'color', 'proline']
105 | wine.head()
106 | wine.describe()
107 |
108 | # define X and y
109 | X = wine[['color', 'proline']]
110 | y = wine.label
111 |
112 | # split into train/test
113 | from sklearn.cross_validation import train_test_split
114 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
115 |
116 | # standardize X_train
117 | scaler.fit(X_train)
118 | X_train_scaled = scaler.transform(X_train)
119 |
120 | # check that it worked properly
121 | X_train_scaled[:, 0].mean()
122 | X_train_scaled[:, 0].std()
123 | X_train_scaled[:, 1].mean()
124 | X_train_scaled[:, 1].std()
125 |
126 | # standardize X_test
127 | X_test_scaled = scaler.transform(X_test)
128 |
129 | # is this right? (yes: the test set is scaled with the training set's mean and std, so its own mean and std won't be exactly 0 and 1)
130 | X_test_scaled[:, 0].mean()
131 | X_test_scaled[:, 0].std()
132 | X_test_scaled[:, 1].mean()
133 | X_test_scaled[:, 1].std()
134 |
135 | # compare KNN accuracy on original vs scaled data
136 | knn = KNeighborsClassifier(n_neighbors=3)
137 | knn.fit(X_train, y_train)
138 | knn.score(X_test, y_test)
139 | knn.fit(X_train_scaled, y_train)
140 | knn.score(X_test_scaled, y_test)
141 |
142 |
143 | ## TASK: Chaining steps
144 | ## FUNCTION: Pipeline
145 | ## DOCUMENTATION: http://scikit-learn.org/stable/modules/pipeline.html
146 | ## DATA: Wine (n=178, p=2 selected, type=classification)
147 | ## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Wine
148 |
149 | # here is proper cross-validation on the original (unscaled) data
150 | X = wine[['color', 'proline']]
151 | y = wine.label
152 | knn = KNeighborsClassifier(n_neighbors=3)
153 | cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()
154 |
155 | # why is this improper cross-validation on the scaled data? (the scaler was fit on ALL of the data, so each training fold has already "seen" its testing fold)
156 | scaler = StandardScaler()
157 | X_scaled = scaler.fit_transform(X)
158 | cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean()
159 |
160 | # fix this using Pipeline
161 | from sklearn.pipeline import make_pipeline
162 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
163 | cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
164 |
165 | # using GridSearchCV with Pipeline
166 | neighbors_range = range(1, 21)
167 | param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)
168 | grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
169 | grid.fit(X, y)
170 | grid.best_score_
171 | grid.best_params_
172 |
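173 | # A possible extension (not from the original lesson, just a sketch using the wine X and y
174 | # defined above): a pipeline's parameters are addressed as <stepname>__<parameter>, so one
175 | # grid can tune several steps at once.
176 | param_grid = dict(standardscaler__with_mean=[True, False],
177 |                   kneighborsclassifier__n_neighbors=range(1, 21))
178 | grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
179 | grid.fit(X, y)
180 | grid.best_params_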
--------------------------------------------------------------------------------
/code/19_gridsearchcv_exercise.py:
--------------------------------------------------------------------------------
1 | '''
2 | EXERCISE: GridSearchCV with Stack Overflow competition data
3 | '''
4 |
5 | import pandas as pd
6 |
7 | # define a function to create features
8 | def make_features(filename):
9 | df = pd.read_csv(filename, index_col=0)
10 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True)
11 | df['TitleLength'] = df.Title.apply(len)
12 | df['BodyLength'] = df.BodyMarkdown.apply(len)
13 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)
14 | return df
15 |
16 | # apply function to both training and testing files
17 | train = make_features('train.csv')
18 | test = make_features('test.csv')
19 |
20 | # define X and y
21 | feature_cols = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
22 | X = train[feature_cols]
23 | y = train.OpenStatus
24 |
25 |
26 | '''
27 | MAIN TASK: Use GridSearchCV to find optimal parameters for KNeighborsClassifier.
28 | - For "n_neighbors", try 5 different integer values.
29 | - For "weights", try 'uniform' and 'distance'.
30 | - Use 5-fold cross-validation (instead of 10-fold) to save computational time.
31 | - Remember that log loss is your evaluation metric!
32 |
33 | BONUS TASK #1: Once you have found optimal parameters, train your KNN model using
34 | those parameters, make predictions on the test set, and submit those predictions.
35 |
36 | BONUS TASK #2: Read the scikit-learn documentation for GridSearchCV to find the
37 | shortcut for accomplishing bonus task #1.
38 | '''
39 |
40 | # MAIN TASK
41 | from sklearn.neighbors import KNeighborsClassifier
42 | knn = KNeighborsClassifier()
43 | from sklearn.grid_search import GridSearchCV
44 | neighbors_range = [20, 40, 60, 80, 100]
45 | weight_options = ['uniform', 'distance']
46 | param_grid = dict(n_neighbors=neighbors_range, weights=weight_options)
47 | grid = GridSearchCV(knn, param_grid, cv=5, scoring='log_loss')
48 | grid.fit(X, y)
49 | grid.grid_scores_
50 | grid.best_score_    # note: scikit-learn's 'log_loss' scorer returns a negative log loss, so the best score is the one closest to zero
51 | grid.best_params_
52 |
53 | # BONUS TASK #1
54 | knn = KNeighborsClassifier(n_neighbors=100, weights='uniform')
55 | knn.fit(X, y)
56 | y_prob = knn.predict_proba(test[feature_cols])[:, 1]
57 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id')
58 | sub.to_csv('sub.csv')
59 |
60 | # BONUS TASK #2: GridSearchCV refits the best estimator on all of the data by default (refit=True), so it can make predictions directly
61 | y_prob = grid.predict_proba(test[feature_cols])[:, 1]
62 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob}).set_index('id')
63 | sub.to_csv('sub.csv')
64 |
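65 | # Optional: one way to inspect every parameter combination at once (a sketch; it assumes
66 | # the fitted 'grid' object from the main task above)
67 | results = pd.DataFrame([(params['n_neighbors'], params['weights'], mean_score)
68 |                         for params, mean_score, scores in grid.grid_scores_],
69 |                        columns=['n_neighbors', 'weights', 'mean_score'])
70 | results.sort_index(by='mean_score', ascending=False)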
--------------------------------------------------------------------------------
/code/19_regex_exercise.py:
--------------------------------------------------------------------------------
1 | '''
2 | Regular Expressions Exercise
3 | '''
4 |
5 | # open file and store each line as one row
6 | with open('../data/homicides.txt', 'rU') as f:
7 | raw = [row for row in f]
8 |
9 |
10 | '''
11 | Create a list of ages
12 | '''
13 |
14 | import re
15 |
16 | ages = []
17 | for row in raw:
18 | match = re.search(r'\d+ years old', row)
19 | if match:
20 | ages.append(match.group())
21 | else:
22 | ages.append('0')
23 |
24 | # split the string on spaces, only keep the first element, and convert to int
25 | ages = [int(element.split()[0]) for element in ages]
26 |
27 | # check that 'raw' and 'ages' are the same length
28 | assert(len(raw)==len(ages))
29 |
30 | # simplify process using a lookahead
31 | ages = []
32 | for row in raw:
33 | match = re.search(r'\d+(?= years)', row)
34 | if match:
35 | ages.append(int(match.group()))
36 | else:
37 | ages.append(0)
38 |
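39 | # the same extraction written as a single list comprehension (shown only for comparison;
40 | # it calls re.search twice per row, so the loop above is arguably clearer)
41 | ages = [int(re.search(r'\d+(?= years)', row).group()) if re.search(r'\d+(?= years)', row) else 0 for row in raw]
42 | assert(len(raw)==len(ages))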
--------------------------------------------------------------------------------
/code/19_regex_reference.py:
--------------------------------------------------------------------------------
1 | '''
2 | Regular Expressions (regex) Reference Guide
3 |
4 | Sources:
5 | https://developers.google.com/edu/python/regular-expressions
6 | https://docs.python.org/2/library/re.html
7 | '''
8 |
9 | '''
10 | Basic Patterns:
11 |
12 | Ordinary characters match themselves exactly
13 | . matches any single character except newline \n
14 | \w matches a word character (letter, digit, underscore)
15 | \W matches any non-word character
16 | \b matches boundary between word and non-word
17 | \s matches single whitespace character (space, newline, return, tab, form)
18 | \S matches single non-whitespace character
19 | \d matches single digit (0 through 9)
20 | \t matches tab
21 | \n matches newline
22 | \r matches return
23 | \ match a special character, such as period: \.
24 |
25 | Rules for Searching:
26 |
27 | Search proceeds through string from start to end, stopping at first match
28 | All of the pattern must be matched
29 |
30 | Basic Search Function:
31 |
32 | match = re.search(r'pattern', string_to_search)
33 | Returns match object
34 | If there is a match, access match using match.group()
35 | If there is no match, match is None
36 | Use 'r' in front of pattern to designate a raw string
37 | '''
38 |
39 | import re
40 |
41 | s = 'my 1st string!!'
42 |
43 | match = re.search(r'my', s) # returns match object
44 | if match: # checks whether match was found
45 | print match.group() # if match was found, then print result
46 |
47 | re.search(r'my', s).group() # single-line version (without error handling)
48 | re.search(r'st', s).group() # 'st'
49 | re.search(r'sta', s).group() # error
50 | re.search(r'\w\w\w', s).group() # '1st'
51 | re.search(r'\W', s).group() # ' '
52 | re.search(r'\W\W', s).group() # '!!'
53 | re.search(r'\s', s).group() # ' '
54 | re.search(r'\s\s', s).group() # error
55 | re.search(r'..t', s).group() # '1st'
56 | re.search(r'\s\St', s).group() # ' st'
57 | re.search(r'\bst', s).group() # 'st'
58 |
59 |
60 | '''
61 | Repetition:
62 |
63 | + 1 or more occurrences of the pattern to its left
64 | * 0 or more occurrences
65 | ? 0 or 1 occurrence
66 |
67 | + and * are 'greedy': they try to use up as much of the string as possible
68 |
69 | Add ? after + or * to make them non-greedy: +? or *?
70 | '''
71 |
72 | s = 'sid is missing class'
73 |
74 | re.search(r'miss\w+', s).group() # 'missing'
75 | re.search(r'is\w+', s).group() # 'issing'
76 | re.search(r'is\w*', s).group() # 'is'
77 |
78 | s = '<h1>my heading</h1>'
79 |
80 | re.search(r'<.+>', s).group()    # '<h1>my heading</h1>'
81 | re.search(r'<.+?>', s).group()   # '<h1>'
82 |
83 |
84 | '''
85 | Positions:
86 |
87 | ^ match start of a string
88 | $ match end of a string
89 | '''
90 |
91 | s = 'sid is missing class'
92 |
93 | re.search(r'^miss', s).group() # error
94 | re.search(r'..ss', s).group() # 'miss'
95 | re.search(r'..ss$', s).group() # 'lass'
96 |
97 |
98 | '''
99 | Brackets:
100 |
101 | [abc] match a or b or c
102 | \w, \s, etc. work inside brackets, except period just means a literal period
103 | [a-z] match any lowercase letter (dash indicates range unless it's last)
104 | [abc-] match a or b or c or -
105 | [^ab] match anything except a or b
106 | '''
107 |
108 | s = 'my email is john-doe@gmail.com'
109 |
110 | re.search(r'\w+@\w+', s).group() # 'doe@gmail'
111 | re.search(r'[\w.-]+@[\w.-]+', s).group() # 'john-doe@gmail.com'
112 |
113 |
114 | '''
115 | Lookarounds:
116 |
117 | Lookahead matches a pattern only if it is followed by another pattern
118 | 100(?= dollars) matches '100' only if it is followed by ' dollars'
119 |
120 | Lookbehind matches a pattern only if it is preceded by another pattern
121 | (?<=\$)100 matches '100' only if it is preceded by '$'
122 | '''
123 |
124 | s = 'Name: Cindy, 30 years old'
125 |
126 | re.search(r'\d+(?= years? old)', s).group() # '30'
127 | re.search(r'(?<=Name: )\w+', s).group() # 'Cindy'
128 |
129 |
130 | '''
131 | Match Groups:
132 |
133 | Parentheses create logical groups inside of match text
134 | match.group(1) corresponds to first group
135 | match.group(2) corresponds to second group
136 | match.group() corresponds to entire match text (as usual)
137 | '''
138 |
139 | s = 'my email is john-doe@gmail.com'
140 |
141 | match = re.search(r'([\w.-]+)@([\w.-]+)', s)
142 | if match:
143 | match.group(1) # 'john-doe'
144 | match.group(2) # 'gmail.com'
145 | match.group() # 'john-doe@gmail.com'
146 |
147 |
148 | '''
149 | Finding All Matches:
150 |
151 | re.findall() finds all matches and returns them as a list of strings
152 | list_of_strings = re.findall(r'pattern', string_to_search)
153 |
154 | If pattern includes parentheses, a list of tuples is returned
155 | '''
156 |
157 | s = 'emails: joe@gmail.com, bob@gmail.com'
158 |
159 | re.findall(r'[\w.-]+@[\w.-]+', s) # ['joe@gmail.com', 'bob@gmail.com']
160 | re.findall(r'([\w.-]+)@([\w.-]+)', s) # [('joe', 'gmail.com'), ('bob', 'gmail.com')]
161 |
162 |
163 | '''
164 | Option Flags:
165 |
166 | Options flags modify the behavior of the pattern matching
167 |
168 | default: matching is case sensitive
169 | re.IGNORECASE: ignore uppercase/lowercase differences ('a' matches 'a' or 'A')
170 |
171 | default: period matches any character except newline
172 | re.DOTALL: allow period to match newline
173 |
174 | default: within a string of many lines, ^ and $ match start and end of entire string
175 | re.MULTILINE: allow ^ and $ to match start and end of each line
176 |
177 | Option flag is third argument to re.search() or re.findall():
178 | re.search(r'pattern', string_to_search, re.IGNORECASE)
179 | re.findall(r'pattern', string_to_search, re.IGNORECASE)
180 | '''
181 |
182 | s = 'emails: nicole@ga.co, joe@gmail.com, PAT@GA.CO'
183 |
184 | re.findall(r'\w+@ga\.co', s) # ['nicole@ga.co']
185 | re.findall(r'\w+@ga\.co', s, re.IGNORECASE) # ['nicole@ga.co', 'PAT@GA.CO']
186 |
187 |
188 | '''
189 | Substitution:
190 |
191 | re.sub() finds all matches and replaces them with a specified string
192 | new_string = re.sub(r'pattern', r'replacement', string_to_search)
193 |
194 | Replacement string can refer to text from matching groups:
195 | \1 refers to group(1)
196 | \2 refers to group(2)
197 | etc.
198 | '''
199 |
200 | s = 'sid is missing class'
201 |
202 | re.sub(r'is ', r'was ', s) # 'sid was missing class'
203 |
204 | s = 'emails: joe@gmail.com, bob@gmail.com'
205 |
206 | re.sub(r'([\w.-]+)@([\w.-]+)', r'\1@yahoo.com', s) # 'emails: joe@yahoo.com, bob@yahoo.com'
207 |
208 |
209 | '''
210 | Useful, But Not Covered:
211 |
212 | re.split() splits a string by the occurrences of a pattern
213 | re.compile() compiles a pattern (for improved performance if it's used many times)
214 | A|B indicates a pattern that can match A or B
215 | '''
216 |
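217 | # brief examples of the items above (a short sketch, not part of the original reference):
218 | s = 'apples, oranges,bananas'
219 | re.split(r',\s*', s)                                # ['apples', 'oranges', 'bananas']
220 |
221 | pattern = re.compile(r'[\w.-]+@[\w.-]+')            # compile once, reuse many times
222 | pattern.findall('joe@gmail.com, bob@gmail.com')     # ['joe@gmail.com', 'bob@gmail.com']
223 |
224 | re.findall(r'cat|dog', 'my cat chased your dog')    # ['cat', 'dog']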
--------------------------------------------------------------------------------
/code/21_ensembles_example.py:
--------------------------------------------------------------------------------
1 | '''
2 | Imports
3 | '''
4 | import pandas as pd
5 | from sklearn.linear_model import LogisticRegression
6 | from sklearn.cross_validation import cross_val_score
7 | from sklearn.ensemble import RandomForestClassifier
8 | from sklearn.feature_extraction.text import CountVectorizer
9 | from sklearn.pipeline import make_pipeline
10 |
11 |
12 | '''
13 | Define a function that takes in a raw CSV file and returns a DataFrame that
14 | includes all created features (and any other modifications). That way, we
15 | can apply the same changes to both train.csv and test.csv.
16 | '''
17 |
18 | # Define the function
19 | def make_features(filename):
20 | # Read in dataframe
21 | df = pd.read_csv(filename, index_col=0)
22 |
23 | #Rename columns
24 | df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime':'Answers'}, inplace=True)
25 |
26 | # Get length of title of post
27 | df['TitleLength'] = df.Title.apply(len)
28 |
29 | # Get length of body of post
30 | df['BodyLength'] = df.BodyMarkdown.apply(len)
31 |
32 | # Number of tags for post
33 | df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)
34 |
35 | # Is the title lowercase?
36 | df['TitleLowercase'] = (df.Title.str.lower() == df.Title).astype(int)
37 |
38 | # Create features that represent whether Title contains certain words
39 | df['TitleQuestion'] = df.Title.str.contains('question', case=False).astype(int)
40 | df['TitleNeed'] = df.Title.str.contains('need', case=False).astype(int)
41 | df['TitleHelp'] = df.Title.str.contains('help', case=False).astype(int)
42 |
43 | return df
44 |
45 | # Apply function to the training data
46 | train = make_features('train.csv')
47 | X = train.drop('OpenStatus', axis=1)
48 | y = train.OpenStatus
49 |
50 | # Read in test data
51 | test = make_features('test.csv')
52 |
53 |
54 | # Split into training and testing sets
55 | #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
56 |
57 | '''
58 | Five feature logistic regression model
59 | '''
60 | # Define feature cols
61 | feature_cols_logreg = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
62 |
63 | # Perform cross-validation to get an idea of the performance of the model (the leading minus sign below flips scikit-learn's negative 'log_loss' score back to a positive log loss)
64 | logreg = LogisticRegression()
65 | -cross_val_score(logreg, X[feature_cols_logreg], y, scoring="log_loss", cv=5).mean()
66 |
67 | # Predict class probabilities for the actual testing data
68 | logreg.fit(X[feature_cols_logreg], y)
69 | y_prob_logreg = logreg.predict_proba(test[feature_cols_logreg])[:, 1]
70 |
71 | '''
72 | Four feature random forest model
73 | '''
74 | # Define feature cols
75 | feature_cols_rf = ['TitleLowercase', 'TitleQuestion', 'TitleNeed', 'TitleHelp']
76 |
77 | # Perform cross validation to get an idea of the performance of the model
78 | rf = RandomForestClassifier()
79 | -cross_val_score(rf, X[feature_cols_rf], y, scoring="log_loss", cv=5).mean()
80 |
81 | # Predict class probabilities for the actual testing data
82 | rf.fit(X[feature_cols_rf], y)
83 | y_prob_rf = rf.predict_proba(test[feature_cols_rf])[:, 1]
84 |
85 |
86 |
87 | '''
88 | Text logistic regression model on 'Title' using pipeline
89 | '''
90 |
91 | # Make pipeline
92 | pipe = make_pipeline(CountVectorizer(stop_words='english'), LogisticRegression())
93 |
94 | # Perform cross validation to get an idea of the performance of the model
95 | -cross_val_score(pipe, X['Title'], y, scoring="log_loss", cv=5).mean()
96 |
97 | # Predict class probabilities for the actual testing data
98 | pipe.fit(X['Title'], y)
99 | y_prob_pipe = pipe.predict_proba(test['Title'])[:, 1]
100 |
101 |
102 | '''
103 | Create submission
104 | '''
105 | # Ensemble predictions
106 | y_prob_combined = (y_prob_logreg + y_prob_rf + 2*y_prob_pipe) / 4    # weighted average (weights 1, 1, and 2; dividing by their sum keeps the result in [0, 1])
107 |
108 | # Create a DataFrame that has 'id' as the index, then export to a CSV file
109 | sub = pd.DataFrame({'id':test.index, 'OpenStatus':y_prob_combined}).set_index('id')
110 | sub.to_csv('sub_ensemble.csv')
111 |
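112 | '''
113 | Optional sanity check (a sketch, not part of the original script): ensembling helps most when
114 | the individual models' predictions are not highly correlated, so it can be worth inspecting
115 | the correlation matrix of the three probability vectors before choosing the weights.
116 | '''
117 | pd.DataFrame({'logreg': y_prob_logreg, 'rf': y_prob_rf, 'text': y_prob_pipe}).corr()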
--------------------------------------------------------------------------------
/data/airline_safety.csv:
--------------------------------------------------------------------------------
1 | airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14
2 | Aer Lingus,320906734,2,0,0,0,0,0
3 | Aeroflot*,1197672318,76,14,128,6,1,88
4 | Aerolineas Argentinas,385803648,6,0,0,1,0,0
5 | Aeromexico*,596871813,3,1,64,5,0,0
6 | Air Canada,1865253802,2,0,0,2,0,0
7 | Air France,3004002661,14,4,79,6,2,337
8 | Air India*,869253552,2,1,329,4,1,158
9 | Air New Zealand*,710174817,3,0,0,5,1,7
10 | Alaska Airlines*,965346773,5,0,0,5,1,88
11 | Alitalia,698012498,7,2,50,4,0,0
12 | All Nippon Airways,1841234177,3,1,1,7,0,0
13 | American*,5228357340,21,5,101,17,3,416
14 | Austrian Airlines,358239823,1,0,0,1,0,0
15 | Avianca,396922563,5,3,323,0,0,0
16 | British Airways*,3179760952,4,0,0,6,0,0
17 | Cathay Pacific*,2582459303,0,0,0,2,0,0
18 | China Airlines,813216487,12,6,535,2,1,225
19 | Condor,417982610,2,1,16,0,0,0
20 | COPA,550491507,3,1,47,0,0,0
21 | Delta / Northwest*,6525658894,24,12,407,24,2,51
22 | Egyptair,557699891,8,3,282,4,1,14
23 | El Al,335448023,1,1,4,1,0,0
24 | Ethiopian Airlines,488560643,25,5,167,5,2,92
25 | Finnair,506464950,1,0,0,0,0,0
26 | Garuda Indonesia,613356665,10,3,260,4,2,22
27 | Gulf Air,301379762,1,0,0,3,1,143
28 | Hawaiian Airlines,493877795,0,0,0,1,0,0
29 | Iberia,1173203126,4,1,148,5,0,0
30 | Japan Airlines,1574217531,3,1,520,0,0,0
31 | Kenya Airways,277414794,2,0,0,2,2,283
32 | KLM*,1874561773,7,1,3,1,0,0
33 | Korean Air,1734522605,12,5,425,1,0,0
34 | LAN Airlines,1001965891,3,2,21,0,0,0
35 | Lufthansa*,3426529504,6,1,2,3,0,0
36 | Malaysia Airlines,1039171244,3,1,34,3,2,537
37 | Pakistan International,348563137,8,3,234,10,2,46
38 | Philippine Airlines,413007158,7,4,74,2,1,1
39 | Qantas*,1917428984,1,0,0,5,0,0
40 | Royal Air Maroc,295705339,5,3,51,3,0,0
41 | SAS*,682971852,5,0,0,6,1,110
42 | Saudi Arabian,859673901,7,2,313,11,0,0
43 | Singapore Airlines,2376857805,2,2,6,2,1,83
44 | South African,651502442,2,1,159,1,0,0
45 | Southwest Airlines,3276525770,1,0,0,8,0,0
46 | Sri Lankan / AirLanka,325582976,2,1,14,4,0,0
47 | SWISS*,792601299,2,1,229,3,0,0
48 | TACA,259373346,3,1,3,1,1,3
49 | TAM,1509195646,8,3,98,7,2,188
50 | TAP - Air Portugal,619130754,0,0,0,0,0,0
51 | Thai Airways,1702802250,8,4,308,2,1,1
52 | Turkish Airlines,1946098294,8,3,64,8,2,84
53 | United / Continental*,7139291291,19,8,319,14,2,109
54 | US Airways / America West*,2455687887,16,7,224,11,2,23
55 | Vietnam Airlines,625084918,7,3,171,1,0,0
56 | Virgin Atlantic,1005248585,1,0,0,0,0,0
57 | Xiamen Airlines,430462962,9,1,82,2,0,0
58 |
--------------------------------------------------------------------------------
/data/drinks.csv:
--------------------------------------------------------------------------------
1 | country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
Afghanistan,0,0,0,0,AS
Albania,89,132,54,4.9,EU
Algeria,25,0,14,0.7,AF
Andorra,245,138,312,12.4,EU
Angola,217,57,45,5.9,AF
Antigua & Barbuda,102,128,45,4.9,NAm
Argentina,193,25,221,8.3,SA
Armenia,21,179,11,3.8,EU
Australia,261,72,212,10.4,OC
Austria,279,75,191,9.7,EU
Azerbaijan,21,46,5,1.3,EU
Bahamas,122,176,51,6.3,NAm
Bahrain,42,63,7,2,AS
Bangladesh,0,0,0,0,AS
Barbados,143,173,36,6.3,NAm
Belarus,142,373,42,14.4,EU
Belgium,295,84,212,10.5,EU
Belize,263,114,8,6.8,NAm
Benin,34,4,13,1.1,AF
Bhutan,23,0,0,0.4,AS
Bolivia,167,41,8,3.8,SA
Bosnia-Herzegovina,76,173,8,4.6,EU
Botswana,173,35,35,5.4,AF
Brazil,245,145,16,7.2,SA
Brunei,31,2,1,0.6,AS
Bulgaria,231,252,94,10.3,EU
Burkina Faso,25,7,7,4.3,AF
Burundi,88,0,0,6.3,AF
Cote d'Ivoire,37,1,7,4,AF
Cabo Verde,144,56,16,4,AF
Cambodia,57,65,1,2.2,AS
Cameroon,147,1,4,5.8,AF
Canada,240,122,100,8.2,NAm
Central African Republic,17,2,1,1.8,AF
Chad,15,1,1,0.4,AF
Chile,130,124,172,7.6,SA
China,79,192,8,5,AS
Colombia,159,76,3,4.2,SA
Comoros,1,3,1,0.1,AF
Congo,76,1,9,1.7,AF
Cook Islands,0,254,74,5.9,OC
Costa Rica,149,87,11,4.4,NAm
Croatia,230,87,254,10.2,EU
Cuba,93,137,5,4.2,NAm
Cyprus,192,154,113,8.2,EU
Czech Republic,361,170,134,11.8,EU
North Korea,0,0,0,0,AS
DR Congo,32,3,1,2.3,AF
Denmark,224,81,278,10.4,EU
Djibouti,15,44,3,1.1,AF
Dominica,52,286,26,6.6,NAm
Dominican Republic,193,147,9,6.2,NAm
Ecuador,162,74,3,4.2,SA
Egypt,6,4,1,0.2,AF
El Salvador,52,69,2,2.2,NAm
Equatorial Guinea,92,0,233,5.8,AF
Eritrea,18,0,0,0.5,AF
Estonia,224,194,59,9.5,EU
Ethiopia,20,3,0,0.7,AF
Fiji,77,35,1,2,OC
Finland,263,133,97,10,EU
France,127,151,370,11.8,EU
Gabon,347,98,59,8.9,AF
Gambia,8,0,1,2.4,AF
Georgia,52,100,149,5.4,EU
Germany,346,117,175,11.3,EU
Ghana,31,3,10,1.8,AF
Greece,133,112,218,8.3,EU
Grenada,199,438,28,11.9,NAm
Guatemala,53,69,2,2.2,NAm
Guinea,9,0,2,0.2,AF
Guinea-Bissau,28,31,21,2.5,AF
Guyana,93,302,1,7.1,SA
Haiti,1,326,1,5.9,NAm
Honduras,69,98,2,3,NAm
Hungary,234,215,185,11.3,EU
Iceland,233,61,78,6.6,EU
India,9,114,0,2.2,AS
Indonesia,5,1,0,0.1,AS
Iran,0,0,0,0,AS
Iraq,9,3,0,0.2,AS
Ireland,313,118,165,11.4,EU
Israel,63,69,9,2.5,AS
Italy,85,42,237,6.5,EU
Jamaica,82,97,9,3.4,NAm
Japan,77,202,16,7,AS
Jordan,6,21,1,0.5,AS
Kazakhstan,124,246,12,6.8,AS
Kenya,58,22,2,1.8,AF
Kiribati,21,34,1,1,OC
Kuwait,0,0,0,0,AS
Kyrgyzstan,31,97,6,2.4,AS
Laos,62,0,123,6.2,AS
Latvia,281,216,62,10.5,EU
Lebanon,20,55,31,1.9,AS
Lesotho,82,29,0,2.8,AF
Liberia,19,152,2,3.1,AF
Libya,0,0,0,0,AF
Lithuania,343,244,56,12.9,EU
Luxembourg,236,133,271,11.4,EU
Madagascar,26,15,4,0.8,AF
Malawi,8,11,1,1.5,AF
Malaysia,13,4,0,0.3,AS
Maldives,0,0,0,0,AS
Mali,5,1,1,0.6,AF
Malta,149,100,120,6.6,EU
Marshall Islands,0,0,0,0,OC
Mauritania,0,0,0,0,AF
Mauritius,98,31,18,2.6,AF
Mexico,238,68,5,5.5,NAm
Micronesia,62,50,18,2.3,OC
Monaco,0,0,0,0,EU
Mongolia,77,189,8,4.9,AS
Montenegro,31,114,128,4.9,EU
Morocco,12,6,10,0.5,AF
Mozambique,47,18,5,1.3,AF
Myanmar,5,1,0,0.1,AS
Namibia,376,3,1,6.8,AF
Nauru,49,0,8,1,OC
Nepal,5,6,0,0.2,AS
Netherlands,251,88,190,9.4,EU
New Zealand,203,79,175,9.3,OC
Nicaragua,78,118,1,3.5,NAm
Niger,3,2,1,0.1,AF
Nigeria,42,5,2,9.1,AF
Niue,188,200,7,7,OC
Norway,169,71,129,6.7,EU
Oman,22,16,1,0.7,AS
Pakistan,0,0,0,0,AS
Palau,306,63,23,6.9,OC
Panama,285,104,18,7.2,NAm
Papua New Guinea,44,39,1,1.5,OC
Paraguay,213,117,74,7.3,SA
Peru,163,160,21,6.1,SA
Philippines,71,186,1,4.6,AS
Poland,343,215,56,10.9,EU
Portugal,194,67,339,11,EU
Qatar,1,42,7,0.9,AS
South Korea,140,16,9,9.8,AS
Moldova,109,226,18,6.3,EU
Romania,297,122,167,10.4,EU
Russian Federation,247,326,73,11.5,AS
Rwanda,43,2,0,6.8,AF
St. Kitts & Nevis,194,205,32,7.7,NAm
St. Lucia,171,315,71,10.1,NAm
St. Vincent & the Grenadines,120,221,11,6.3,NAm
Samoa,105,18,24,2.6,OC
San Marino,0,0,0,0,EU
Sao Tome & Principe,56,38,140,4.2,AF
Saudi Arabia,0,5,0,0.1,AS
Senegal,9,1,7,0.3,AF
Serbia,283,131,127,9.6,EU
Seychelles,157,25,51,4.1,AF
Sierra Leone,25,3,2,6.7,AF
Singapore,60,12,11,1.5,AS
Slovakia,196,293,116,11.4,EU
Slovenia,270,51,276,10.6,EU
Solomon Islands,56,11,1,1.2,OC
Somalia,0,0,0,0,AF
South Africa,225,76,81,8.2,AF
Spain,284,157,112,10,EU
Sri Lanka,16,104,0,2.2,AS
Sudan,8,13,0,1.7,AF
Suriname,128,178,7,5.6,SA
Swaziland,90,2,2,4.7,AF
Sweden,152,60,186,7.2,EU
Switzerland,185,100,280,10.2,EU
Syria,5,35,16,1,AS
Tajikistan,2,15,0,0.3,AS
Thailand,99,258,1,6.4,AS
Macedonia,106,27,86,3.9,EU
Timor-Leste,1,1,4,0.1,AS
Togo,36,2,19,1.3,AF
Tonga,36,21,5,1.1,OC
Trinidad & Tobago,197,156,7,6.4,NAm
Tunisia,51,3,20,1.3,AF
Turkey,51,22,7,1.4,AS
Turkmenistan,19,71,32,2.2,AS
Tuvalu,6,41,9,1,OC
Uganda,45,9,0,8.3,AF
Ukraine,206,237,45,8.9,EU
United Arab Emirates,16,135,5,2.8,AS
United Kingdom,219,126,195,10.4,EU
Tanzania,36,6,1,5.7,AF
USA,249,158,84,8.7,NAm
Uruguay,115,35,220,6.6,SA
Uzbekistan,25,101,8,2.4,AS
Vanuatu,21,18,11,0.9,OC
Venezuela,333,100,3,7.7,SA
Vietnam,111,2,1,2,AS
Yemen,6,0,0,0.1,AS
Zambia,32,19,4,2.5,AF
Zimbabwe,64,18,4,4.7,AF
--------------------------------------------------------------------------------
/data/imdb_movie_ratings_top_1000.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/imdb_movie_ratings_top_1000.csv
--------------------------------------------------------------------------------
/data/imdb_movie_urls.csv:
--------------------------------------------------------------------------------
1 | http://www.imdb.com/title/tt1856010/
http://www.imdb.com/title/tt0816692/
http://www.imdb.com/title/tt1826940/
http://www.imdb.com/title/tt0993846/
http://www.imdb.com/title/tt0285403/
http://www.imdb.com/title/tt2084970/
http://www.imdb.com/title/tt2980516/
http://www.imdb.com/title/tt0386676/
http://www.imdb.com/title/tt1266020/
--------------------------------------------------------------------------------
/data/sales.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/sales.db
--------------------------------------------------------------------------------
/data/vehicles.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/data/vehicles.db
--------------------------------------------------------------------------------
/data/vehicles_test.csv:
--------------------------------------------------------------------------------
1 | price,year,miles,doors,type
2 | 3000,2003,130000,4,truck
3 | 6000,2005,82500,4,car
4 | 12000,2010,60000,2,car
5 |
--------------------------------------------------------------------------------
/data/vehicles_train.csv:
--------------------------------------------------------------------------------
1 | price,year,miles,doors,type
2 | 22000,2012,13000,2,car
3 | 14000,2010,30000,2,car
4 | 13000,2010,73500,4,car
5 | 9500,2009,78000,4,car
6 | 9000,2007,47000,4,car
7 | 4000,2006,124000,2,car
8 | 3000,2004,177000,4,car
9 | 2000,2004,209000,4,truck
10 | 3000,2003,138000,2,car
11 | 1900,2003,160000,4,car
12 | 2500,2003,190000,2,truck
13 | 5000,2001,62000,4,car
14 | 1800,1999,163000,2,truck
15 | 1300,1997,138000,4,car
16 |
--------------------------------------------------------------------------------
/homework/02_command_line_hw_soln.md:
--------------------------------------------------------------------------------
1 | ## Command Line Homework Solution
2 | #### The following solution assumes you are working from the class "DAT5" directory.
3 | * How many text messages are there?
4 | * Answer: 5574
5 | * Code: `wc data/SMSSpamCollection.txt` gives you the line count, word count, and character count
6 |
7 | * What is the average number of words per text? What is the average number of characters per text?
8 | * Answer: Words per text: 15.6 or 16.6 (see below for explanation); Characters per text: 85.7 or 81.9 (see below)
9 | * Code:
10 | * `wc data/SMSSpamCollection.txt` gives you the line count, word count, and character count. You can divide the word count by the line count (so the number of words in each line which represents one text) to get 92482/5574 = 16.6 words per text. However, if you want to be more technical about it, each line contains an extra word that is not part of the text, the "spam" or "ham" label. You could remove the number of "spam"/"ham" labels (one per line) from the total word count to get (92482 - 5574)/5574 = 15.6.
11 | * Similarly, using the line count and character count from the `wc` command, you can divide the character count by the line count to get 477907/5574 = 85.7. If you remove the characters counted for the "spam" and "ham" labels, you get (477907 - 4*(# of hams) - 5*(# of spams) )/5574 = (477907 - 4*(4827) - 5*(747) )/5574 = 81.9. **Note**: The point of this wasn't to necessarily get the exact numbers but to identify that you can use `wc` to get a quick idea of features and labels in your data without having to open it.
12 |
13 |
14 | * How many messages are spam? How many are ham?
15 | * Answer: Spam: 747 Ham: 4827
16 | * Code:
17 | * `grep -w 'ham' data/SMSSpamCollection.txt | wc` gives you the line count of lines labeled 'ham' in the file.
18 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc` gives you the line count of lines labeled 'spam' in the file.
19 |
20 | * Is there a difference between the number of words per text and characters per text in messages that are spam vs. those that are ham? What are these numbers?
21 | * Answer: Yes, there is a difference: the "spam" texts have notably more words per text and characters per text. Using the simplified calculations (i.e. not removing the "spam" and "ham" labels from the word counts), we get the following numbers.
22 | ```
23 | Words per Text Char per Text
24 | Ham: 15.3 76.6
25 | Spam: 24.9 145.12
26 | ```
27 | * Code:
28 | * `grep -w 'ham' data/SMSSpamCollection.txt | wc` gives the line, word, and character count for all of the lines labeled 'ham'. You can divide the word count by line count to get the 'Words per Text' and divide the character count by line count to get the 'Characters per Text'.
29 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc` gives the line, word, and character count for all of the lines labeled 'spam'. You can divide the word count by line count to get the 'Words per Text' and divide the character count by line count to get the 'Characters per Text'.
30 |
31 | * **Bonus**: If you feel that this is too easy, research the `awk` command to learn how to calculate and print out these averages in the console.
32 | * Answer: See below
33 | * Code:
34 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc | awk '{print "Words per text: "$2/$1}'` will give you the words per text. Notice the format of `awk` here: you tell it to print a label followed by a calculation on numbered columns. `wc` prints three columns (lines, words, and characters), which `awk` references as $1, $2, and $3. For the words per text (i.e. per line), we divide the second column by the first.
35 | * `grep -w 'spam' data/SMSSpamCollection.txt | wc | awk '{print "Characters per text: "$3/$1}'` will give you the characters per text.
36 |
37 | * Separate the spam and ham messages into files "spam_messages.txt" and "ham_messages.txt".
38 | * Answer: The code below accomplishes this.
39 | * Code:
40 | * `grep -w 'ham' data/SMSSpamCollection.txt > ham_messages.txt` takes the output of the `grep`, which is all of the lines labeled ham, and puts it into a file called `ham_messages.txt` using the `>` operator.
41 | * `grep -w 'spam' data/SMSSpamCollection.txt > spam_messages.txt` takes the output of the `grep`, which is all of the lines labeled spam, and puts it into a file called `spam_messages.txt` using the `>` operator.
42 |
--------------------------------------------------------------------------------
/homework/03_pandas_hw_soln.py:
--------------------------------------------------------------------------------
1 | '''
2 | Exploratory Data Analysis Homework Solution
3 | '''
4 |
5 | '''
6 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
7 | to complete the following parts. Please turn in your code for each part.
8 | Before each code chunk, give a brief description (one line) of what the code is
9 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If
10 | the code output produces a plot or answers a question, give a brief
11 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for
12 | group A is higher than the mean for group B which means X,Y,Z").
13 | '''
14 |
15 | '''
16 | Part 1
17 | Load the data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
18 | into a DataFrame. Try looking at the "head" of the file in the command line
19 | to see how the file is delimited and how to load it.
20 | Note: You do not need to turn in any command line code you may use.
21 | '''
22 |
23 | # Imports
24 | import pandas as pd
25 |
26 | # Reads text file and uses '|' as separator
27 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|')
28 | auto = pd.read_table('auto_mpg.txt', sep='|') # Use this instead if you are reading the file from your computer
29 | # Note: This assumes that '.../DAT5/data' is your working directory.
30 |
31 |
32 | '''
33 | Part 2
34 | Get familiar with the data. Answer the following questions:
35 | - What is the shape of the data? How many rows and columns are there?
36 | - What variables are available?
37 | - What are the ranges for the values in each numeric column?
38 | - What is the average value for each column? Does that differ significantly
39 | from the median?
40 | '''
41 |
42 | auto.shape # There are 392 rows and 9 columns
43 |
44 | auto.columns # This lists the column names that are available
45 | auto.info() # This lists the column names as well as their data type.
46 |
47 | # You can infer the range from the information available in describe
48 | auto.describe() # This will give you the five number summary for all numeric variables
49 | auto.min(numeric_only=True) # This will give you all of the minimums for numeric variables
50 | auto.max(numeric_only=True) # This will give you all of the maximums for numeric variables
51 | # You can calculate the range with the above info as shown below.
52 | auto.max(numeric_only=True) - auto.min(numeric_only=True) # Range
53 |
54 | auto.mean() # Means for all numeric variables
55 | auto.median() # Medians for all numeric variables
56 | # How much greater is the mean than the median?
57 | auto.mean() - auto.median()
58 | # The means are somewhat greater than the medians.
59 |
60 |
61 | '''
62 | Part 3
63 | Use the data to answer the following questions:
64 | - Which 5 cars get the best gas mileage?
65 | - Which 5 cars with more than 4 cylinders get the best gas mileage?
66 | - Which 5 cars get the worst gas mileage?
67 | - Which 5 cars with 4 or fewer cylinders get the worst gas mileage?
68 | '''
69 |
70 | # 5 cars that get best gas mileage
71 | auto.sort_index(by='mpg', ascending=False)[0:5][['car_name','mpg']]
72 |
73 | # 5 cars with more than 4 cylinders that get the best gas mileage
74 | auto[auto.cylinders > 4].sort_index(by='mpg', ascending=False)[0:5][['car_name','mpg']]
75 |
76 | # 5 cars that get worst gas mileage
77 | auto.sort_index(by='mpg')[0:5][['car_name','mpg']]
78 |
79 | # 5 cars with 4 or fewer cylinders that get the worst gas mileage
80 | auto[auto.cylinders <= 4].sort_index(by='mpg')[0:5][['car_name','mpg']]
81 |
82 |
83 | '''
84 | Part 4
85 | Use groupby and aggregations to explore the relationships
86 | between mpg and the other variables. Which variables seem to have the greatest
87 | effect on mpg?
88 | Some examples of things you might want to look at are:
89 | - What is the mean mpg for cars for each number of cylinders (i.e. 3 cylinders,
90 | 4 cylinders, 5 cylinders, etc)?
91 | - Did mpg rise or fall over the years contained in this dataset?
92 | - What is the mpg for the group of lighter cars vs the group of heavier cars?
93 | Note: Be creative in the ways in which you divide up the data. You are trying
94 | to create segments of the data using logical filters and comparing the mpg
95 | for each segment of the data.
96 | '''
97 |
98 | # Mean mpg for cars for each number of cylinders
99 | auto.groupby(by='cylinders').mpg.mean()
100 |
101 | # Mpg generally rose over the years contained in this dataset
102 | auto.groupby(by='model_year').mpg.mean()
103 |
104 | # The mpg for the group of lighter cars vs the group of heavier cars
105 | # We can divide the dataset in half by the median (the lower half being the
106 | # lighter cars and the upper half being the heavier cars).
107 | auto[auto.weight <= auto.weight.median()].mpg.mean() # light cars mean mpg
108 | auto[auto.weight > auto.weight.median()].mpg.mean() # heavier cars mean mpg
109 | # It appears that the lighter cars get better gas mileage than the heavier cars
110 |
111 | # This question was pretty open ended, but here are some other things you could have looked at
112 |
113 | # The average mpg for the four quartiles of displacement
114 | # We didn't talk about the 'quantile' function in class, but it's a useful one!
115 | auto[auto.displacement <= auto.displacement.quantile(0.25)].mpg.mean()
116 | auto[(auto.displacement > auto.displacement.quantile(0.25)) & (auto.displacement <= auto.displacement.quantile(0.50))].mpg.mean()
117 | auto[(auto.displacement > auto.displacement.quantile(0.50)) & (auto.displacement <= auto.displacement.quantile(0.75))].mpg.mean()
118 | auto[auto.displacement > auto.displacement.quantile(0.75)].mpg.mean()
119 | # It appears that as engine displacement (size) increases, the average mpg decreases. This makes sense.
120 |
121 | # Instead of using the somewhat complicated logic of the 'quantile', you can easily divide your dataset
122 | # into buckets using the `cut` function.
123 | auto.groupby(pd.cut(auto.horsepower,5)).mpg.mean()
124 | # It appears that as horsepower increases, the average mpg decreases. This makes sense.
125 |
126 | auto.groupby(pd.cut(auto.acceleration, 5)).mpg.mean()
127 | # It appears that as acceleration increases, the average mpg increases.
128 |
129 |
130 | '''
131 | I'll also include something I found particularly cool from Lloyd's homework.
132 | He wanted to look at how MPG has changed over time, but he also wanted to consider
133 | how specific groups have changed. He wanted to look at low, mid, and high power
134 | cars based upon their horsepower and see how these groups have changed over time.
135 | His code is below. In his data, he called the original dataset 'auto'.
136 | '''
137 | # Now to look at how efficiency has changed over time based on power and weight classes,
138 | # two things that we know play a large role in gas mileage. First, we create a table of
139 | # efficiency by power class and year.
140 |
141 | horsey = pd.DataFrame()
142 |
143 | # Defines low power as below 100 horsepower
144 | horsey['low_power'] = auto[(auto.horsepower < 100)].groupby('model_year').mpg.mean()
145 |
146 | # Defines mid power as between 100 and 150 (inclusive) horsepower
147 | horsey['mid_power'] = auto[(auto.horsepower >= 100) & (auto.horsepower <= 150)].groupby('model_year').mpg.mean()
148 |
149 | # Defines high power as above 150 horsepower
150 | horsey['high_power'] = auto[auto.horsepower > 150].groupby('model_year').mpg.mean()
151 | '''
152 | low_power mid_power high_power
153 | model_year
154 | 70 23.300000 18.333333 13.076923
155 | 71 26.357143 17.285714 13.333333
156 | 72 23.500000 15.000000 12.857143
157 | 73 22.166667 16.352941 12.727273
158 | 74 27.312500 15.500000 NaN
159 | 75 22.470588 17.500000 16.000000
160 | 76 25.750000 17.071429 15.500000
161 | 77 28.433333 18.100000 15.666667
162 | 78 28.363158 19.350000 17.700000
163 | 79 29.225000 20.266667 16.900000
164 | 80 34.516667 28.100000 NaN
165 | 81 31.372727 25.833333 NaN
166 | 82 32.607143 23.500000 NaN
167 |
168 | We can see from the data here that low power cars have seen much better gains in efficiency than
169 | mid or high power cars. I then wanted to see how much car weights have changed in that same time.
170 | '''
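171 |
172 | # A sketch of the follow-up he describes (how car weights changed over those same years);
173 | # the original follow-up code was not included in this solution file:
174 | auto.groupby('model_year').weight.mean()
175 | # and broken out by the same power classes:
176 | horsey_weight = pd.DataFrame()
177 | horsey_weight['low_power'] = auto[auto.horsepower < 100].groupby('model_year').weight.mean()
178 | horsey_weight['mid_power'] = auto[(auto.horsepower >= 100) & (auto.horsepower <= 150)].groupby('model_year').weight.mean()
179 | horsey_weight['high_power'] = auto[auto.horsepower > 150].groupby('model_year').weight.mean()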
--------------------------------------------------------------------------------
/homework/04_visualization_hw_soln.py:
--------------------------------------------------------------------------------
1 | '''
2 | Visualization Homework Solution
3 | '''
4 |
5 | '''
6 | Use the automotive mpg data (https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt)
7 | to complete the following parts. Please turn in your code for each part.
8 | Before each code chunk, give a brief description (one line) of what the code is
9 | doing (e.g. "Loads the data" or "Creates scatter plot of mpg and weight"). If
10 | the code output produces a plot or answers a question, give a brief
11 | interpretation of the output (e.g. "This plot shows X,Y,Z" or "The mean for
12 | group A is higher than the mean for group B which means X,Y,Z").
13 | '''
14 | # Imports
15 | import pandas as pd
16 | import numpy as np
17 | import matplotlib.pyplot as plt
18 |
19 | # Reads text file and uses '|' as separator
20 | auto = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/auto_mpg.txt', sep='|')
21 |
22 | '''
23 | Part 1
24 | Produce a plot that compares the mean mpg for the different numbers of cylinders.
25 | '''
26 |
27 | # The first part of creating this plot is to generate the appropriate data.
28 | # Since we want mean mpg FOR EACH number of cylinders, we should use a 'groupby'.
29 | auto.groupby('cylinders').mpg.mean() # Give us mean mpg for each cylinder count
30 |
31 | # Now that we have the data we want, we can think about how to plot it.
32 | # The keyword 'compare' indicates that you probably want to use a bar chart.
33 | auto.groupby('cylinders').mpg.mean().plot(kind='bar') # Create plot from data
34 | plt.title("Comparing Mean MPG for Different Numbers of Cylinders") # Add title
35 | plt.xlabel("Number of Cylinders") # Add x label
36 | plt.ylabel("Average MPG") # Add y label
37 | plt.show() # Show plot
38 | # With the exception of the three cylinder car (of which there are only 4),
39 | # we can see that mean mpg decreases as number of cylinders increases.
40 |
41 |
42 | '''
43 | Part 2
44 | Use a scatter matrix to explore relationships between different numeric variables.
45 | '''
46 | pd.scatter_matrix(auto) # Generate scatter matrix
47 | pd.scatter_matrix(auto, c=auto.mpg) # Consider adding color to your scatter matrix too.
48 | plt.show() # Show plot
49 | '''
50 | There are several things to notice here. First, we can talk about different
51 | variables' relationships with mpg. Looking across the top row, where mpg is on
52 | the y axis, we see that there is a clearly negative relationship between mpg
53 | and number of cylinders, displacement, horsepower, and weight. There is a
54 | clearly positive relationship between mpg and model_year. There is a vaguely
55 | positive relationship between mpg and acceleration, though it's not a very
56 | clear one. There also seems to be a weakly positive relationship between mpg
57 | and origin.
58 |
59 | There are also several other relationships you may notice:
60 | * Displacement and horsepower have a positive relationship. This makes sense,
61 | because horsepower should increase as the engine volume (displacement) gets
62 | larger.
63 | * Displacement and weight have a positive relationship. This makes sense,
64 | because heavier cars tend to need bigger engines.
65 | * Horsepower and weight have a positive relationship. This makes sense,
66 | because larger cars tend to have higher horsepower engines.
67 |
68 | There may be other inferences you could draw from this plot as well, but this
69 | demonstrates the usefulness of the scatter matrix in understanding your data
70 | visually.
71 | '''
72 |
73 |
74 | '''
75 | Part 3
76 | Use a plot to answer the following questions:
77 | '''
78 |
79 | '''
80 | -Do heavier or lighter cars get better mpg?
81 | '''
82 | # Since we want to look at the relationship between two numeric variables, we
83 | # can use a scatterplot to see how they "move" with each other.
84 | auto.plot(kind='scatter', x='weight', y='mpg', alpha=0.5) # Create scatter plot
85 | plt.title('Car MPG by Weight')
86 | plt.xlabel('Car weight')
87 | plt.ylabel('MPG')
88 | plt.show()
89 | # From the plot, it appears lighter cars get better mpg. As weight increases,
90 | # mpg decreases.
91 |
92 | '''
93 | -How are horsepower and displacement related?
94 | '''
95 | # Once again, since we want to look at the relationship between two numeric
96 | # variables, we can use a scatterplot.
97 | # Notice that I didn't specify whether displacement or horsepower should be on
98 | # the x-axis. However, using my (limited) domain expertise, I would think that
99 | # horsepower is affected by the displacement of the engine. So I put
100 | # displacement on the x-axis and horsepower on the y-axis.
101 | auto.plot(kind='scatter', x='displacement', y='horsepower', alpha=0.5)
102 | plt.title('Horsepower by Engine Displacement')
103 | plt.xlabel('Engine Displacement')
104 | plt.ylabel('Horsepower ')
105 | plt.show()
106 | # This plot shows that displacement and horsepower have a positive relationship.
107 |
108 | '''
109 | -What does the distribution of acceleration look like?
110 | '''
111 | # Since I'm interested in the distribution of acceleration, I can use a
112 | # histogram to investigate that.
113 | auto.acceleration.hist()
114 | plt.title('Distribution of Acceleration')
115 | plt.xlabel('Acceleration')
116 | plt.ylabel('Frequency')
117 | plt.show()
118 | # We can see that acceleration has an almost normal distribution. The most
119 | # frequent value of acceleration is around 16. The values of acceleration
120 | # range from 8 to 25.
121 |
122 | '''
123 | -How is mpg spread for cars with different numbers of cylinders?
124 | '''
125 | # Since we are interested in the spread (as in the range of different values)
126 | # for each of the different cylinder counts, we should use a boxplot as it
127 | # illustrates the spread of a numeric variable and accepts the "by" parameter,
128 | # which allows us to generate a plot for each value of a variable.
129 | auto.boxplot('mpg', by='cylinders')
130 | plt.title('Car MPG by Number of Cylinders')
131 | plt.xlabel('Number of Cylinders')
132 | plt.ylabel('MPG')
133 | plt.show()
134 | # This plot gives us a lot of information. I'll list a few things to notice:
135 | # * The range for 3 cylinders is pretty small, which might be because there are
136 | # 4 observations.
137 | # * As shown in our earlier plots, mpg decreases as number of cylinders increases.
138 | # * Interestingly, there are 4 cylinder cars that get relatively low gas mileage.
139 | # * Over half of the 4 cylinder cars get better mpg than all of the 8 cylinder cars.
140 |
141 | '''
142 | -Do cars made before or after 1975 get better average mpg? (Hint: You need to
143 | create a new column that encodes whether a year is before or after 1975.)
144 | '''
145 | # There are several different ways to do this one. The most straightforward
146 | # way could be to create a new column called 'before_1975' that contains a
147 | # 'Before 1975' or 'After 1975'. We'll include 1975 in 'After 1975'.
148 | auto['before_1975'] = np.where(auto.model_year < 75,'Before 1975', 'After 1975')
149 | # Remember that np.where is like the IF function in Excel:
150 | # np.where(condition, value_if_true, value_if_false)
151 |
152 | # Now we can get the data we need by use a group by.
153 | auto.groupby('before_1975').mpg.mean().plot(kind='bar')
154 | plt.title('Average MPG Before and After 1975')
155 | plt.xlabel('')
156 | plt.ylabel('Average MPG')
157 | plt.show()
158 |
159 | # The labels are a little cut off, so you can use some extra matplotlib for
160 | # formatting. 'set_xticklabels' lets you set the tick label text and rotation.
161 | auto.groupby('before_1975').mpg.mean().plot(kind='bar').set_xticklabels(['After 1975','Before 1975'], rotation=0)
162 | plt.title('Average MPG Before and After 1975')
163 | plt.xlabel('')
164 | plt.ylabel('Average MPG')
165 | plt.show()
166 | # We can see that the average mpg for cars after 1975 is higher.
167 |
168 | # This could have been done without creating the extra variable.
169 | auto.groupby(auto.model_year < 75).mpg.mean().plot(kind='bar').set_xticklabels(['After 1975','Before 1975'], rotation=0)
170 | plt.title('Average MPG Before and After 1975')
171 | plt.xlabel('')
172 | plt.ylabel('Average MPG')
173 | plt.show()
174 | # We get the same results but without the intermediate step.
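175 |
176 | # A possible extra view (a sketch, not part of the original solution): a boxplot shows the
177 | # full mpg distribution before and after 1975, rather than just the means.
178 | auto.boxplot('mpg', by='before_1975')
179 | plt.title('MPG Distribution Before and After 1975')
180 | plt.suptitle('')  # remove the automatic 'Boxplot grouped by...' super-title
181 | plt.xlabel('')
182 | plt.ylabel('MPG')
183 | plt.show()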
--------------------------------------------------------------------------------
/homework/06_bias_variance.md:
--------------------------------------------------------------------------------
1 | ## Class 6 Pre-work: Bias-Variance Tradeoff
2 |
3 | Read this excellent article, [Understanding the Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html), and be prepared to **discuss it in class** on Monday.
4 |
5 | **Note:** You can ignore sections 4.2 and 4.3.
6 |
7 | Here are some questions to think about while you read:
8 | * In the Party Registration example, what are the features? What is the response? Is this a regression or classification problem?
9 | * Conceptually, how is KNN being applied to this problem to make a prediction?
10 | * How do the four visualizations in section 3 relate to one another? Change the value of K using the slider, and make sure you understand what changed in the visualizations (and why it changed).
11 | * In figures 4 and 5, what do the lighter colors versus the darker colors mean? How is the darkness calculated?
12 | * What does the black line in figure 5 represent? What predictions would an ideal machine learning model make, with respect to this line?
13 | * Choose a very small value of K, and click the button "Generate New Training Data" a number of times. Do you "see" low variance or high variance? Do you "see" low bias or high bias?
14 | * Repeat this with a very large value of K. Do you "see" low variance or high variance? Do you "see" low bias or high bias?
15 | * Try using other values of K. What value of K do you think is "best"? How do you define "best"?
16 | * Why should we care about variance at all? Shouldn't we just minimize bias and ignore variance?
17 | * Does a high value for K cause "overfitting" or "underfitting"?
18 |
--------------------------------------------------------------------------------
/homework/07_glass_identification.md:
--------------------------------------------------------------------------------
1 | ## Class 7 Homework: Glass Identification
2 |
3 | Let's practice what we have learned using the [Glass Identification dataset](http://archive.ics.uci.edu/ml/datasets/Glass+Identification).
4 |
5 | 1. Read the data into a DataFrame.
6 | 2. Briefly explore the data to make sure the DataFrame matches your expectations.
7 | 3. Let's convert this into a binary classification problem. Create a new DataFrame column called "binary":
8 | * If type of glass = 1/2/3/4, binary = 0.
9 | * If type of glass = 5/6/7, binary = 1.
10 | 4. Create a feature matrix "X". (Think carefully about which columns are actually features!)
11 | 5. Create a response vector "y" from the "binary" column.
12 | 6. Split X and y into training and testing sets.
13 | 7. Fit a KNN model on the training set using K=5.
14 | 8. Make predictions on the testing set and calculate accuracy.
15 | 9. Calculate the "null accuracy", which is the classification accuracy that could be achieved by always predicting the majority class.
16 |
17 | **Bonus:**
18 | * Write a for loop that computes the test set accuracy for a range of K values.
19 | * Plot K versus test set accuracy to help you choose an optimal value for K.
20 |
--------------------------------------------------------------------------------
/homework/11_roc_auc.md:
--------------------------------------------------------------------------------
1 | ## Class 11 Pre-work: ROC Curves and Area Under the Curve (AUC)
2 |
3 | Before learning about ROC curves, it's important to be comfortable with the following terms: true positive, true negative, false positive, false negative, sensitivity, and specificity. If you aren't yet comfortable, Rahul Patwari has excellent videos on [Intuitive sensitivity and specificity](https://www.youtube.com/watch?v=U4_3fditnWg) (9 minutes) and [The tradeoff between sensitivity and specificity](https://www.youtube.com/watch?v=vtYDyGGeQyo) (13 minutes).
4 |
5 | Then, watch Kevin's video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Wednesday. (There's a blog post containing the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/), which might serve as a useful reference.) You can also play with the [visualization](http://www.navan.name/roc/) shown in the video. Optionally, you could also watch Rahul Patwari's video on [ROC curves](https://www.youtube.com/watch?v=21Igj5Pr6u4) (12 minutes).
6 |
7 | Here are some questions to think about:
8 |
9 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions?
10 | - What are the methods in scikit-learn that output predicted probabilities and class predictions?
11 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve?
12 | - Could you use an ROC curve for a regression problem? Why or why not?
13 | - What's another term for True Positive Rate?
14 | - If I wanted to increase specificity, how would I change the classification threshold?
15 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not?
16 | - What are the primary benefits of ROC curves over classification accuracy?
17 | - What should you do if your AUC is 0.2?
18 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? (0 = not fraudulent, 1 = fraudulent)
19 | - Let's say your classifier has a sensitivity of 0.95 and a specificity of 0.3, and the classes are balanced. Would it result in more false positives or false negatives?
20 | - What's a real-world scenario in which you would prefer a high specificity (rather than a high sensitivity) for your classifier?
21 |
--------------------------------------------------------------------------------
/homework/11_roc_auc_annotated.md:
--------------------------------------------------------------------------------
1 | ## Class 11 Pre-work: ROC Curves and Area Under the Curve (AUC)
2 |
3 | Before learning about ROC curves, it's important to be comfortable with the following terms: true positive, true negative, false positive, false negative, sensitivity, and specificity. If you aren't yet comfortable, Rahul Patwari has excellent videos on [Intuitive sensitivity and specificity](https://www.youtube.com/watch?v=U4_3fditnWg) (9 minutes) and [The tradeoff between sensitivity and specificity](https://www.youtube.com/watch?v=vtYDyGGeQyo) (13 minutes).
4 |
5 | Then, watch Kevin's video on [ROC Curves and Area Under the Curve](https://www.youtube.com/watch?v=OAl6eAyP-yo) (14 minutes), and be prepared to **discuss it in class** on Wednesday. (There's a blog post containing the [video transcript and screenshots](http://www.dataschool.io/roc-curves-and-auc-explained/), which might serve as a useful reference.) You can also play with the [visualization](http://www.navan.name/roc/) shown in the video. Optionally, you could also watch Rahul Patwari's video on [ROC curves](https://www.youtube.com/watch?v=21Igj5Pr6u4) (12 minutes).
6 |
7 | Here are some questions to think about:
8 |
9 | - If you have a classification model that outputs predicted probabilities, how could you convert those probabilities to class predictions?
10 | - Set a threshold, and classify everything above the threshold as a 1 and everything below the threshold as a 0.
11 | - What are the methods in scikit-learn that output predicted probabilities and class predictions?
12 | - predict_proba and predict.
13 | - Why are predicted probabilities (rather than just class predictions) required to generate an ROC curve?
14 | - Because an ROC curve is measuring the performance of a classifier at all possible thresholds, and thresholds only make sense in the context of predicted probabilities.
15 | - Could you use an ROC curve for a regression problem? Why or why not?
16 | - No, because ROC is a plot of TPR vs FPR, and those concepts have no meaning in a regression problem.
17 | - What's another term for True Positive Rate?
18 | - Sensitivity or recall.
19 | - If I wanted to increase specificity, how would I change the classification threshold?
20 | - Increase it.
21 | - Is it possible to adjust your classification threshold such that both sensitivity and specificity increase simultaneously? Why or why not?
22 | - No, because increasing sensitivity requires lowering the threshold while increasing specificity requires raising it, so adjusting the threshold trades one for the other.
23 | - What are the primary benefits of ROC curves over classification accuracy?
24 | - Doesn't require setting a classification threshold, allows you to visualize the performance of your classifier, works well for unbalanced classes.
25 | - What should you do if your AUC is 0.2?
26 | - Reverse your predictions so that your AUC is 0.8.
27 | - What would the plot of reds and blues look like for a dataset in which each observation was a credit card transaction, and the response variable was whether or not the transaction was fraudulent? (0 = not fraudulent, 1 = fraudulent)
28 | - The blues (non-fraudulent transactions) would vastly outnumber the reds (fraudulent transactions), with lots of overlap between the two distributions.
29 | - Let's say your classifier has a sensitivity of 0.95 and a specificity of 0.3, and the classes are balanced. Would it result in more false positives or false negatives?
30 | - False positives, meaning cases where it falsely predicted positive (the true status is negative): a specificity of 0.3 means 70% of the actual negatives are misclassified as positive, whereas a sensitivity of 0.95 means only 5% of the actual positives are missed.
31 | - What's a real-world scenario in which you would prefer a high specificity (rather than a high sensitivity) for your classifier?
32 | - Speed cameras issuing speeding tickets: you want to be very confident before ticketing someone (few false positives), even if some actual speeders are missed.
33 |
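34 | The answers above mention a few scikit-learn calls and the effect of moving the classification threshold. Here is a minimal, self-contained sketch of those ideas (it uses a synthetic dataset and logistic regression purely for illustration):
35 | 
36 | ```python
37 | import numpy as np
38 | from sklearn.datasets import make_classification
39 | from sklearn.linear_model import LogisticRegression
40 | from sklearn import metrics
41 | 
42 | # toy binary classification data (a stand-in for any real dataset)
43 | X, y = make_classification(n_samples=1000, n_features=5, random_state=1)
44 | X_train, X_test = X[:750], X[750:]
45 | y_train, y_test = y[:750], y[750:]
46 | 
47 | clf = LogisticRegression()
48 | clf.fit(X_train, y_train)
49 | 
50 | # 'predict' outputs class predictions (a 0.5 threshold applied to the probabilities)
51 | y_pred_class = clf.predict(X_test)
52 | 
53 | # 'predict_proba' outputs predicted probabilities; column 1 is the positive class
54 | y_pred_prob = clf.predict_proba(X_test)[:, 1]
55 | 
56 | # lowering the threshold below 0.5 increases sensitivity and decreases specificity
57 | y_pred_sensitive = np.where(y_pred_prob > 0.3, 1, 0)
58 | 
59 | # the ROC curve and AUC are computed from the probabilities, not the class predictions
60 | fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
61 | print(metrics.roc_auc_score(y_test, y_pred_prob))
62 | ```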
--------------------------------------------------------------------------------
/homework/13_spam_filtering.md:
--------------------------------------------------------------------------------
1 | ## Class 13 Pre-work: Spam Filtering
2 |
3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html) and be prepared to **discuss it in class on Wednesday**.
4 |
5 | Here are some questions to think about while you read:
6 | * Should a spam filter optimize for sensitivity or specificity, in Paul's opinion?
7 | * Before he tried the "statistical approach" to spam filtering, what was his approach?
8 | * What are the key components of his statistical filtering system? In other words, how does it work?
9 | * What did Paul say were some of the benefits of the statistical approach?
10 | * How good was his prediction of the "spam of the future"?
11 |
--------------------------------------------------------------------------------
/homework/13_spam_filtering_annotated.md:
--------------------------------------------------------------------------------
1 | ## Class 13 Pre-work: Spam Filtering
2 |
3 | Read Paul Graham's [A Plan for Spam](http://www.paulgraham.com/spam.html) and be prepared to **discuss it in class on Wednesday**.
4 |
5 | Here are some questions to think about while you read (a rough code sketch of the statistical filtering approach appears at the end of this document):
6 | * Should a spam filter optimize for sensitivity or specificity, in Paul's opinion?
7 | * specificity to minimize false positives
8 | * Before he tried the "statistical approach" to spam filtering, what was his approach?
9 | * hand engineering features and computing a "score"
10 | * What are the key components of his statistical filtering system? In other words, how does it work?
11 | * scan the entire text (including headers) and tokenize it
12 | * count number of occurrences of each token in ham corpus and spam corpus
13 | * assign each token a spam score based upon its relative frequency
14 | * for new mail, only take the 15 most interesting tokens into account
15 | * What did Paul say were some of the benefits of the statistical approach?
16 | * it works better (almost no false positives)
17 | * less work for him because it discovers features automatically
18 | * you know what the "score" means
19 | * can easily be tuned to the individual user
20 | * evolves with the spam
21 | * creates an implicit whitelist/blacklist of email addresses, server names, etc.
22 | * How good was his prediction of the "spam of the future"?
23 | * great!
24 |
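25 | The bullets above summarize the mechanics of Graham's filter. Here is a rough, hypothetical Python sketch of the same general idea; the tiny example corpora, the add-one smoothing, and the simple averaging of token scores are illustrative simplifications (Graham's actual combining formula is more involved):
26 | 
27 | ```python
28 | import re
29 | from collections import Counter
30 | 
31 | # toy corpora standing in for a real collection of spam and ham messages
32 | spam_corpus = ["win money now", "free money offer", "click now to win"]
33 | ham_corpus = ["meeting agenda attached", "see you at lunch", "project status update"]
34 | 
35 | def tokenize(text):
36 |     return re.findall(r"[a-z0-9']+", text.lower())
37 | 
38 | # count occurrences of each token in the spam corpus and the ham corpus
39 | spam_counts = Counter(token for doc in spam_corpus for token in tokenize(doc))
40 | ham_counts = Counter(token for doc in ham_corpus for token in tokenize(doc))
41 | 
42 | def spam_probability(token):
43 |     # spam score based on the token's relative frequency (with add-one smoothing)
44 |     spam_freq = (spam_counts[token] + 1.0) / (sum(spam_counts.values()) + 2.0)
45 |     ham_freq = (ham_counts[token] + 1.0) / (sum(ham_counts.values()) + 2.0)
46 |     return spam_freq / (spam_freq + ham_freq)
47 | 
48 | def classify(text, n_tokens=15):
49 |     # for new mail, keep only the most "interesting" tokens:
50 |     # those whose spam probability is furthest from the neutral value of 0.5
51 |     scores = [spam_probability(token) for token in set(tokenize(text))]
52 |     interesting = sorted(scores, key=lambda p: abs(p - 0.5), reverse=True)[:n_tokens]
53 |     combined = sum(interesting) / len(interesting)
54 |     return 'spam' if combined > 0.5 else 'ham'
55 | 
56 | print(classify("free money if you click now"))
57 | ```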
--------------------------------------------------------------------------------
/notebooks/11_titanic_exercise.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:557342100eb7ce91ca76e7e4f24737943f3625640543427282904a15759174c8"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "# Titanic Exercise"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "collapsed": false,
21 | "input": [
22 | "import pandas as pd\n",
23 | "from sklearn.cross_validation import train_test_split, cross_val_score\n",
24 | "from sklearn.linear_model import LogisticRegression\n",
25 | "from sklearn import metrics\n",
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "%matplotlib inline"
29 | ],
30 | "language": "python",
31 | "metadata": {},
32 | "outputs": []
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "## Preparing the data"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Read in the data and look at the first 10 rows."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "collapsed": false,
51 | "input": [],
52 | "language": "python",
53 | "metadata": {},
54 | "outputs": []
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "Check for missing values."
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "collapsed": false,
66 | "input": [],
67 | "language": "python",
68 | "metadata": {},
69 | "outputs": []
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "We are going to focus on Pclass, Sex, Age, and Embarked:\n",
76 | "\n",
77 | "- **Pclass:** leave as-is\n",
78 | "- **Sex:** convert \"male\" to 0 and \"female\" to 1\n",
79 | "- **Age:** fill in missing values using the mean\n",
80 | "- **Embarked:** create dummy variables"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "collapsed": false,
86 | "input": [],
87 | "language": "python",
88 | "metadata": {},
89 | "outputs": []
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "Create X and y using the features we have chosen."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "collapsed": false,
101 | "input": [],
102 | "language": "python",
103 | "metadata": {},
104 | "outputs": []
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "## Train/Test Split"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "Split X and y into training and testing sets."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "collapsed": false,
123 | "input": [],
124 | "language": "python",
125 | "metadata": {},
126 | "outputs": []
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "## Logistic Regression"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "Fit a logistic regression model on the training data."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "collapsed": false,
145 | "input": [],
146 | "language": "python",
147 | "metadata": {},
148 | "outputs": []
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "Print the model's intercept."
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "collapsed": false,
160 | "input": [],
161 | "language": "python",
162 | "metadata": {},
163 | "outputs": []
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "Print the model's coefficients. How do we interpret them?"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "collapsed": false,
175 | "input": [],
176 | "language": "python",
177 | "metadata": {},
178 | "outputs": []
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "Predict the probability of survival for the first person in X_train using scikit-learn."
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "collapsed": false,
190 | "input": [],
191 | "language": "python",
192 | "metadata": {},
193 | "outputs": []
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "Do this same calculation manually."
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "collapsed": false,
205 | "input": [],
206 | "language": "python",
207 | "metadata": {},
208 | "outputs": []
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "Pretend this person was 10 years older, and calculate their probability of survival (manually)."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "collapsed": false,
220 | "input": [],
221 | "language": "python",
222 | "metadata": {},
223 | "outputs": []
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "Pretend this person was a woman, and calculate their probability of survival (manually)."
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "collapsed": false,
235 | "input": [],
236 | "language": "python",
237 | "metadata": {},
238 | "outputs": []
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "metadata": {},
243 | "source": [
244 | "## Model Evaluation"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "Make predictions on the testing data and calculate the accuracy."
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "collapsed": false,
257 | "input": [],
258 | "language": "python",
259 | "metadata": {},
260 | "outputs": []
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "Compare this to the null accuracy."
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "collapsed": false,
272 | "input": [],
273 | "language": "python",
274 | "metadata": {},
275 | "outputs": []
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "Print the confusion matrix. Does this model tend towards specificity or sensitivity?"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "collapsed": false,
287 | "input": [],
288 | "language": "python",
289 | "metadata": {},
290 | "outputs": []
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "Calculate the specificity and the sensitivity."
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "collapsed": false,
302 | "input": [],
303 | "language": "python",
304 | "metadata": {},
305 | "outputs": []
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "Change the threshold to make the model more sensitive, then print the new confusion matrix."
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "collapsed": false,
317 | "input": [],
318 | "language": "python",
319 | "metadata": {},
320 | "outputs": []
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "Recalculate the specificity and the sensitivity."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "collapsed": false,
332 | "input": [],
333 | "language": "python",
334 | "metadata": {},
335 | "outputs": []
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "Plot the ROC curve. How can we interpret the results?"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "collapsed": false,
347 | "input": [],
348 | "language": "python",
349 | "metadata": {},
350 | "outputs": []
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "Calculate the AUC."
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "collapsed": false,
362 | "input": [],
363 | "language": "python",
364 | "metadata": {},
365 | "outputs": []
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {},
370 | "source": [
371 | "## Cross-Validation"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "Use cross-validation to check the AUC for the current model."
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "collapsed": false,
384 | "input": [],
385 | "language": "python",
386 | "metadata": {},
387 | "outputs": []
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "Remove Embarked from the model and check AUC again using cross-validation."
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "collapsed": false,
399 | "input": [],
400 | "language": "python",
401 | "metadata": {},
402 | "outputs": []
403 | }
404 | ],
405 | "metadata": {}
406 | }
407 | ]
408 | }
--------------------------------------------------------------------------------
/notebooks/13_naive_bayes_spam.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:b7e3a62e1216c53fa3e7d0fa56c5373dfe7f58c3817a1468b0abbc52dfe7b6a7"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "# Applying Naive Bayes classification to spam email"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "Let's pretend we have an email with three words: \"Send money now.\" We want to classify that email as ham or spam.\n",
23 | "\n",
24 | "We'll use Naive Bayes classification:\n",
25 | "\n",
26 | "$$P(spam | \\text{send money now}) = \\frac {P(\\text{send money now} | spam) \\times P(spam)} {P(\\text{send money now})}$$\n",
27 | "\n",
28 | "By assuming that the features (the words) are conditionally independent, we can simplify the likelihood function:\n",
29 | "\n",
30 | "$$P(spam | \\text{send money now}) \\approx \\frac {P(\\text{send} | spam) \\times P(\\text{money} | spam) \\times P(\\text{now} | spam) \\times P(spam)} {P(\\text{send money now})}$$\n",
31 | "\n",
32 | "We could calculate all of the values in the numerator by examining a corpus of spam:\n",
33 | "\n",
34 | "$$P(spam | \\text{send money now}) \\approx \\frac {0.2 \\times 0.1 \\times 0.1 \\times 0.9} {P(\\text{send money now})} = \\frac {0.0018} {P(\\text{send money now})}$$\n",
35 | "\n",
36 | "We could repeat this process to calculate the probability that the email is ham:\n",
37 | "\n",
38 | "$$P(ham | \\text{send money now}) \\approx \\frac {0.05 \\times 0.01 \\times 0.1 \\times 0.1} {P(\\text{send money now})} = \\frac {0.000005} {P(\\text{send money now})}$$\n",
39 | "\n",
40 | "All we care about is whether spam or ham has the higher probability, and so we predict that the email is spam.\n",
41 | "\n",
42 | "What have we learned from this exercise?\n",
43 | "\n",
44 | "- The \"naive\" assumption of Naive Bayes (that the features are conditionally independent) is critical to making these calculations simple.\n",
45 | "- The normalization constant (the denominator) can be ignored since it's the same for all classes.\n",
46 | "- The prior probability is basically irrelevant once you have a lot of features.\n",
47 | "- The Naive Bayes classifier can handle a lot of irrelevant features."
48 | ]
49 | },
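50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "Below is a quick numeric check of the calculation above. The probabilities (0.2, 0.1, and so on) are the same made-up values used in the example, not estimates from a real corpus."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "collapsed": false,
60 | "input": [
61 | "# numerators of the Naive Bayes calculation (the denominator is the same for both classes)\n",
62 | "spam_score = 0.2 * 0.1 * 0.1 * 0.9    # P(send|spam) * P(money|spam) * P(now|spam) * P(spam)\n",
63 | "ham_score = 0.05 * 0.01 * 0.1 * 0.1   # P(send|ham) * P(money|ham) * P(now|ham) * P(ham)\n",
64 | "print(spam_score)\n",
65 | "print(ham_score)\n",
66 | "print('spam' if spam_score > ham_score else 'ham')"
67 | ],
68 | "language": "python",
69 | "metadata": {},
70 | "outputs": []
71 | }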
50 | ],
51 | "metadata": {}
52 | }
53 | ]
54 | }
--------------------------------------------------------------------------------
/notebooks/18_regularization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:283eafa4edacfbbb8b51d404c8feab98319104a044aaa4138d97957373762033"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Overfitting, revisited\n",
16 | "\n",
17 | "What is overfitting? Here are a few ways of explaining it:\n",
18 | "\n",
19 | "- Building a model that matches the training set too closely.\n",
20 | "- Building a model that does well on the training data, but doesn't generalize to out-of-sample data.\n",
21 | "- Learning from the noise in the data, rather than just the signal."
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Overfitting\n",
29 | "\n",
30 | "
"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "### Underfitting vs Overfitting\n",
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "**What are some ways to overfit the data?**\n",
46 | "\n",
47 | "- Train and test on the same data\n",
48 | "- Create a model that is overly complex (one that doesn't generalize well)\n",
49 | " - Example: KNN in which K is too low\n",
50 | " - Example: Decision tree that is grown too deep\n",
51 | "\n",
52 | "An overly complex model has **low bias** but **high variance**."
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "## Linear Regression, revisited\n",
60 | "\n",
61 | "**Question:** Are linear regression models high bias/low variance, or low bias/high variance?\n",
62 | "\n",
63 | "**Answer:** High bias/low variance (generally speaking)\n",
64 | "\n",
65 | "Great! So as long as we don't train and test on the same data, we don't have to worry about overfitting, right? Not so fast...."
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "### Overfitting with Linear Regression (part 1)\n",
73 | "\n",
74 | "Linear models can overfit if you include irrelevant features.\n",
75 | "\n",
76 | "**Question:** Why would that be the case?\n",
77 | "\n",
78 | "**Answer:** Because it will learn a coefficient for any feature you feed into the model, regardless of whether that feature has the signal or the noise.\n",
79 | "\n",
80 | "This is especially a problem when **p (number of features) is close to n (number of observations)**, because that model will naturally have high variance."
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### Overfitting with Linear Regression (part 2)\n",
88 | "\n",
89 | "Linear models can also overfit when the included features are highly correlated. From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares):\n",
90 | "\n",
91 | "> \"...coefficient estimates for Ordinary Least Squares rely on the independence of the model terms. When terms are correlated and the columns of the design matrix X have an approximate linear dependence, the design matrix becomes close to singular and as a result, the least-squares estimate becomes highly sensitive to random errors in the observed response, producing a large variance.\""
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "### Overfitting with Linear Regression (part 3)\n",
99 | "\n",
100 | "Linear models can also overfit if the coefficients are too large.\n",
101 | "\n",
102 | "**Question:** Why would that be the case?\n",
103 | "\n",
104 | "**Answer:** Because the larger the absolute value of the coefficient, the more power it has to change the predicted response. Thus it tends toward high variance, which can result in overfitting."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Regularization\n",
112 | "\n",
113 | "Regularization is a method for \"constraining\" or \"regularizing\" the size of the coefficients, thus \"shrinking\" them towards zero. It tends to reduce variance more than it increases bias, and thus minimizes overfitting.\n",
114 | "\n",
115 | "Common regularization techniques for linear models:\n",
116 | "\n",
117 | "- **Ridge regression** (also known as \"L2 regularization\"): shrinks coefficients toward zero (but they never reach zero)\n",
118 | "- **Lasso regularization** (also known as \"L1 regularization\"): shrinks coefficients all the way to zero\n",
119 | "- **ElasticNet regularization**: balance between Ridge and Lasso\n",
120 | "\n",
121 | "Lasso regularization is useful if we believe many features are irrelevant, since a feature with a zero coefficient is essentially removed from the model. Thus, it is a useful technique for feature selection.\n",
122 | "\n",
123 | "How does regularization work?\n",
124 | "\n",
125 | "- A tuning parameter alpha (or sometimes lambda) imposes a penalty on the size of coefficients.\n",
126 | "- Instead of minimizing the \"loss function\" (mean squared error), it minimizes the \"loss plus penalty\".\n",
127 | "- A tiny alpha imposes no penalty on the coefficient size, and is equivalent to a normal linear model.\n",
128 | "- Increasing the alpha penalizes the coefficients and shrinks them toward zero."
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Bias-variance trade-off\n",
136 | "\n",
137 | "Our goal is to locate the optimum model complexity, and thus regularization is useful when we believe our model is too complex.\n",
138 | "\n",
139 | "
"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Standardizing features\n",
147 | "\n",
148 | "It's usually recommended to standardize your features when using regularization.\n",
149 | "\n",
150 | "**Question:** Why would that be the case?\n",
151 | "\n",
152 | "**Answer:** If you don't standardize, features would be penalized simply because of their scale. Also, standardizing avoids penalizing the intercept (which wouldn't make intuitive sense)."
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "### Ridge vs Lasso Coefficient Plots\n",
160 | "\n",
161 | "Below is a visualization of what happens when you apply regularization. The general idea is that you are restricting the \"space\" in which your coefficients can be fit. This means you are shriking the coefficient space. You still want the coefficients that give the \"best\" model (as determined by you metric, e.g. RMSE, accuracy, AUC, etc), but you are restricting the area in which you can evaluate coefficients.\n",
162 | "\n",
163 | "In this specific image, we are fitting a model with two predictors, B1 and B2. The x-axis shows B1 and the y-axis shows B2. There is a third dimension here, our evaluation metric. For the sake of example, we can assume this is linear regression, so we are trying to minimize our Root Mean Squared Error (RMSE). B-hat represents the set of coefficients, B1 and B2, where RMSE is minimized. While this is the \"best\" model according to our criterion, we've imposed a penalty that restricts the coefficients to the blue box. So we want to find the point (representing two coeffcients B1 and B2) where RMSE is minimized within our blue box. Technically, the RMSE will be higher here, but it will be the lowest within our penalized box. Due to the shape or space for the regression problem and the shape of our penalty box, many of the \"optimal\" coefficients will be close to zero for Ridge Regression and exactly zero for LASSO Regression.\n",
164 | "\n",
165 | "
"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### Ridge vs Lasso path diagrams\n",
173 | "\n",
174 | "Larger alpha (on the left here) means more regularization, which means more coefficients close to zero.\n",
175 | "
"
176 | ]
177 | },
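178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "Below is a small code sketch of the ideas above. It fits Ridge and Lasso on a synthetic dataset (generated purely for illustration) after standardizing the features, and prints the coefficients for a small and a large alpha. For a large enough alpha, Lasso typically sets some coefficients exactly to zero, while Ridge only shrinks them toward zero."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "collapsed": false,
188 | "input": [
189 | "import numpy as np\n",
190 | "from sklearn.datasets import make_regression\n",
191 | "from sklearn.preprocessing import StandardScaler\n",
192 | "from sklearn.linear_model import Ridge, Lasso\n",
193 | "\n",
194 | "# synthetic data: 10 features, only 3 of which actually carry signal\n",
195 | "X, y = make_regression(n_samples=100, n_features=10, n_informative=3, noise=10, random_state=1)\n",
196 | "\n",
197 | "# standardize the features so the penalty treats them on the same scale\n",
198 | "X_scaled = StandardScaler().fit_transform(X)\n",
199 | "\n",
200 | "for alpha in [0.1, 10]:\n",
201 | "    ridge = Ridge(alpha=alpha).fit(X_scaled, y)\n",
202 | "    lasso = Lasso(alpha=alpha).fit(X_scaled, y)\n",
203 | "    print('alpha: ' + str(alpha))\n",
204 | "    print('Ridge coefficients: ' + str(np.round(ridge.coef_, 2)))\n",
205 | "    print('Lasso coefficients: ' + str(np.round(lasso.coef_, 2)))"
206 | ],
207 | "language": "python",
208 | "metadata": {},
209 | "outputs": []
210 | }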
178 | ],
179 | "metadata": {}
180 | }
181 | ]
182 | }
--------------------------------------------------------------------------------
/notebooks/images/18_bias_variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_bias_variance.png
--------------------------------------------------------------------------------
/notebooks/images/18_overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_overfitting.png
--------------------------------------------------------------------------------
/notebooks/images/18_ridge_lasso_path.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_ridge_lasso_path.png
--------------------------------------------------------------------------------
/notebooks/images/18_ridge_lasso_regression_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_ridge_lasso_regression_coefficients.png
--------------------------------------------------------------------------------
/notebooks/images/18_underfitting_overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/18_underfitting_overfitting.png
--------------------------------------------------------------------------------
/notebooks/images/cross_validation_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/cross_validation_diagram.png
--------------------------------------------------------------------------------
/notebooks/images/cross_validation_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/cross_validation_example.png
--------------------------------------------------------------------------------
/notebooks/images/estimating_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/estimating_coefficients.png
--------------------------------------------------------------------------------
/notebooks/images/obama_clinton_tree.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/obama_clinton_tree.jpg
--------------------------------------------------------------------------------
/notebooks/images/overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/overfitting.png
--------------------------------------------------------------------------------
/notebooks/images/r_squared.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/r_squared.png
--------------------------------------------------------------------------------
/notebooks/images/salary_color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_color.png
--------------------------------------------------------------------------------
/notebooks/images/salary_regions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_regions.png
--------------------------------------------------------------------------------
/notebooks/images/salary_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_tree.png
--------------------------------------------------------------------------------
/notebooks/images/salary_tree_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_tree_annotated.png
--------------------------------------------------------------------------------
/notebooks/images/salary_unpruned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/salary_unpruned.png
--------------------------------------------------------------------------------
/notebooks/images/slope_intercept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/slope_intercept.png
--------------------------------------------------------------------------------
/notebooks/images/train_test_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/train_test_split.png
--------------------------------------------------------------------------------
/notebooks/images/training_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/training_error.png
--------------------------------------------------------------------------------
/notebooks/images/tree_titanic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_titanic.png
--------------------------------------------------------------------------------
/notebooks/images/tree_vehicles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_vehicles.png
--------------------------------------------------------------------------------
/notebooks/images/tree_vs_linear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/tree_vs_linear.png
--------------------------------------------------------------------------------
/notebooks/images/underfitting_overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/notebooks/images/underfitting_overfitting.png
--------------------------------------------------------------------------------
/other/peer_review.md:
--------------------------------------------------------------------------------
1 | ## Peer Review Guidelines
2 |
3 | You will be assigned to review the project drafts of two of your peers. You will have one week to provide them with feedback.
4 |
5 | Expectations:
6 | * Read everything they wrote!
7 | * If they provided their data, review it and try to understand it.
8 | * Read their code and try to understand their thought process.
9 | * If their code can be run, try running it.
10 | * Spend at least one hour reviewing their project (including the time it takes to write the feedback).
11 |
12 | Your feedback would ideally consist of:
13 | * Strengths of their project (things you particularly like about it)
14 | * Comments about things you think could be improved
15 | * Questions about things you don't understand
16 | * Comments about their code
17 | * Suggestions for next steps
18 | * Guiding principle: Give feedback that would be helpful to you if it were your project!
19 |
20 | You should take a quick glance through their project as soon as possible, to make sure you understand what they have given you and what files you should be reviewing. If you're unclear, ask them about it!
21 |
--------------------------------------------------------------------------------
/other/project.md:
--------------------------------------------------------------------------------
1 | # Course Project
2 |
3 |
4 | ## Overview
5 |
6 | The final project should represent significant original work applying data science techniques to an interesting problem. Final projects are individual efforts, but you should be talking frequently with your instructors and classmates about them.
7 |
8 | Address a data-related problem in your professional field or a field you're interested in. Pick a subject that you're passionate about; if you're strongly interested in the subject matter it'll be more fun for you and you'll produce a better project!
9 |
10 | To stimulate your thinking, here is an excellent list of [public data sources](public_data.md). Using public data is the most common choice. If you have access to private data, that's also an option, though you'll have to be careful about what results you can release. You are also welcome to compete in a [Kaggle competition](http://www.kaggle.com/) as your project, in which case the data will be provided to you.
11 |
12 | You should also take a look at [past projects](https://github.com/justmarkham/DAT-project-examples) from other GA Data Science students, to get a sense of the variety and scope of projects.
13 |
14 |
15 | ## Project Milestones
16 |
17 |
18 | ### March 30: Deadline for Discussing Project Ideas with an Instructor
19 |
20 | By March 30, you should talk with one of your instructors about your project idea(s). They can help you to choose between different project ideas, advise you on the appropriate scope for your project, and ensure that your project question might reasonably be answerable using the data science tools and techniques taught in the course.
21 |
22 |
23 | ### April 6: Project Question and Dataset
24 |
25 | Create a GitHub repository for your project. It should include a document that answers these questions:
26 |
27 | What is the question you hope to answer? What data are you planning to use to answer that question? What do you know about the data so far? Why did you choose this topic?
28 |
29 | Example:
30 | * I'm planning to predict passenger survival on the Titanic.
31 | * I have Kaggle's Titanic dataset with 10 passenger characteristics.
32 | * I know that many of the fields have missing values, that some of the text fields are messy and will require cleaning, and that about 38% of the passengers in the training set survive.
33 | * I chose this topic because I'm fascinated by the history of the Titanic.
34 |
35 |
36 | ### April 27: Project Presentation #1: Data Exploration and Analysis Plan
37 |
38 | You'll be giving a presentation to the class about the work you have done so far, as well as your plans for the project going forward. Your presentation should use slides (or a similar format). Your slides, exploratory code, and visualizations should be included in your GitHub repository. Here are some questions that you should address in your presentation:
39 |
40 | What data have you gathered, and how did you gather it? What steps have you taken to explore the data? Which areas of the data have you cleaned, and which areas still need cleaning? What insights have you gained from your exploration? Will you be able to answer your question with this data, or do you need to gather more data (or adjust your question)? How might you use modeling to answer your question?
41 |
42 | Example:
43 | * I've created visualizations and numeric summaries to explore how survivability differs by passenger characteristic, and it appears that gender and class have a large role in determining survivability.
44 | * I estimated missing values for age using the titles provided in the Name column.
45 | * I created features to represent "spouse on board" and "child on board" by further analyzing names.
46 | * I think that the fare and ticket columns might be useful for predicting survival, but I still need to clean those columns.
47 | * I analyzed the differences between the training and testing sets, and found that the average fare was slightly higher in the testing set.
48 | * Since I'm predicting a binary outcome, I plan to use a classification method such as logistic regression to make my predictions.
49 |
50 |
51 | ### May 18: First Draft Due
52 |
53 | **At a minimum**, your project repository on GitHub should contain:
54 | * A draft of your project paper (in the format specified [below](#june-3-project-presentation-2))
55 | * Code, with lots of comments
56 | * Visualizations of your data
57 |
58 | **Ideally**, you would also include:
59 | * Draft slides for presentation #2
60 | * Data and data dictionary
61 |
62 | Your peers and instructors will provide feedback by May 25, according to [these guidelines](peer_review.md).
63 |
64 | **Tips for success:**
65 | * The work should stand "on its own", and should not depend upon the reader remembering your first project presentation.
66 | * The better you explain your project, and the easier it is to follow, the more useful feedback you will receive!
67 | * If your reviewers can actually run your code on the provided data, they will be able to give you more useful feedback on your code. (It can be very hard to make useful code suggestions on code that can't be run!)
68 |
69 |
70 | ### June 3: Project Presentation #2
71 |
72 | Your **project paper** should be written with a technical audience in mind. Here are the components you should cover:
73 |
74 | * Problem statement and hypothesis
75 | * Description of your data set and how it was obtained
76 | * Description of any pre-processing steps you took
77 | * What you learned from exploring the data, including visualizations
78 | * How you chose which features to use in your analysis
79 | * Details of your modeling process, including how you selected your models and validated them
80 | * Your challenges and successes
81 | * Possible extensions or business applications of your project
82 | * Conclusions and key learnings
83 |
84 | Your **presentation** should cover these components with less breadth and depth. Focus on creating an engaging, clear, and informative presentation that tells the story of your project and is suitable for a non-technical audience.
85 |
86 | Your project repository on GitHub should contain the following:
87 |
88 | * **Project paper:** any format (PDF, Markdown, etc.)
89 | * **Presentation slides:** any format (PDF, PowerPoint, Google Slides, etc.)
90 | * **Code:** commented Python scripts, and any other code you used in the project
91 | * **Visualizations:** integrated into your paper and/or slides
92 | * **Data:** data files in "raw" or "processed" format
93 | * **Data dictionary (aka "code book"):** description of each variable, including units
94 |
95 | If it's not possible or practical to include your entire dataset, you should link to your data source and provide a sample of the data. (GitHub has a [size limit](https://help.github.com/articles/what-is-my-disk-quota/) of 100 MB per file and 1 GB per repository.) If your data is private, you can either include an "anonymized" version of your data or create a private GitHub repository.
96 |
--------------------------------------------------------------------------------
/other/public_data.md:
--------------------------------------------------------------------------------
1 | ## Public Data Sources
2 |
3 | * Open data catalogs from various governments and NGOs:
4 | * [NYC Open Data](https://nycopendata.socrata.com/)
5 | * [DC Open Data Catalog](http://data.dc.gov/) / [OpenDataDC](http://www.opendatadc.org/)
6 | * [DataLA](https://data.lacity.org/)
7 | * [data.gov](https://www.data.gov/) (see also: [Project Open Data Dashboard](http://data.civicagency.org/))
8 | * [data.gov.uk](http://data.gov.uk/)
9 | * [US Census Bureau](http://www.census.gov/)
10 | * [World Bank Open Data](http://data.worldbank.org/)
11 | * [Humanitarian Data Exchange](http://docs.hdx.rwlabs.org/)
12 | * [Sunlight Foundation](http://sunlightfoundation.com/api/): government-focused data
13 | * [ProPublica Data Store](https://projects.propublica.org/data-store/)
14 | * Datasets hosted by academic institutions:
15 | * [UC Irvine Machine Learning Repository](http://archive.ics.uci.edu/ml/): datasets specifically designed for machine learning
16 | * [Stanford Large Network Dataset Collection](http://snap.stanford.edu/data/): graph data
17 | * [Inter-university Consortium for Political and Social Research](http://www.icpsr.umich.edu/)
18 | * [Pittsburgh Science of Learning Center's DataShop](http://www.learnlab.org/technologies/datashop/)
19 | * [Academic Torrents](http://academictorrents.com/): distributed network for sharing large research datasets
20 | * [Dataverse Project](http://dataverse.org/): searchable archive of research data
21 | * Datasets hosted by private companies:
22 | * [Quandl](https://www.quandl.com/): over 10 million financial, economic, and social datasets
23 | * [Amazon Web Services Public Data Sets](http://aws.amazon.com/datasets/)
24 | * [Kaggle](http://www.kaggle.com/) provides datasets with their challenges, but each competition has its own rules as to whether the data can be used outside of the scope of the competition.
25 | * Big lists of datasets:
26 | * [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets): Well-organized and frequently updated
27 | * [Rdatasets](http://vincentarelbundock.github.io/Rdatasets/): collection of 700+ datasets originally distributed with R packages
28 | * [RDataMining.com](http://www.rdatamining.com/resources/data)
29 | * [KDnuggets](http://www.kdnuggets.com/datasets/index.html)
30 | * [inside-R](http://www.inside-r.org/howto/finding-data-internet)
31 | * [100+ Interesting Data Sets for Statistics](http://rs.io/2014/05/29/list-of-data-sets.html)
32 | * [20 Free Big Data Sources](http://smartdatacollective.com/bernardmarr/235366/big-data-20-free-big-data-sources-everyone-should-know)
33 | * [Sebastian Raschka](https://github.com/rasbt/pattern_classification/blob/master/resources/dataset_collections.md): datasets categorized by format and topic
34 | * APIs:
35 | * [Apigee](https://apigee.com/providers): explore dozens of popular APIs
36 | * [Mashape](https://www.mashape.com/): explore hundreds of APIs
37 | * [Python APIs](http://www.pythonforbeginners.com/api/list-of-python-apis): Python wrappers for many APIs
38 | * Other interesting datasets:
39 | * [FiveThirtyEight](https://github.com/fivethirtyeight/data): data and code related to their articles
40 | * [The Upshot](https://github.com/TheUpshot/): data related to their articles
41 | * [Yelp Dataset Challenge](http://www.yelp.com/dataset_challenge): Yelp reviews, business attributes, users, and more from 10 cities
42 | * [Donors Choose](http://data.donorschoose.org/open-data/overview/): data related to their projects
43 | * [200,000+ Jeopardy questions](http://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/)
44 | * [CrowdFlower](http://www.crowdflower.com/data-for-everyone): interesting datasets created or enhanced by their contributors
45 | * [UFO reports](https://github.com/planetsig/ufo-reports): geolocated and time-standardized UFO reports for close to a century
46 | * [Reddit Top 2.5 Million](https://github.com/umbrae/reddit-top-2.5-million): all-time top 1,000 posts from each of the top 2,500 subreddits
47 | * Other resources:
48 | * [Datasets subreddit](http://www.reddit.com/r/datasets/): ask for help finding a specific data set, or post your own
49 | * [Center for Data Innovation](http://www.datainnovation.org/category/publications/data-set-blog/): blog posts about interesting, recently-released data sets.
50 |
51 | This is just the tip of the iceberg; there's a lot of data out there!
52 |
--------------------------------------------------------------------------------
/other/resources.md:
--------------------------------------------------------------------------------
1 | # Resources for Continued Learning
2 |
3 |
4 | ## Blogs
5 |
6 | * [Simply Statistics](http://simplystatistics.org/): Written by the Biostatistics professors at Johns Hopkins University who also run Coursera's [Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1)
7 | * [yhat's blog](http://blog.yhathq.com/): Beginner-friendly content, usually in Python
8 | * [No Free Hunch](http://blog.kaggle.com/) (Kaggle's blog): Mostly interviews with competition winners, or updates on their competitions
9 | * [FastML](http://fastml.com/): Various machine learning content, often with code
10 | * [Edwin Chen](http://blog.echen.me/): Infrequently updated, but long and thoughtful pieces
11 | * [FiveThirtyEight](http://fivethirtyeight.com/): Tons of timely data-related content
12 | * [Machine Learning Mastery](http://machinelearningmastery.com/blog/): Frequent posts on machine learning, very accessible
13 | * [Data School](http://www.dataschool.io/): Kevin Markham's blog! Beginner-focused, with reference guides and videos
14 | * [MLWave](http://mlwave.com/): Detailed posts on Kaggle competitions, by a Kaggle Master
15 | * [Data Science 101](http://101.datascience.community/): Short, frequent content about all aspects of data science
16 | * [ML in the Valley](http://ml.posthaven.com/): Thoughtful pieces by the Director of Analytics at Codecademy
17 |
18 |
19 | ## Aggregators
20 |
21 | * [DataTau](http://www.datatau.com/): Like [Hacker News](https://news.ycombinator.com/), but for data
22 | * [MachineLearning on reddit](http://www.reddit.com/r/MachineLearning/): Very active subreddit
23 | * [Quora's Machine Learning section](http://www.quora.com/Machine-Learning): Lots of interesting Q&A
24 | * [Quora's Data Science topic FAQ](https://www.quora.com/What-is-the-Data-Science-topic-FAQ)
25 | * [KDnuggets](http://www.kdnuggets.com/): Data mining news, jobs, classes and more
26 |
27 |
28 | ## DC Data Groups
29 |
30 | * [Data Community DC](http://www.datacommunitydc.org/): Coordinates six local data-related meetup groups
31 | * [District Data Labs](http://www.districtdatalabs.com/): Offers courses and other projects to local data scientists
32 |
33 |
34 | ## Online Classes
35 |
36 | * [Coursera's Data Science Specialization](https://www.coursera.org/specialization/jhudatascience/1): Nine courses (running every month) and a Capstone project, taught in R
37 | * [Stanford's Statistical Learning](http://online.stanford.edu/course/statistical-learning): By the authors of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) and [Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/), taught in R, highly recommended (preview the [lecture videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/))
38 | * [Coursera's Machine Learning](https://www.coursera.org/learn/machine-learning/): Andrew Ng's acclaimed course, taught in MATLAB/Octave
39 | * [Caltech's Learning from Data](http://work.caltech.edu/telecourse.html): Widely praised, not language-specific
40 | * [Udacity's Data Analyst Nanodegree](https://www.udacity.com/course/nd002): Project-based curriculum using Python, R, MapReduce, MongoDB
41 | * [Coursera's Data Mining Specialization](https://www.coursera.org/specialization/datamining/20): New specialization that began February 2015
42 | * [Coursera's Natural Language Processing](https://www.coursera.org/course/nlp): No upcoming sessions, but [lectures](https://class.coursera.org/nlp/lecture) and [slides](http://web.stanford.edu/~jurafsky/NLPCourseraSlides.html) are available
43 | * [SlideRule's Data Analysis Learning Path](https://www.mysliderule.com/learning-paths/data-analysis): Curated content from various online classes
44 | * [Udacity's Intro to Artificial Intelligence](https://www.udacity.com/course/intro-to-artificial-intelligence--cs271): Taught by Peter Norvig and Sebastian Thrun
45 | * [Coursera's Neural Networks for Machine Learning](https://www.coursera.org/course/neuralnets): Taught by Geoffrey Hinton, no upcoming sessions
46 | * [statistics.com](http://www.statistics.com/data-science/): Many online courses in data science
47 | * [CourseTalk](http://www.coursetalk.com/): Read reviews of online courses
48 |
49 |
50 | ## Online Content from Offline Classes
51 |
52 | * [Harvard's CS109 Data Science](http://cs109.github.io/2014/): Similar topics as General Assembly's course
53 | * [Columbia's Data Mining Class](http://www2.research.att.com/~volinsky/DataMining/Columbia2011/Columbia2011.html): Excellent slides
54 | * [Harvard's CS171 Visualization](http://www.cs171.org/2015/index.html): Includes programming in D3
55 |
56 |
57 | ## Face-to-Face Educational Programs
58 |
59 | * [Comparison of data science bootcamps](http://yet-another-data-blog.blogspot.com/2014/04/data-science-bootcamp-landscape-full.html): Up-to-date list maintained by a Zipfian Academy graduate
60 | * [The Complete List of Data Science Bootcamps & Fellowships](http://www.skilledup.com/articles/list-data-science-bootcamps/)
61 | * [Galvanize](http://www.galvanize.com/) (acquired [Zipfian Academy](http://www.zipfianacademy.com/)): Offers Data Science Immersive (Denver, Seattle, San Francisco)
62 | * [GalvanizeU](http://www.galvanizeu.com/): Offers Master of Engineering in Big Data (San Francisco)
63 | * [Data Science Retreat](http://datascienceretreat.com/): Primarily uses R (Berlin)
64 | * [Metis Data Science Bootcamp](http://www.thisismetis.com/data-science): Newer bootcamp (New York)
65 | * [Persontyle](http://www.persontyle.com/): Various course offerings (based in London)
66 | * [Software Carpentry](http://software-carpentry.org/): Two-day workshops, primarily for researchers and hosted by universities (worldwide)
67 | * [Colleges and Universities with Data Science Degrees](http://datascience.community/colleges)
68 |
69 |
70 | ## Conferences
71 |
72 | * [Knowledge Discovery and Data Mining (KDD)](http://www.kdd.org/): Hosted by ACM
73 | * [O'Reilly Strata + Hadoop World](http://strataconf.com/): Big focus on "big data" (San Jose, London, New York)
74 | * [PyData](http://pydata.org/): For developers and users of Python data tools (worldwide)
75 | * [PyCon](https://us.pycon.org/): For developers and users of Python (Portland in 2016)
76 |
77 |
78 | ## Books
79 |
80 | * [An Introduction to Statistical Learning with Applications in R](http://www-bcf.usc.edu/~gareth/ISL/) (free PDF)
81 | * [Elements of Statistical Learning](http://www-stat.stanford.edu/~tibs/ElemStatLearn/) (free PDF)
82 | * [Think Stats](http://www.greenteapress.com/thinkstats/) (free PDF or HTML)
83 | * [Mining of Massive Datasets](http://www.mmds.org/) (free PDF)
84 | * [Python for Informatics](http://www.pythonlearn.com/book.php) (free PDF or HTML)
85 | * [Statistics: Methods and Applications](http://www.statsoft.com/Textbook) (free HTML)
86 | * [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)
87 | * [Data Smart: Using Data Science to Transform Information into Insight](http://www.amazon.com/gp/product/111866146X/)
88 | * [Sams Teach Yourself SQL in 10 Minutes](http://www.amazon.com/Sams-Teach-Yourself-Minutes-Edition/dp/0672336073)
89 |
90 |
91 | ## Other Resources
92 |
93 | * [Open Source Data Science Masters](https://github.com/datasciencemasters/go): Huge list of resources
94 | * [Data Science Trello Board](https://trello.com/b/rbpEfMld/data-science): Another list of resources
95 | * [The Hitchhiker's Guide to Python](http://docs.python-guide.org/en/latest/): Online guide to understanding Python and getting good at it
96 | * [Python Reference](https://github.com/rasbt/python_reference): Python tips, tutorials, and more
97 | * [videolectures.net](http://videolectures.net/Top/Computer_Science/): Tons of academic videos
98 | * [Metacademy](http://www.metacademy.org/list): Quick summary of many machine learning terms, with links to resources for learning more
99 | * [Terms in data science defined in one paragraph](https://github.com/rasbt/pattern_classification/blob/master/resources/data_glossary.md)
100 |
--------------------------------------------------------------------------------
/slides/01_course_overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/01_course_overview.pdf
--------------------------------------------------------------------------------
/slides/01_course_overview.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/01_course_overview.pptx
--------------------------------------------------------------------------------
/slides/02_git_github.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/02_git_github.pdf
--------------------------------------------------------------------------------
/slides/02_git_github.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/02_git_github.pptx
--------------------------------------------------------------------------------
/slides/04_apis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_apis.pdf
--------------------------------------------------------------------------------
/slides/04_apis.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_apis.pptx
--------------------------------------------------------------------------------
/slides/04_visualization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_visualization.pdf
--------------------------------------------------------------------------------
/slides/04_visualization.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/04_visualization.pptx
--------------------------------------------------------------------------------
/slides/05_intro_to_data_science.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_intro_to_data_science.pdf
--------------------------------------------------------------------------------
/slides/05_intro_to_data_science.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_intro_to_data_science.pptx
--------------------------------------------------------------------------------
/slides/05_machine_learning_knn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_machine_learning_knn.pdf
--------------------------------------------------------------------------------
/slides/05_machine_learning_knn.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/05_machine_learning_knn.pptx
--------------------------------------------------------------------------------
/slides/08_web_scraping.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/08_web_scraping.pdf
--------------------------------------------------------------------------------
/slides/08_web_scraping.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/08_web_scraping.pptx
--------------------------------------------------------------------------------
/slides/10_logistic_regression_confusion_matrix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/10_logistic_regression_confusion_matrix.pdf
--------------------------------------------------------------------------------
/slides/10_logistic_regression_confusion_matrix.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/10_logistic_regression_confusion_matrix.pptx
--------------------------------------------------------------------------------
/slides/11_drawing_roc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/11_drawing_roc.pdf
--------------------------------------------------------------------------------
/slides/11_drawing_roc.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/11_drawing_roc.pptx
--------------------------------------------------------------------------------
/slides/13_bayes_theorem.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_bayes_theorem.pdf
--------------------------------------------------------------------------------
/slides/13_bayes_theorem.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_bayes_theorem.pptx
--------------------------------------------------------------------------------
/slides/13_naive_bayes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_naive_bayes.pdf
--------------------------------------------------------------------------------
/slides/13_naive_bayes.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/13_naive_bayes.pptx
--------------------------------------------------------------------------------
/slides/15_kaggle.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/15_kaggle.pdf
--------------------------------------------------------------------------------
/slides/15_kaggle.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/15_kaggle.pptx
--------------------------------------------------------------------------------
/slides/18_clustering.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/18_clustering.pdf
--------------------------------------------------------------------------------
/slides/18_clustering.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/18_clustering.pptx
--------------------------------------------------------------------------------
/slides/20_sales_db_schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sales_db_schema.png
--------------------------------------------------------------------------------
/slides/20_sql.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sql.pdf
--------------------------------------------------------------------------------
/slides/20_sql.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/DAT5/87aa6d195393db67ba33e3e170b9f4c8662cdea4/slides/20_sql.pptx
--------------------------------------------------------------------------------