├── .gitignore
├── baby_names
│   └── name_search.py
├── naive_bayes
│   ├── app.py
│   ├── classify.py
│   ├── naive_bayes.py
│   ├── sample.py
│   ├── test_baby.json
│   ├── test_food.json
│   ├── test_home.json
│   ├── test_pet.json
│   ├── test_tool.json
│   ├── train_baby.json
│   ├── train_food.json
│   ├── train_home.json
│   ├── train_pet.json
│   └── train_tool.json
├── scraping
│   ├── data_scrapy.py
│   ├── helpers.py
│   ├── page.html
│   ├── requesting_html.py
│   ├── requirements.txt
│   ├── scraping_html.py
│   └── selenium_test.py
├── sklearn_classify
│   └── classify.py
└── tourstats
    ├── analyze.py
    ├── distance_vs_putts.csv
    ├── driving_distance.html
    ├── driving_vs_putts.py
    ├── gather.py
    ├── models.py
    ├── models.pyc
    └── seed.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | naive_bayes/data
3 | tourstats/stats*
4 | tourstats/pgatourstats.zip
5 |
6 |
7 | # We don't want to save the data files,
8 | # just the code to scrape them, if that's what you're looking for.
9 | *.json
10 | *.csv
11 |
12 | *.npy
13 | *.pkl
14 | *.html
15 | *.zip
16 | *.png
17 |
18 | *.pyc
19 | *.log
20 | scraping/pages
21 | scraping/texts
22 | scraping/scraping
23 |
--------------------------------------------------------------------------------
/baby_names/name_search.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from collections import Counter
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 | vowels = ('A', 'E', 'I', 'O', 'U')
7 | import string
8 | alphabet = [letter for letter in string.ascii_uppercase]
9 | boy = 'boy'
10 | girl = 'girl'
11 |
12 | def gather_names(gender):
13 | filename = "%s_names.html" % gender
14 | names = []
15 | with open(filename, 'r') as file:
16 | page = file.read()
17 | html = BeautifulSoup(page.replace('\n',''), 'html.parser')
18 | #remove tags with class=tm-embedded-post-container
19 | #so the ad isn't included in text
20 | for tag in html.find_all('div', class_="tm-embedded-post-container"):
21 | tag.decompose()
22 | for name_link in html.find_all("li", class_="p1"):
23 | name = name_link.text.upper()
24 | names.append(name)
25 | return names
26 |
27 | boy_names = gather_names(boy)
28 | girl_names = gather_names(girl)
29 |
30 | def gender_names(gender):
31 | if gender == boy:
32 | return boy_names
33 | elif gender == girl:
34 | return girl_names
35 |
36 | def calculate_replace_letter_matches(name_set, first_letter, exchange_letter):
37 | '''
38 | Returns list of sets with name matches
39 | '''
40 | name_matches = []
41 | for name in name_set:
42 | if first_letter in name:
43 | exchange_name = name.replace(first_letter, exchange_letter)
44 | temp_name_matches = [name]
45 | if exchange_name in name_set:
46 | temp_name_matches.append(exchange_name)
47 | if len(set(temp_name_matches)) > 1:
48 | name_matches.append(set(temp_name_matches))
49 |
50 | return name_matches
51 |
52 | def replace_single_letter(first_letter='I', exchange_letter='Y', show_matches=True):
53 | boy_name_set = set(boy_names)
54 | boy_name_matches = calculate_replace_letter_matches(boy_name_set, first_letter, exchange_letter)
55 | print 'Boy name matches: %s' % len(boy_name_matches)
56 | if show_matches:
57 | print boy_name_matches
58 |
59 | girl_name_set = set(girl_names)
60 | girl_name_matches = calculate_replace_letter_matches(girl_name_set, first_letter, exchange_letter)
61 | print 'Girl name matches: %s' % len(girl_name_matches)
62 | if show_matches:
63 | print girl_name_matches
64 |
65 | def npr_solver(gender):
66 | print "Vowel Consonant Consonant Starting names for %ss" % gender
67 | names = gender_names(gender)
68 | vowel_starters = []
69 | consonant_starters = []
70 | for name in names:
71 | first_letter = name[0]
72 | if first_letter in vowels:
73 | vowel_starters.append(name)
74 | else:
75 | consonant_starters.append(name)
76 |
77 | for vname in vowel_starters:
78 | cname_same = []
79 | for cname in consonant_starters:
80 | if vname[1:] == cname[1:]:
81 | cname_same.append(cname)
82 | if cname_same:
83 | print vname
84 | for match in cname_same:
85 | print match
86 |
87 | def rhyming_names(gender):
88 | print "Rhyming for %ss" % gender
89 | total_matches = []
90 | names = gender_names(gender)
91 | for name in names:
92 | name_same = []
93 | for name2 in names:
94 | if name[1:] == name2[1:] and name != name2:
95 | name_same.append(name2)
96 | if name_same:
97 | name_same.append(name)
98 | if set(name_same) not in total_matches:
99 | total_matches.append(set(name_same))
100 | print "Total %s matches: %s" % (gender, len(total_matches))
101 | for matches in total_matches:
102 | print list(matches),
103 | print #actual new line
104 |
105 | def vowel_consonant_beginning_proportion(gender):
106 | print "Vowel Consonant Beginning Ratio for %ss" % gender
107 | names = gender_names(gender)
108 | vowel_starters = []
109 | consonant_starters = []
110 |
111 | for name in names:
112 | first_letter = name[0]
113 | if first_letter in vowels:
114 | vowel_starters.append(name)
115 | else:
116 | consonant_starters.append(name)
117 |
118 | vowel_len = float(len(vowel_starters))
119 | consonant_len = float(len(consonant_starters))
120 | print vowel_len / (vowel_len + consonant_len)
121 |
122 | def name_letter_begin_or_end(gender, index='beginning'):
123 |     if index == 'beginning':
124 |         idx = 0  #first letter of the name
125 |     elif index == 'end':
126 |         idx = -1
127 |     else:
128 |         print 'index must be "beginning" or "end"'
129 |         return
130 | print "Name letter %s for %ss" % (index, gender)
131 | names = gender_names(gender)
132 |
133 | cnt = Counter()
134 | for name in names:
135 |         letter = name[idx]
136 | cnt[letter] += 1
137 | return cnt
138 |
139 | def name_lengths_counter(gender):
140 | names = gender_names(gender)
141 |
142 | cnt = Counter()
143 | for name in names:
144 | cnt[len(name)] += 1
145 | return cnt
146 |
147 | def name_lengths(gender):
148 | names = gender_names(gender)
149 | return [len(name) for name in names]
150 |
151 | def count_name_lengths():
152 | lengths = np.arange(15)
153 | boy_lengths = name_lengths_counter(boy)
154 | girl_lengths = name_lengths_counter(girl)
155 | boy_lengths_list = [boy_lengths[length] for length in lengths]
156 | girl_lengths_list = [girl_lengths[length] for length in lengths]
157 |
158 | boy_length_counts = name_lengths(boy)
159 | girl_length_counts = name_lengths(girl)
160 |
161 | print 'Boy length avg: %s' % np.mean(boy_length_counts)
162 | print 'Boy length std: %s' % np.std(boy_length_counts)
163 | print 'Girl length avg: %s' % np.mean(girl_length_counts)
164 | print 'Girl length std: %s' % np.std(girl_length_counts)
165 |
166 | #time to plot the bars
167 | fig, ax = plt.subplots()
168 |
169 | opacity = 0.4
170 | bar_width = 0.35
171 |
172 | rects1 = plt.bar(lengths, boy_lengths_list, bar_width,
173 | alpha=opacity,
174 | color='b',
175 | label='Boys')
176 |
177 | rects2 = plt.bar(lengths + bar_width, girl_lengths_list, bar_width,
178 | alpha=opacity,
179 | color='r',
180 | label='Girls')
181 |
182 | plt.xlabel('Lengths')
183 | plt.ylabel('Number of names of that length')
184 | plt.title('Lengths of boy and girl names')
185 | plt.legend()
186 |
187 | plt.tight_layout()
188 |
189 | plt.savefig('graphs/name_length_bar.png')
190 |
191 |
192 | def begin_end_names(gender, beginning_letter, ending_letter):
193 | names = gender_names(gender)
194 | matching_names = []
195 | for name in names:
196 | if name[0] == beginning_letter and name[-1] == ending_letter:
197 | matching_names.append(name)
198 | return matching_names
199 |
200 |
201 |
202 |
203 |
204 | def vowel_consonant_ending_proportion(gender):
205 | print "Vowel Consonant Ending Ratio for %ss" % gender
206 | names = gender_names(gender)
207 | vowel_enders = []
208 | consonant_enders = []
209 | for name in names:
210 | last_letter = name[-1]
211 | if last_letter in vowels:
212 | vowel_enders.append(name)
213 | else:
214 | consonant_enders.append(name)
215 |
216 | vowel_len = float(len(vowel_enders))
217 | consonant_len = float(len(consonant_enders))
218 | print vowel_len / (vowel_len + consonant_len)
219 |
220 | def count_vowels_consonants(gender, index):
221 | '''
222 | Gives counts for whether the letters at the indicies are vowels or consonants
223 | index = 1 for first letter, index = -1 for last letter.
224 | Other indicies work, but might cause error if index is longer than two since
225 | there are some two letter names!
226 | '''
227 | names = gender_names(gender)
228 | sizes = []
229 | cnt = Counter()
230 | for name in names:
231 | if name[index] in vowels:
232 | cnt['v'] += 1
233 | else:
234 | cnt['c'] += 1
235 | return cnt
236 |
237 | def print_percentages(gender, sizes, title):
238 | vowel_len = float(sizes[0])
239 | consonant_len = float(sizes[1])
240 | vowel_percentage = vowel_len / (vowel_len + consonant_len)
241 | consonant_percentage = consonant_len / (vowel_len + consonant_len)
242 | print title % gender
243 | print 'Vowel percentage: %s' % vowel_percentage
244 | print 'Consonant percentage: %s' % consonant_percentage
245 | print #for spacing
246 |
247 | def vowels_consonant_starts():
248 | '''
249 | Pie graph of the frequency of names that begin with vowels for both genders
250 | '''
251 | boy_counts = count_vowels_consonants(boy, 0)
252 | girl_counts = count_vowels_consonants(girl, 0)
253 |
254 |     #pie chart for the boy names first, then the girls
255 | title = "Percentage of %s names that start with vowels or consonants"
256 | labels = 'Vowels', 'Consonants'
257 | boy_fig, boy_ax = plt.subplots()
258 | sizes = [boy_counts['v'], boy_counts['c']]
259 | print_percentages(boy, sizes, title)
260 |
261 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
262 | plt.title(title % 'boy')
263 | plt.savefig('graphs/vowel_consonant_start_boy.png')
264 |
265 | sizes = [girl_counts['v'], girl_counts['c']]
266 | print_percentages(girl, sizes, title)
267 | girl_fig, girl_ax = plt.subplots()
268 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
269 | plt.title(title % 'girl')
270 |
271 | plt.savefig('graphs/vowel_consonant_start_girl.png')
272 |
273 | def vowels_consonant_ends():
274 | '''
275 |     Pie graph of the frequency of names that end with vowels for both genders
276 | '''
277 | boy_counts = count_vowels_consonants(boy, -1)
278 | girl_counts = count_vowels_consonants(girl, -1)
279 |
280 |     #pie chart for the boy names first, then the girls
281 |
282 | title = "Percentage of %s names that end with vowels or consonants"
283 | labels = 'Vowels', 'Consonants'
284 | boy_fig, boy_ax = plt.subplots()
285 |
286 | sizes = [boy_counts['v'], boy_counts['c']]
287 |     print_percentages(boy, sizes, title)
288 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
289 | plt.title(title % 'boy')
290 | plt.savefig('graphs/vowel_consonant_ends_boys.png')
291 |
292 | sizes = [girl_counts['v'], girl_counts['c']]
293 | print_percentages(girl, sizes, title)
294 | girl_fig, girl_ax = plt.subplots()
295 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
296 | plt.title(title % 'girl')
297 | plt.savefig('graphs/vowel_consonant_ends_girls.png')
298 |
299 | def vowel_endings():
300 | boy_counts = count_vowel_frequency(boy, -1)
301 | girl_counts = count_vowel_frequency(girl, -1)
302 |
303 |     #pie chart for the boy names first, then the girls
304 |
305 | title = "Percentage of vowels that %s names end with"
306 | labels = boy_counts.keys()
307 | boy_fig, boy_ax = plt.subplots()
308 | sizes = [boy_counts[vowel] for vowel in labels if vowel in boy_counts.keys()]
309 | print_percentages(boy, sizes, title)
310 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
311 | plt.title(title % 'boy')
312 | plt.savefig('graphs/vowel_endings_boys.png')
313 |
314 | girl_fig, girl_ax = plt.subplots()
315 | labels = girl_counts.keys()
316 | sizes = [girl_counts[vowel] for vowel in labels if vowel in girl_counts.keys()]
317 | print_percentages(girl, sizes, title)
318 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
319 | plt.title(title % 'girl')
320 | plt.savefig('graphs/vowel_endings_girls.png')
321 |
322 | def vowel_beginnings():
323 | boy_counts = count_vowel_frequency(boy, 0)
324 | girl_counts = count_vowel_frequency(girl, 0)
325 |
326 |     #pie chart for the boy names first, then the girls
327 |
328 | title = "Percentage of vowels that %s names begin with"
329 | labels = boy_counts.keys()
330 | boy_fig, boy_ax = plt.subplots()
331 | sizes = [boy_counts[vowel] for vowel in labels if vowel in boy_counts.keys()]
332 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
333 | plt.title(title % 'boy')
334 | plt.savefig('graphs/vowel_beginnings_boys.png')
335 |
336 | girl_fig, girl_ax = plt.subplots()
337 | labels = girl_counts.keys()
338 | sizes = [girl_counts[vowel] for vowel in labels if vowel in girl_counts.keys()]
339 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
340 | plt.title(title % 'girl')
341 | plt.savefig('graphs/vowel_beginnings_girls.png')
342 |
343 | def letter_frequency(index=0):
344 |
345 | boy_name_counter = Counter()
346 | for name in boy_names:
347 |         letter = name[index]
348 |         boy_name_counter[letter] += 1
349 |
350 | print boy_name_counter
351 |
352 | girl_name_counter = Counter()
353 | for name in girl_names:
354 |         letter = name[index]
355 |         girl_name_counter[letter] += 1
356 |
357 | print girl_name_counter
358 |
359 | boy_alphabet_count_list = []
360 | girl_alphabet_count_list = []
361 | for letter in alphabet:
362 | boy_alphabet_count_list.append(float(boy_name_counter[letter]))
363 | girl_alphabet_count_list.append(float(girl_name_counter[letter]))
364 |
365 | print boy_alphabet_count_list
366 | print girl_alphabet_count_list
367 |
368 | #time to plot the bars
369 | fig, ax = plt.subplots()
370 |
371 | opacity = 0.6
372 | bar_width = 0.35
373 |
374 | num_letters = np.arange(26)
375 | rects1 = plt.bar(num_letters, boy_alphabet_count_list, bar_width,
376 | align='center',
377 | alpha=opacity,
378 | color='b',
379 | label='Boys')
380 |
381 | rects2 = plt.bar(num_letters + bar_width, girl_alphabet_count_list, bar_width,
382 | align='center',
383 | alpha=0.8,
384 | color='palevioletred',
385 | label='Girls')
386 |
387 | plot_title = 'Number of names that begin with certain letters'
388 | xtick_pos = [let + (bar_width / 2) for let in num_letters]
389 | plt.xticks(xtick_pos, alphabet)
390 | plt.xlabel('Letters')
391 | plt.ylabel('Number of names that begin with letter')
392 | plt.title(plot_title)
393 | plt.legend()
394 | plt.tight_layout()
395 | #plt.show()
396 | plt.savefig('graphs/letter_frequency.png')
397 |
398 |
399 | def count_vowel_frequency(gender, index):
400 | names = gender_names(gender)
401 | sizes = []
402 | cnt = Counter()
403 | for name in names:
404 | index_letter = name[index]
405 | if index_letter in vowels:
406 | cnt[index_letter] += 1
407 | return cnt
408 |
409 | def common_names(gender):
410 | '''
411 | Gathers the letters that the names start with, and then count the combos of all the names that begin and end with the matching letters.
412 | '''
413 | begin_gender_counts = name_letter_begin_or_end(gender, index='beginning')
414 | end_gender_counts = name_letter_begin_or_end(gender, index='end')
415 |
416 | for bletter in begin_gender_counts:
417 | for eletter in end_gender_counts:
418 | match_names= begin_end_names(gender, bletter, eletter)
419 |
420 | for name in match_names:
421 | print name
422 |
423 |
424 | if __name__ == '__main__':
425 | #npr_solver(boy)
426 | #npr_solver(girl)
427 | #vowels_consonant_starts()
428 | #vowels_consonant_ends()
429 | #vowel_endings()
430 | #vowel_beginnings()
431 | #letter_frequency()
432 | #letter_frequency(index=0)
433 | #count_name_lengths()
434 | #replace_single_letter(first_letter='I', exchange_letter='Y')
435 | #replace_single_letter(first_letter='IE', exchange_letter='Y')
436 | #replace_single_letter(first_letter='EE', exchange_letter='Y')
437 | #replace_single_letter(first_letter='A', exchange_letter='Y')
438 | #replace_single_letter(first_letter='C', exchange_letter='K')
439 | #replace_single_letter(first_letter='CK', exchange_letter='K')
440 | #replace_single_letter(first_letter='HN', exchange_letter='N')
441 | #replace_single_letter(first_letter='G', exchange_letter='J')
442 | #rhyming_names(boy)
443 | #rhyming_names(girl)
444 | '''
445 | for index, l1 in enumerate(alphabet):
446 | for l2 in alphabet[index:]:
447 | print "Flip %s and %s" % (l1, l2)
448 | replace_single_letter(first_letter=l1, exchange_letter=l2, show_matches=True)
449 | for match in alphabet:
450 | print "Flip %s and %s" % ('K', match)
451 | replace_single_letter(first_letter='K', exchange_letter=match, show_matches=True)
452 | '''
453 | pass # in case you don't uncomment a test you want to run, we need correct syntax
454 |
--------------------------------------------------------------------------------
/naive_bayes/app.py:
--------------------------------------------------------------------------------
1 | from flask import jsonify, request, Flask
2 | from sklearn.externals import joblib
3 |
4 | print "Loading Pickled Pipeline"
5 | fitted_pipeline = joblib.load('classifier.pkl')
6 |
7 | app = Flask(__name__)
8 |
9 | @app.route('/', methods=['POST'])
10 | def predict():
11 | text = request.form.get('text')
12 | guess = fitted_pipeline.predict([text])[0] #pipeline returns array
13 | results = {"class": guess}
14 | return jsonify(results)
15 |
16 | if __name__ == '__main__':
17 | app.run()
18 |
19 |
--------------------------------------------------------------------------------
/naive_bayes/classify.py:
--------------------------------------------------------------------------------
1 | import json
2 | from sklearn.feature_extraction.text import CountVectorizer
3 | from sklearn.feature_extraction.text import TfidfTransformer
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from sklearn.feature_extraction.text import HashingVectorizer
6 | from pandas import DataFrame
7 | import numpy
8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
9 | from sklearn.pipeline import Pipeline
10 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
11 | from sklearn.cross_validation import KFold
12 | from sklearn.svm import SVC
13 |
14 |
15 | LABELS = []
16 | LABELS.append("baby")
17 | LABELS.append("tool")
18 | LABELS.append("home")
19 | LABELS.append("pet")
20 | LABELS.append("food")
21 | LABELS.append("automotive")
22 | LABELS.append("instant_video")
23 | LABELS.append("beauty")
24 | LABELS.append("cds_vinyl")
25 | LABELS.append("clothes")
26 | LABELS.append("digital_music")
27 | LABELS.append("cell_phones")
28 | LABELS.append("electronics")
29 | LABELS.append("kindle")
30 | LABELS.append("movies_tv")
31 | LABELS.append("instruments")
32 | LABELS.append("office")
33 | LABELS.append("patio")
34 | LABELS.append("health")
35 | LABELS.append("sports")
36 | LABELS.append("toys")
37 | LABELS.append("video_games")
38 | LABELS.append("books")
39 |
40 | def read_review_data(num_classes):
41 | print "Begin reading in data"
42 | labels = LABELS[:num_classes]
43 | reviews = []
44 | for label in labels:
45 | train_filename = "train_%s.json" % label
46 |         test_filename = "test_%s.json" % label
47 | filenames = [train_filename, test_filename]
48 | for filename in filenames:
49 | with open(filename, 'r') as f:
50 | for line in f:
51 | text = json.loads(line)["reviewText"]
52 | reviews.append({'text': text, 'class': label})
53 |
54 | data = DataFrame(reviews)
55 | data = data.reindex(numpy.random.permutation(data.index))
56 |
57 | #to evaluate the length of review
58 | data["word_count"] = [len(text.split(" ")) for text in data["text"]]
59 |
60 | NUM_TRAIN_SAMPLES = int(len(data) * 0.8)
61 |
62 | train_data = data[:NUM_TRAIN_SAMPLES]
63 | test_data = data[NUM_TRAIN_SAMPLES:]
64 | print "End reading in data"
65 |
66 | return (train_data, test_data, labels)
67 |
68 | def test_fitted_pipeline(fitted_pipeline, test_data, labels, description=""):
69 | actual = test_data['class'].values
70 | print "Predicting %s" % description
71 | predictions = fitted_pipeline.predict(test_data['text'].values)
72 | score = accuracy_score(actual, predictions)
73 | cmat = confusion_matrix(actual, predictions, labels)
74 | print
75 | print description or "Results"
76 | print score
77 | print labels
78 | print cmat
79 |
80 | def fit_pipeline(pipeline, train_data, description=""):
81 | print "Training %s Classifier" % description
82 | pipeline.fit(train_data['text'].values, train_data['class'].values)
83 | return pipeline
84 |
85 | def test_pipeline(pipeline, train_data, test_data, labels, description=""):
86 | fitted_pipeline = fit_pipeline(pipeline, train_data, description=description)
87 | test_fitted_pipeline(fitted_pipeline, test_data, labels, description=description)
88 |
89 | def evaluate_n_grams(num_classes=5):
90 | train_data, test_data, labels = read_review_data(num_classes)
91 | classifier = MultinomialNB()
92 |
93 | unigram_vectorizer = CountVectorizer(stop_words='english')
94 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
95 | trigram_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
96 | fourgram_vectorizer = CountVectorizer(ngram_range=(1, 4), stop_words='english')
97 |
98 |
99 | unigram_pipeline = Pipeline([
100 | ('count_vectorizer', unigram_vectorizer),
101 | ('classifier' , classifier)
102 | ])
103 |
104 | bigram_pipeline = Pipeline([
105 | ('count_vectorizer', bigram_vectorizer),
106 | ('classifier' , classifier)
107 | ])
108 |
109 | trigram_pipeline = Pipeline([
110 | ('count_vectorizer', trigram_vectorizer),
111 | ('classifier' , classifier)
112 | ])
113 |
114 | fourgram_pipeline = Pipeline([
115 | ('count_vectorizer', fourgram_vectorizer),
116 | ('classifier' , classifier)
117 | ])
118 |
119 |     test_pipeline(unigram_pipeline, train_data, test_data, labels, description="Unigrams")
120 |     test_pipeline(bigram_pipeline, train_data, test_data, labels, description="Bigrams")
121 |     test_pipeline(trigram_pipeline, train_data, test_data, labels, description="Trigrams")
122 |     test_pipeline(fourgram_pipeline, train_data, test_data, labels, description="Fourgrams")
123 |
124 | def evaluate_classifier_type(num_classes=5):
125 | train_data, test_data, labels = read_review_data(num_classes)
126 |
127 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
128 |
129 | mn_classifier = MultinomialNB(fit_prior=False)
130 | b_classifier = BernoulliNB()
131 |
132 | mn_pipeline = Pipeline([
133 | ('count_vectorizer', bigram_vectorizer),
134 | ('classifier' , mn_classifier)
135 | ])
136 |
137 | b_pipeline = Pipeline([
138 | ('count_vectorizer', bigram_vectorizer),
139 | ('classifier' , b_classifier)
140 | ])
141 |
142 | test_pipeline(mn_pipeline, train_data, test_data, labels, description="Multinomial")
143 | test_pipeline(b_pipeline, train_data, test_data, labels, description="Bernoulli")
144 |
145 | def evaluate_tfidf(num_classes=5):
146 | train_data, test_data, labels = read_review_data(num_classes)
147 | labels = labels[:num_classes]
148 | classifier = MultinomialNB()
149 |
150 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
151 | tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
152 | tfidf_transformer = TfidfTransformer()
153 |
154 | no_tfidf_pipeline = Pipeline([
155 | ('count_vectorizer', bigram_vectorizer),
156 | ('classifier' , classifier)
157 | ])
158 |
159 | tfidf_vectorizer_pipeline = Pipeline([
160 | ('count_vectorizer', tfidf_vectorizer),
161 | ('classifier' , classifier)
162 | ])
163 |
164 | tfidf_transformer_pipeline = Pipeline([
165 | ('count_vectorizer', bigram_vectorizer),
166 | ('tfidf_transformer' , tfidf_transformer),
167 | ('classifier' , classifier)
168 | ])
169 |
170 | test_pipeline(no_tfidf_pipeline, train_data, test_data, labels, description="No TFIDF")
171 |
172 | test_pipeline(tfidf_vectorizer_pipeline, train_data, test_data, labels, description="TFIDF Vectorizer")
173 |
174 | test_pipeline(tfidf_transformer_pipeline, train_data, test_data, labels, description="TFIDF Transformer")
175 |
176 | def evaluate_training_counts(train_data, test_data, num_classes=5):
177 | pass
178 |
179 | def evaluate_standard(num_classes=5):
180 | train_data, test_data, labels = read_review_data(num_classes)
181 |
182 | four_gram_vectorizer = CountVectorizer(ngram_range=(1, 4), stop_words='english')
183 | classifier = MultinomialNB(fit_prior=False)
184 |
185 | pipeline = Pipeline([
186 | ('count_vectorizer', four_gram_vectorizer),
187 | ('classifier' , classifier)
188 | ])
189 |
190 | test_pipeline(pipeline, train_data, test_data, labels, description="Standard")
191 |
192 | def evaluate_lengths(num_classes=5):
193 | train_data, test_data, labels = read_review_data(num_classes)
194 |
195 | vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
196 | classifier = MultinomialNB(fit_prior=False)
197 |
198 | pipeline = Pipeline([
199 | ('count_vectorizer', vectorizer),
200 | ('classifier' , classifier)
201 | ])
202 |
203 |     shortest_test_data = test_data[test_data["word_count"] < 20]
204 |     short_test_data = test_data[(test_data["word_count"] >= 20) & (test_data["word_count"] <= 50)]
205 |     med_test_data = test_data[(test_data["word_count"] > 50) & (test_data["word_count"] <= 100)]
206 |     long_test_data = test_data[test_data["word_count"] > 100]
207 |
208 | fitted_pipeline = fit_pipeline(pipeline, train_data)
209 |
210 |     test_fitted_pipeline(fitted_pipeline, test_data, labels, description="Standard")
211 |     print
212 |     print "Num shortest data: %s" % str(len(shortest_test_data))
213 |     test_fitted_pipeline(fitted_pipeline, shortest_test_data, labels, description="Under 20 Words")
214 |     print
215 |     print "Num short data: %s" % str(len(short_test_data))
216 |     test_fitted_pipeline(fitted_pipeline, short_test_data, labels, description="Between 20 and 50 Words")
217 |     print
218 |     print "Num medium data: %s" % str(len(med_test_data))
219 |     test_fitted_pipeline(fitted_pipeline, med_test_data, labels, description="Between 50 and 100 Words")
220 |     print
221 |     print "Num long data: %s" % str(len(long_test_data))
222 |     test_fitted_pipeline(fitted_pipeline, long_test_data, labels, description="Over 100 Words")
223 |
224 |
225 | def pickle_pipeline(pipeline, num_classes=5):
226 | train_data, test_data, labels = read_review_data(num_classes)
227 | from sklearn.externals import joblib
228 | fitted_pipeline = fit_pipeline(pipeline, train_data)
229 | print "Pickling Pipeline"
230 | joblib.dump(fitted_pipeline, 'classifier.pkl')
231 |
232 |
233 | def use_pickled_pipeline(num_classes=26):
234 | _, test_data, labels = read_review_data(num_classes)
235 | from sklearn.externals import joblib
236 | print "Loading Pickled Pipeline"
237 | fitted_pipeline = joblib.load('classifier.pkl')
238 |
239 | test_fitted_pipeline(fitted_pipeline, test_data, labels, description="From Pickle")
240 |
241 | pass
242 | '''
243 |
244 |
245 |
246 | import matplotlib.pyplot as plt
247 |
248 | bins = [10 * (i) for i in range(50)]
249 | percents = [0.5506607929515418, 0.8571428571428571, 0.89151434091246839, 0.92522522522522521, 0.92804878048780493, 0.9469924812030075, 0.95398230088495573, 0.9448568398727466, 0.95388502842703726, 0.95697329376854601, 0.96498719043552517, 0.96037735849056605, 0.96003996003996006, 0.96465222348916757, 0.96681096681096679, 0.95469798657718119, 0.94086021505376349, 0.94837476099426388, 0.95259593679458243, 0.95022624434389136, 0.94750000000000001, 0.96625766871165641, 0.96491228070175439, 0.96180555555555558, 0.98084291187739459, 0.96442687747035571, 1.0, 0.95979899497487442, 0.90217391304347827, 0.96575342465753422, 0.93442622950819676, 0.94244604316546765, 0.97058823529411764, 0.94444444444444442, 0.98019801980198018, 0.91752577319587625, 0.95652173913043481, 0.98913043478260865, 1.0, 0.93670886075949367, 0.93333333333333335, 0.94805194805194803, 0.9642857142857143, 1.0, 0.98148148148148151, 0.92500000000000004, 0.90697674418604646, 0.93877551020408168, 1.0, 1.0]
250 |
251 |
252 | '''
253 |
254 | '''
255 | import matplotlib.pyplot as plt
256 |
257 | plt.figure()
258 | plt.title('Correct Max Probabilities')
259 | plt.hist(correct, 100)
260 | plt.show()
261 | plt.figure()
262 | plt.title('Incorrect Max Probabilities')
263 | plt.hist(incorrect, 100)
264 | plt.show()
265 |
266 | #quit()
267 | '''
268 |
269 | '''
270 | count_vectorizer = CountVectorizer(min_df=1, stop_words='english')
271 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
272 | trigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
273 | fourgram_vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1, stop_words='english')
274 |
275 | tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
276 | #hashing_vectorizer = HashingVectorizer(n_features=100, non_negative=True)
277 | classifier = MultinomialNB()
278 | g_classifier = GaussianNB()
279 | b_classifier = BernoulliNB()
280 |
281 | pipeline = Pipeline([
282 | ('count_vectorizer', count_vectorizer),
283 | ('classifier' , classifier)
284 | ])
285 |
286 | bigram_pipeline = Pipeline([
287 | ('count_vectorizer', bigram_vectorizer),
288 | ('tfidf_transformer', TfidfTransformer()),
289 | ('classifier' , classifier)
290 | ])
291 |
292 | trigram_pipeline = Pipeline([
293 | ('count_vectorizer', trigram_vectorizer),
294 | ('classifier' , classifier)
295 | ])
296 |
297 | fourgram_pipeline = Pipeline([
298 | ('count_vectorizer', fourgram_vectorizer),
299 | ('classifier' , classifier)
300 | ])
301 |
302 |
303 | g_pipeline = Pipeline([
304 | ('count_vectorizer', bigram_vectorizer),
305 | ('classifier' , g_classifier)
306 | ])
307 |
308 | b_pipeline = Pipeline([
309 | ('count_vectorizer', bigram_vectorizer),
310 | ('classifier' , b_classifier)
311 | ])
312 |
313 |
314 | bigram_pipeline.fit(train_data['text'].values, train_data['class'].values)
315 |
316 |
317 | bigram_predictions = bigram_pipeline.predict(test_data['text'].values)
318 | bigram_score = accuracy_score(actual, bigram_predictions)
319 | bigram_cmat = confusion_matrix(actual, bigram_predictions, labels)
320 |
321 | incorrect_indicies = []
322 | correct_indicies = []
323 | for index, (act, pred) in enumerate(zip(actual, bigram_predictions)):
324 | if act != pred:
325 | incorrect_indicies.append(index)
326 | else:
327 | correct_indicies.append(index)
328 |
329 | incorrect = test_data.iloc[incorrect_indicies]
330 | correct = test_data.iloc[correct_indicies]
331 |
332 | for index, ic in incorrect.iterrows():
333 | text = ic["text"]
334 | c = ic["class"]
335 | probs = bigram_pipeline.predict_proba([text])[0]
336 | if max(probs) > 0.5:
337 | guessed = bigram_pipeline.predict([text])
338 | print text
339 | print "Actual: %s, Guessed: %s" % (c, guessed)
340 |
341 | import pdb;pdb.set_trace()
342 | asdf = bigram_pipeline.predict_proba(incorrect["text"].values)
343 | qwer = bigram_pipeline.predict_proba(correct["text"].values)
344 | zxcv = [float("%.3f" % max(arr)) for arr in qwer]
345 |
346 |
347 | print
348 | print bigram_score
349 | print labels
350 | print bigram_cmat
351 |
352 |
353 | #test_data = test_data[test_data["lengths"] > 1000]
354 |
355 | #train_data = train_data[train_data["lengths"] > 100]
356 |
357 | quit()
358 |
359 | bins = [10 * (i) for i in range(50)]
360 | scores = []
361 | for b in bins:
362 | td = test_data[(test_data["lengths"] > b) & (test_data["lengths"] < (b+10)) ]
363 | actual = td['class'].values
364 | predictions = pipeline.predict(td['text'].values)
365 |
366 | score = accuracy_score(actual, predictions)
367 | scores.append(score)
368 | #cmat = confusion_matrix(actual, predictions, labels)
369 | print "Word Count: %s, Doc Count: %s, Score: %s" % (str(b), str(len(td)), '%.5f' % score)
370 |
371 | print bins
372 | print scores
373 |
374 |
375 | '''
376 |
377 | '''
378 | import matplotlib.pyplot as plt
379 |
380 | incorrect_indicies = []
381 | correct_indicies = []
382 | for index, (act, pred) in enumerate(zip(actual, predictions)):
383 | if act != pred:
384 | incorrect_indicies.append(index)
385 | else:
386 | correct_indicies.append(index)
387 |
388 | incorrect = test_data.iloc[incorrect_indicies]
389 | correct = test_data.iloc[correct_indicies]
390 |
391 | print len(correct)
392 | print len(incorrect)
393 |
394 | #[len(correct[correct.lengths < bins[i+1] & correct.lengths > bins[i]]) for i in range(len(bins)-1)]
395 | bins = [50 * (i) for i in range(100)]
396 | correct_counts = [len(correct[correct.lengths < i]) for i in bins]
397 | incorrect_counts = [len(incorrect[incorrect.lengths < i]) for i in bins]
398 |
399 | print "Correct"
400 | ccounts = [0 if index == 0 else icc - correct_counts[index-1] for index, icc in enumerate(correct_counts)]
401 | print "Incorrect"
402 | iccounts = [0 if index == 0 else icc - incorrect_counts[index-1] for index, icc in enumerate(incorrect_counts)]
403 | plt.figure()
404 | plt.subplot()
405 | plt.plot(bins, ccounts)
406 | plt.subplot()
407 | plt.plot(bins, iccounts)
408 | plt.show()
409 |
410 | ["{0:.0f}%".format(c / float(sum(ccounts))) for c in ccounts]
411 |
412 | import pdb;pdb.set_trace()
413 |
414 |
415 | fig = plt.figure()
416 | incorrect["lengths"].plot.hist(bins=50)
417 | fig.suptitle('Incorrect text lengths', fontsize=14, fontweight='bold')
418 | plt.show()
419 |
420 | fig = plt.figure()
421 | correct["lengths"].plot.hist(bins=50)
422 | fig.suptitle('Incorrect text lengths', fontsize=14, fontweight='bold')
423 | plt.show()
424 |
425 | confusion = numpy.array([[0 for i in range(len(labels))] for y in range(len(labels))])
426 |
427 | k_fold = KFold(n=len(data), n_folds=6)
428 | scores = []
429 |
430 | for train_indices, test_indices in k_fold:
431 | train_text = data.iloc[train_indices]['text'].values
432 | train_y = data.iloc[train_indices]['class'].values
433 |
434 | test_text = data.iloc[test_indices]['text'].values
435 | test_y = data.iloc[test_indices]['class'].values
436 |
437 | pipeline.fit(train_text, train_y)
438 | predictions = pipeline.predict(test_text)
439 |
440 | confusion += confusion_matrix(test_y, predictions)
441 | score = accuracy_score(test_y, predictions)
442 | scores.append(score)
443 | print numpy.average(scores)
444 | print labels
445 | print confusion
446 |
447 | '''
448 |
449 | if __name__ == "__main__":
450 |
451 | #evaluate_standard(num_classes=26)
452 | #evaluate_classifier_type(num_classes=26)
453 |     #evaluate_n_grams(num_classes=3)
454 | #evaluate_tfidf(num_classes=5)
455 | #evaluate_lengths(num_classes=10)
456 |
457 | #vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
458 | #classifier = MultinomialNB(fit_prior=False)
459 |
460 | #pipeline = Pipeline([
461 | # ('count_vectorizer', vectorizer),
462 | # ('classifier' , classifier)
463 | #])
464 | #pickle_pipeline(pipeline, num_classes=26)
465 | use_pickled_pipeline()
466 | pass
467 |
468 |
--------------------------------------------------------------------------------
/naive_bayes/naive_bayes.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | from collections import Counter
4 | import string
5 | from nltk.corpus import stopwords
6 | import nltk
7 |
8 | STOP_WORDS = set(stopwords.words('english'))
9 | STOP_WORDS.add('')
10 |
11 | def print_confusion_matrix(matrix, class_labels):
12 | lines = ["" for i in range(len(class_labels)+1)]
13 | for index, c in enumerate(class_labels):
14 | lines[0] += "\t"
15 | lines[0] += c
16 | lines[index+1] += c
17 | for index, result in enumerate(matrix):
18 | for amount in result:
19 | lines[index+1] += "\t"
20 | lines[index+1] += str(amount)
21 | for line in lines:
22 | print line
23 |
24 | def initialize_confusion_matrix(num_labels):
25 | return [[0 for i in range(num_labels)] for y in range(num_labels)]
26 |
27 | def read_reviews(filename):
28 | reviews = []
29 | with open(filename, 'r') as f:
30 | for line in f:
31 | reviews.append(json.loads(line))
32 | return reviews
33 |
34 | def review_texts_from_reviews(reviews):
35 | return [review["reviewText"] for review in reviews]
36 |
37 | def get_review_texts(filename):
38 | reviews = read_reviews(filename)
39 | return review_texts_from_reviews(reviews)
40 |
41 | def clean_review(review):
42 | exclude = set(string.punctuation)
43 | review = ''.join(ch for ch in review if ch not in exclude)
44 | split_sentence = review.lower().split(" ")
45 | clean = [word for word in split_sentence if word not in STOP_WORDS]
46 | return clean
47 |
48 | def counters_from_file(filename):
49 | reviews = read_reviews(filename)
50 | texts = [review["reviewText"] for review in reviews]
51 | tokens = [clean_review(review_text) for review_text in texts]
52 | flattened_tokens = [val for sublist in tokens for val in sublist]
53 | counter = Counter(flattened_tokens)
54 | return counter
55 |
56 | def line_count_from_file(filename):
57 | return sum(1 for line in open(filename))
58 |
59 | def naive_bayes(class_labels, use_nltk=False):
60 |     if use_nltk:
61 |         confusion_matrix = naive_bayes_nltk(class_labels)
62 |     else:
63 |         confusion_matrix = naive_bayes_self(class_labels)
64 |     return confusion_matrix
65 |
66 | def conditional_prob(word, counters, total_vocab_count):
67 | word_count = counters[word]
68 | class_total_word_count = sum(counters.values())
69 | cond_prob = float((word_count + 1)) / (class_total_word_count + total_vocab_count)
70 | return cond_prob
71 |
72 | def naive_bayes_self(class_labels):
73 | counters = []
74 | doc_counts = []
75 | for label in class_labels:
76 | filename = "train_%s.json" % label
77 | doc_counts.append(line_count_from_file(filename))
78 | counter = counters_from_file(filename)
79 | counters.append(counter)
80 |
81 | combined_bag = Counter()
82 | for counter in counters:
83 | combined_bag += counter
84 | combined_vocab_count = len(combined_bag.keys())
85 |
86 | probabilities = [float(doc_count) / sum(doc_counts) for doc_count in doc_counts]
87 | correct = 0
88 | incorrect = 0
89 |     confusion_matrix = initialize_confusion_matrix(len(class_labels))
90 |
91 | for index, class_name in enumerate(class_labels):
92 | filename = "test_%s.json" % class_name
93 | texts = get_review_texts(filename)
94 | for text in texts:
95 | tokens = clean_review(text)
96 | scores = []
97 | for cindex, bag in enumerate(counters): #for each class
98 |                 score = math.log(probabilities[cindex])  #log of the class prior
99 |                 for word in tokens:
100 |                     #for each word, we need the probability of that word given the class / bag
101 | cond_prob = conditional_prob(word, bag, combined_vocab_count)
102 | score += math.log(cond_prob)
103 | scores.append(score)
104 | max_index, max_value = max(enumerate(scores), key=lambda p: p[1])
105 | confusion_matrix[index][max_index] += 1
106 |
107 | if index == max_index:
108 | correct += 1
109 | else:
110 | incorrect += 1
111 |
112 | print (correct / float(correct + incorrect))
113 | return confusion_matrix
114 |
115 | def naive_bayes_nltk(class_labels):
116 | #note, training set needs to be in form of
117 | #train_set = [
118 | #({'I': 3, 'like': 1, 'this': 1, 'product': 2}, 'class_name_1')
119 | #({'This': 2, 'is': 1, 'really': 1, 'great': 2}, 'class_name_1')
120 | #...
121 | #({'Big': 1, 'fan': 1, 'of': 1, 'this': 1}, 'class_name_X')
122 | #]
123 | train_set = []
124 | for class_name in class_labels:
125 | filename = "train_%s.json" % class_name
126 | texts = get_review_texts(filename)
127 | for text in texts:
128 | tokens = clean_review(text)
129 | counter = Counter(tokens)
130 | train_set.append((dict(counter), class_name))
131 |
132 | classifier = nltk.NaiveBayesClassifier.train(train_set)
133 |
134 | correct = 0
135 | incorrect = 0
136 |     confusion_matrix = initialize_confusion_matrix(len(class_labels))
137 |
138 | for index, class_name in enumerate(class_labels):
139 | filename = "test_%s.json" % class_name
140 | reviews = read_reviews(filename)
141 | texts = [review["reviewText"] for review in reviews]
142 | for text in texts:
143 | tokens = clean_review(text)
144 | counter = dict(Counter(tokens))
145 | guess = classifier.classify(counter)
146 | lindex = class_labels.index(guess)
147 | confusion_matrix[index][lindex] += 1
148 |
149 | if guess == class_name:
150 | correct += 1
151 | else:
152 | incorrect += 1
153 |
154 | print (correct / float(correct + incorrect))
155 | classifier.show_most_informative_features()
156 | return confusion_matrix
157 |
158 | if __name__ == "__main__":
159 |
160 | class_labels = ['baby', 'tool']
161 |
162 |     confusion_matrix = naive_bayes(class_labels)  #or naive_bayes(class_labels, use_nltk=True)
163 | print_confusion_matrix(confusion_matrix, class_labels)
164 |
--------------------------------------------------------------------------------
/naive_bayes/sample.py:
--------------------------------------------------------------------------------
1 | import json
2 | import linecache
3 | import random
4 |
5 | NUM_TOTAL_SAMPLES = 10000
6 | NUM_TRAIN_SAMPLES = 8000
7 | NUM_TEST_SAMPLES = 2000
8 |
9 | apps_count = 752937
10 | apps_filename = "reviews_Apps_for_Android_5.json"
11 |
12 | automotive_count = 20473
13 | automotive_filename = "reviews_Automotive_5.json"
14 |
15 | baby_count = 160792
16 | baby_filename = "reviews_Baby_5.json"
17 |
18 | books_count = 8898041
19 | books_filename = "reviews_Books_5.json"
20 |
21 | cell_phones_count = 194439
22 | cell_phones_filename = "reviews_Cell_Phones_and_Accessories_5.json"
23 |
24 | tool_count = 134476
25 | tool_filename = "reviews_Tools_and_Home_Improvement_5.json"
26 |
27 | food_count = 151254
28 | food_filename = "reviews_Grocery_and_Gourmet_Food_5.json"
29 |
30 | pet_count = 157836
31 | pet_filename = "reviews_Pet_Supplies_5.json"
32 |
33 | home_count = 551682
34 | home_filename = "reviews_Home_and_Kitchen_5.json"
35 |
36 | automotive_count = 20473
37 | automotive = "reviews_Automotive_5.json"
38 |
39 | instant_video_count = 37126
40 | instant_video_filename = "reviews_Amazon_Instant_Video_5.json"
41 |
42 | beauty_count = 198502
43 | beauty_filename = "reviews_Beauty_5.json"
44 |
45 | cds_vinyl_count = 1097592
46 | cds_vinyl_filename = "reviews_CDs_and_Vinyl_5.json"
47 |
48 | health_count = 346355
49 | health_filename = "reviews_Health_and_Personal_Care_5.json"
50 |
51 | clothes_count = 278677
52 | clothes_filename = "reviews_Clothing_Shoes_and_Jewelry_5.json"
53 |
54 | digital_music_count = 64706
55 | digital_music_filename = "reviews_Digital_Music_5.json"
56 |
57 | electronics_count = 1689188
58 | electronics_filename = "reviews_Electronics_5.json"
59 |
60 | kindle_count = 982619
61 | kindle_filename = "reviews_Kindle_Store_5.json"
62 |
63 | movies_tv_count = 1697533
64 | movies_tv_filename = "reviews_Movies_and_TV_5.json"
65 |
66 | instruments_count = 10261
67 | instruments_filename = "reviews_Musical_Instruments_5.json"
68 |
69 | office_count = 53258
70 | office_filename = "reviews_Office_Products_5.json"
71 |
72 | patio_count = 13272
73 | patio_filename = "reviews_Patio_Lawn_and_Garden_5.json"
74 |
75 | sports_count = 296337
76 | sports_filename = "reviews_Sports_and_Outdoors_5.json"
77 |
78 | toys_count = 167597
79 | toys_filename = "reviews_Toys_and_Games_5.json"
80 |
81 | video_games_count = 231780
82 | video_games_filename = "reviews_Video_Games_5.json"
83 |
84 | infos = []
85 | infos.append({"class_name": "apps", "count": apps_count, "filename": apps_filename})
86 | '''
87 | infos.append({"class_name": "baby", "count": baby_count, "filename": baby_filename})
88 | infos.append({"class_name": "tool", "count": tool_count, "filename": tool_filename})
89 | infos.append({"class_name": "food", "count": food_count, "filename": food_filename})
90 | infos.append({"class_name": "pet", "count": pet_count, "filename": pet_filename})
91 | infos.append({"class_name": "home", "count": home_count, "filename": home_filename})
92 | infos.append({"class_name": "automotive", "count": automotive_count, "filename": automotive_filename})
93 | infos.append({"class_name": "instant_video", "count": instant_video_count, "filename": instant_video_filename})
94 | infos.append({"class_name": "beauty", "count": beauty_count, "filename": beauty_filename})
95 | infos.append({"class_name": "cds_vinyl", "count": cds_vinyl_count, "filename": cds_vinyl_filename})
96 | infos.append({"class_name": "clothes", "count": clothes_count, "filename": clothes_filename})
97 | infos.append({"class_name": "digital_music", "count": digital_music_count, "filename": digital_music_filename})
98 | infos.append({"class_name": "cell_phones", "count": cell_phones_count, "filename": cell_phones_filename})
99 | infos.append({"class_name": "electronics", "count": electronics_count, "filename": electronics_filename})
100 | infos.append({"class_name": "kindle", "count": kindle_count, "filename": kindle_filename})
101 | infos.append({"class_name": "movies_tv", "count": movies_tv_count, "filename": movies_tv_filename})
102 | infos.append({"class_name": "instruments", "count": instruments_count, "filename": instruments_filename})
103 | infos.append({"class_name": "office", "count": office_count, "filename": office_filename})
104 | infos.append({"class_name": "patio", "count": patio_count, "filename": patio_filename})
105 | infos.append({"class_name": "health", "count": health_count, "filename": health_filename})
106 | infos.append({"class_name": "sports", "count": sports_count, "filename": sports_filename})
107 | infos.append({"class_name": "toys", "count": toys_count, "filename": toys_filename})
108 | infos.append({"class_name": "video_games", "count": video_games_count, "filename": video_games_filename})
109 | infos.append({"class_name": "books", "count": books_count, "filename": books_filename})
110 | '''
111 |
112 | for info in infos:
113 | filename = "data/%s" % info["filename"]
114 | count = info["count"]
115 | class_name = info["class_name"]
116 | train_filename = "train_%s.json" % class_name
117 | test_filename = "test_%s.json" % class_name
118 |
119 | print class_name
120 |
121 | all_lines = random.sample(range(1,count), NUM_TOTAL_SAMPLES)
122 | train_lines = set(all_lines[0:NUM_TRAIN_SAMPLES])
123 | test_lines = set(all_lines[NUM_TRAIN_SAMPLES:])
124 |
125 | train_reviews = [eval(linecache.getline(filename, i)) for i in train_lines]
126 | test_reviews = [eval(linecache.getline(filename, i)) for i in test_lines]
127 |
128 | with open(train_filename, 'w') as f:
129 | for review in train_reviews:
130 | f.write(json.dumps(review))
131 | f.write('\n')
132 |
133 | with open(test_filename, 'w') as f:
134 | for review in test_reviews:
135 | f.write(json.dumps(review))
136 | f.write('\n')
137 |
138 |
--------------------------------------------------------------------------------
/scraping/data_scrapy.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | import scrapy
4 | from scrapy.selector import Selector
5 |
6 | class DataSpider(scrapy.Spider):
7 | name = "data"
8 | start_urls = [
9 | 'https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/'
10 | ]
11 |
12 | desired_tags = (u'p', u'h1', u'h3', u'pre')
13 | text = {}
14 |
15 | def words_from_tags(self, tag, response):
16 | total = []
17 | div = response.xpath("//div[contains(@class, 'entry-content')]")
18 | for para in div.xpath(".//%s" % tag):
19 | combined = []
20 | for words in para.xpath('./descendant-or-self::*/text()'):
21 | combined.append(words.extract())
22 | total.append(' '.join(combined))
23 | return total
24 |
25 | def parse(self, response):
26 | selector = Selector(response=response)
27 | for tag in self.desired_tags:
28 | self.text[tag] = self.words_from_tags(tag, response)
29 | helpers.write_data('scrapy', self.text)
30 | yield self.text #how scrapy returns the json object you created
31 |
--------------------------------------------------------------------------------
/scraping/helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | def write_html(lib_name, page):
4 | if not os.path.exists("pages"):
5 | os.makedirs("pages")
6 | file_name = "pages/" + lib_name + "_page.html"
7 | with open(file_name, 'w') as f:
8 | f.write(page)
9 |
10 | def write_data(lib, text_array):
11 | if not os.path.exists("texts"):
12 | os.makedirs("texts")
13 | lib_dir = "texts/" + lib
14 | if not os.path.exists(lib_dir):
15 | os.makedirs(lib_dir)
16 | for key, values in text_array.iteritems():
17 | filename = lib_dir + '/' + key + '.txt'
18 | with open(filename, 'w') as f:
19 | for value in values:
20 | if value is not None:
21 | f.write(value.encode('UTF-8') + '\n')
22 |
--------------------------------------------------------------------------------
/scraping/page.html:
--------------------------------------------------------------------------------
14 | General Tips for Web Scraping with Python | Big-Ish Data
15 |
16 |
17 |
20 |
21 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
55 |
59 |
73 |
74 |
75 |
76 |
79 |
80 |
85 |
86 |
87 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
198 |
199 |
200 |
The great majority of the projects about machine learning or data analysis I write about here on Bigish-Data have an initial step of scraping data from websites. I also get a bunch of contact emails asking me either for the data I've scraped myself or for help getting the code to work for themselves. Because of that, I figured I should write something here about the process of web scraping!
201 |
There are plenty of other things to talk about when scraping, such as specifics on how to grab the data from a particular site, which Python libraries to use and how to use them, how to write code that scrapes the data as a daily job, where exactly to look to figure out how to get the data from random sites, etc. But since there are tons of other specific tutorials online, I'm going to talk about overall thoughts on how to scrape. There are three parts of this post – how to grab the data, how to save the data, and how to be nice.
202 |
As is the case with everything, programming-wise, if you’re looking to learn scraping, you can’t just read tutorials and think to yourself that you know how to program. Pick a project, practice grabbing the data, and then write a blog post about what you learned .
203 |
There definitely are tons of different thoughts on scraping, but these are the ones that I've learned from doing it for a while. If you have questions, comments, or want to call me out, feel free to comment or get in contact!
204 |
Grabbing the Data
205 |
The first step for scraping data from websites is to figure out where the sites keep their data, and what method they use to display the data in the browser. For this part of your project, I'll suggest writing it in a file named gather.py, which should perform all these tasks.
206 |
207 |
That being said, there are a few things you'll need to look for to see how to most easily get the data.
208 |
Check if the site has an API First
209 |
A ton of sites with interesting data have APIs for programmers to grab the data and write posts about the interesting-ness of the site. Genius does this very nicely, except for the song lyrics of course.
210 |
And also if the site has an API, that means that they’re totally alright with programmers using their data, though pretty much every site doesn’t allow you to use its data to make money. Read their requirements and rules for using the site’s data, and if your project is allowed, API is the way to go.
211 |
Figure out the URLs of all the data
212 |
If there is no API, that means you’re going to have to figure out the urls where the site displays all the data you need.
213 |
A common pattern you'll see is that the data is displayed at URLs that use IDs for the objects. If you've done web development in something like Rails, you'll know exactly how that works. In this case, there is probably an index page that has links to all the different pages you're trying to scrape, so you'll have two scraping steps. And like I've said, each site is different, but just know that these are possible requirements to get all the data you want.
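A minimal sketch of how that can look, assuming a hypothetical site with an index page of links and sequential object IDs (the URL and the class name below are made up):

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://example.com"  #hypothetical site

def item_urls_from_index(index_url):
    #grab the index page and pull out the link to each object's page
    page = requests.get(index_url).text
    html = BeautifulSoup(page, 'html.parser')
    links = html.find_all('a', class_='item-link')  #made-up class name
    return [BASE_URL + link['href'] for link in links]

def item_urls_from_ids(id_range):
    #if the site just uses sequential IDs, the URLs can be built directly
    return ["%s/items/%d" % (BASE_URL, item_id) for item_id in id_range]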
214 |
Check JSON loading of data
215 |
If the site doesn’t have an API and you’re still going to want the data, check to see if the page that shows the data you’re looking for is by using JSON. If the page loads and it takes a second or a flash for the text to show up, it’s probably using that JSON.
216 |
From there, right click the web page and click “Inspect” on Chrome to get the Developer Tools window to open, reload the web page, and check the Sources tab to see pages that end in .json. You’ll be able to see the URL it came from, then open a new tab and paste that URL and you’ll be able to see the JSON with your data!
217 |
Here's a quick example of how stats.nba.com generates its pages. If you just look at the HTML returned, you'll see that it's only AngularJS, meaning you can't use the HTML to scrape the data. You'll need to find the JS URL that loads that data.
218 |
219 |
Looking at the Network tab, I find a specific file being requested over the network.
220 |
Then, by reloading the page and checking the files under the Network tab, I find the url that generates the data for the page. As you can see, it’s just a JS variable that has all the data for the players.
221 |
222 |
223 |
I won’t list the URL specifically here, but there are ways to change it to grab the data that you’re looking for.
224 |
Fall back to HTML scraping
225 |
If the site you’re looking for data from doesn’t have an API or use JSON to load the data, you’re going to fall back to grabbing the HTML pages. Which is the only technique that people think of when imagining web scraping!
226 |
Like the JSON data, you’re going to have to use the Inspect feature of Chrome’s development tools, but in this case right click on the text that you’re trying to grab and analyze the classes and ids in order to grab that data.
227 |
For example, if you’re looking to scrape a WordPress blog to do something like sentiment analysis of the posts, you’ll want to do something like this:
228 |
229 |
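A rough sketch of that, along the lines of the entry-content handling in data_scrapy.py in this repo:

import requests
from bs4 import BeautifulSoup

url = "https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/"
page = requests.get(url).text
html = BeautifulSoup(page, 'html.parser')

#WordPress keeps the body of a post in a div with class "entry-content"
post = html.find('div', class_='entry-content')
paragraphs = [p.get_text() for p in post.find_all('p')]
text = '\n'.join(paragraphs)
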
As for other sites, I won't go into exactly how that's done because the classes and ids vary, but odds are the page will be structured similarly, with specific classes and ids for the data part of the page. Practice HTML scraping on a couple of sites and you'll see how that part works.
230 |
Saving the Data
231 |
After you have the data saved using gather.py, you'll need to write code that pulls the information you want out of it. In case you didn't guess, a good name for that file is scrape.py.
232 |
With this file, you'll want to write the code that takes what you saved, grabs the data out of it, and structures it. How you save that data also depends on the type of scraping job you're writing.
233 |
CSV is probably a fine type of database initially
234 |
There are two different types of scraping projects. The first is grabbing data that is consistent and doesn't change much over time, like the PGA Tour stats I've scraped, which change week by week for the current year but obviously don't change when grabbing stats for every year in the past. Another example of this is getting lyrics from Genius. Lyrics don't change, and if you're looking for other information about the songs, that doesn't change much over time either.
235 |
If you’re getting this kind of data, don’t worry about setting up a DB to save the data . All types of this have a limited amount and frankly, test files are also quick to analyze the data.
236 |
Database is useful if you have data that keeps coming
237 |
On the other hand, if the data you’re looking to scrape is updated continuously, you’re probably going to want a DB to store the data, especially if you have a service (Heroku, Amazon, etc.) that runs your scraping code at certain times.
238 |
Another use for the DB is if you’re looking to scrape the data and then make a website that displays the data. Something like a script that checks Reddit comments to see how many Amazon products are mentioned and then displays them online.
239 |
And obviously the benefit of storing the data in a database rather than local files is that querying and comparing the data you scraped is much easier than having to load all your files into variables and then analyze them. Like everything I've mentioned here, the right method depends on the site, the data, and the information you're trying to gather.
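A minimal sketch of that with sqlite3 from the standard library (the table and columns are made up):

import sqlite3

conn = sqlite3.connect('stats.db')
conn.execute("CREATE TABLE IF NOT EXISTS stats (player TEXT, year INTEGER, avg_distance REAL)")
conn.execute("INSERT INTO stats VALUES (?, ?, ?)", ('Example Player', 2016, 290.4))
conn.commit()

#comparing scraped data becomes a single query instead of loading files into variables
rows = conn.execute("SELECT player, avg_distance FROM stats WHERE year = 2016").fetchall()
conn.close()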
240 |
Be Nice
241 |
Scrape with a request header that has your name and email so the company knows it’s you.
242 |
Some sites will get mad at you if you’re scraping their data, and even when the sites aren’t “nice”, you don’t want to do something “illegal”. An example of a way to identify yourself:
243 |
import requests
244 |
headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com',
           'from': 'contact@bigishdata.com'}
html = requests.get(url, headers=headers)
248 |
You can look up other options for the request headers, but make sure they’re consistent and that people who look at their server logs know who you are, just in case they want to get in contact.
249 |
Make sure you don’t keep hitting the servers!!!!
250 |
When you’re writing and running gather.py, make sure you’re testing it in a way that doesn’t continuously hit the servers to gather the data! That goes for both the JSON and the HTML cases. As for APIs, you’ll also want to make sure you’re not hitting the endpoints time and time again, especially since they track who’s calling them and most only allow a certain number of requests per time period.
251 |
Then when you’re running scrape.py, don’t hit their servers at all. That script should only deal with the data you already saved from their site.
252 |
Basically, the only time you should continuously hit their servers is when you’re running your final code that gets and saves the data files from the site.
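One simple way to keep yourself honest is to save every page to a file the first time you request it and check for that file before hitting their servers again, something like this sketch (get_page is just a made-up helper name):

import os
import requests

headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com'}

def get_page(url, filename):
    #only hit their servers if we haven't saved this page yet
    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            return f.read()
    page = requests.get(url, headers=headers)
    html = page.text.encode('UTF-8')
    with open(filename, 'w') as f:
        f.write(html)
    return html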
253 |
Gevent
254 |
Now if you need to scrape data from a bunch of different web pages, Gevent is the Python library to use for running request jobs concurrently, so you’ll be able to hit the API, grab the JSON, or grab the HTML pages more quickly. For the most part, the slowest part of the code is hitting their servers and then waiting for the file to be returned.
255 |
import gevent
from gevent import monkey
monkey.patch_all()
... #set the urls that you'll get the data from
jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
gevent.joinall(jobs)
261 |
Again, as long as you’re not going so fast that you hammer their servers by asking for thousands of pages at once, feel free to use Gevent, especially since most sites are handling far more than 50 requests at a time anyway.
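If you want to be extra careful, gevent’s Pool lets you cap how many requests are in flight at once; gather_pages and url_filenames here are the same as in the snippet above:

import gevent
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

pool = Pool(10) #at most 10 requests running at a time
jobs = [pool.spawn(gather_pages, url, filename) for url, filename in url_filenames]
gevent.joinall(jobs)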
262 |
Practice, practice, practice
263 |
With all that said, and as is the case with everything, if you want to web scrape, you gotta practice. Reading tutorial after tutorial is interesting and does teach you things, but if you want to learn, write the code yourself and search for the tutorials that help solve your bugs.
264 |
And remember, be nice when grabbing the data.
265 |
--------------------------------------------------------------------------------
/scraping/requesting_html.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | url = "https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/"
4 | params = {} #used for ? values
5 | headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com'}
6 |
7 |
8 | import urllib2
9 | import urllib
10 | data = urllib.urlencode(params) if params else None #data must be None to keep this a GET request
11 | req = urllib2.Request(url, data, headers)
12 | fstring = urllib2.urlopen(req).read()
13 | helpers.write_html('urllib2', fstring)
14 |
15 |
16 | import requests
17 | page = requests.get(url, headers=headers)
18 | fstring = page.text
19 | helpers.write_html('requests', fstring.encode('UTF-8'))
20 |
21 |
22 | import httplib
23 | #note that the url here is split into the base and the path
24 | conn = httplib.HTTPConnection("bigishdata.com")
25 | conn.request("GET", "/2017/05/11/general-tips-for-web-scraping-with-python/")
26 | response = conn.getresponse()
27 | helpers.write_html('httplib', response.read())
28 | conn.close()
29 |
--------------------------------------------------------------------------------
/scraping/requirements.txt:
--------------------------------------------------------------------------------
1 | PyDispatcher==2.0.5
2 | Scrapy==1.4.0
3 | Twisted==17.1.0
4 | beautifulsoup4==4.6.0
5 | certifi==2017.4.17
6 | chardet==3.0.3
7 | cssselect==1.0.1
8 | idna==2.5
9 | lxml==3.7.3
10 | parsel==1.2.0
11 | pyOpenSSL==17.0.0
12 | queuelib==1.4.2
13 | requests==2.17.3
14 | service-identity==17.0.0
15 | six==1.10.0
16 | urllib3==1.21.1
17 | virtualenv==15.1.0
18 | w3lib==1.17.0
19 | wsgiref==0.1.2
20 |
--------------------------------------------------------------------------------
/scraping/scraping_html.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | with open('page.html', 'r') as f:
4 | page_string = f.read()
5 |
6 |
7 | ##
8 | ## BeautifulSoup
9 | ##
10 | from bs4 import BeautifulSoup as bs
11 | soup = bs(page_string, "html.parser")
12 | article = soup.find('div', {'class' : 'entry-content'})
13 |
14 | text = {}
15 | text['p'] = []
16 | text['h1'] = []
17 | text['h3'] = []
18 | text['pre'] = []
19 | text['imgsrc'] = []
20 | for tag in article.contents:
21 | #multiple if statements here to make it easier to read
22 | if tag is not None and tag.name is not None:
23 | if tag.name == "p":
24 | text['p'].append(tag.text)
25 | elif tag.name == 'h1':
26 | text['h1'].append(tag.text)
27 | elif tag.name == 'h3':
28 | text['h3'].append(tag.text)
29 | elif tag.name == 'pre':
30 | text['pre'].append(tag.text)
31 | for tag in article.findAll('img'):
32 | text['imgsrc'].append(tag['src'])
33 | helpers.write_data('bs', text)
34 |
35 | ##
36 | ## LXML
37 | ##
38 | import lxml.html
39 | page = lxml.html.fromstring(page_string)
40 | post = page.find_class('entry-content')[0] #0 since only one tag with that class
41 |
42 | text = {}
43 | text['p'] = []
44 | text['h1'] = []
45 | text['h3'] = []
46 | text['pre'] = []
47 | text['imgsrc'] = []
48 | #text_content is needed to get all of the text within the tag, not just the text on the top level
49 | for tag in post.findall('p'):
50 | text['p'].append(tag.text_content())
51 | for img in tag.findall('img'): #images in paragraphs, so need to check here
52 | text['imgsrc'].append(img.attrib['src'])
53 | for tag in post.findall('h1'):
54 | text['h1'].append(tag.text_content())
55 | for tag in post.findall('h3'):
56 | text['h3'].append(tag.text_content())
57 | for tag in post.findall('pre'):
58 | text['pre'].append(tag.text_content())
59 | helpers.write_data('lxml', text)
60 |
61 |
62 |
63 | ##
64 | ## HTMLParser
65 | ##
66 | from HTMLParser import HTMLParser
67 | import urllib
68 |
69 | desired_tags = (u'p', u'h1', u'h3', u'pre', u'img')
70 | class BigIshDataParser(HTMLParser):
71 | def __init__(self):
72 | HTMLParser.__init__(self)
73 | self.inside_entry_content = 0
74 | self.current_tag = None
75 | self.current_text = []
76 | self.overall_text = {}
77 | self.overall_text['p'] = []
78 | self.overall_text['h1'] = []
79 | self.overall_text['h3'] = []
80 | self.overall_text['pre'] = []
81 | self.overall_text['img'] = []
82 |
83 | def handle_starttag(self, tag, attributes):
84 | if self.inside_entry_content and tag in desired_tags:
85 | self.current_tag = tag
86 | if tag == 'div':
87 | for name, value in attributes:
88 | if name == 'class' and value == 'entry-content': #if this is correct div
89 | self.inside_entry_content += 1
90 | return #don't keep going through the attributes since there could be a ton of them
91 | if tag == 'img' and self.inside_entry_content: #need to deal with images here since they're only a start tag
92 | for attr in attributes:
93 | if attr[0] == 'src':
94 | self.overall_text['img'].append(attr[1])
95 | break
96 |
97 | def handle_endtag(self, tag):
98 | if tag == 'div' and self.inside_entry_content:
99 | self.inside_entry_content -= 1 #moving on down the divs
100 | if tag == self.current_tag:
101 | tstring = ''.join(self.current_text)
102 | self.overall_text[self.current_tag].append(tstring)
103 | self.current_text = []
104 | self.current_tag = None
105 |
106 | def handle_data(self, data):
107 | if self.inside_entry_content:
108 | self.current_text.append(data)
109 |
110 | p = BigIshDataParser()
111 | page_string = p.unescape(page_string.decode('UTF-8'))
112 | p.feed(page_string)
113 | helpers.write_data('htmlparser', p.overall_text)
114 | p.close()
115 |
116 |
117 |
--------------------------------------------------------------------------------
/scraping/selenium_test.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | from selenium import webdriver
4 | from selenium.webdriver.common.keys import Keys
5 |
6 | url = 'https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/'
7 |
8 | driver = webdriver.PhantomJS()
9 | driver.get(url)
10 | elem = driver.find_element_by_class_name('entry-content')
11 |
12 | text = {}
13 | desired_tags = (u'p', u'h1', u'h3', u'pre')
14 | for tag in desired_tags:
15 | tags = elem.find_elements_by_tag_name(tag)
16 | text[tag] = []
17 | for data in tags:
18 | text[tag].append(data.text)
19 |
20 | helpers.write_data('selenium', text)
21 |
--------------------------------------------------------------------------------
/sklearn_classify/classify.py:
--------------------------------------------------------------------------------
1 | import json
2 | from sklearn.feature_extraction.text import CountVectorizer
3 | from sklearn.feature_extraction.text import TfidfTransformer
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from pandas import DataFrame
6 | import numpy
7 | from sklearn.naive_bayes import MultinomialNB
8 | from sklearn.pipeline import Pipeline
9 | from sklearn.metrics import confusion_matrix, accuracy_score
10 |
11 |
12 | def print_confusion_matrix(matrix, class_labels):
13 | lines = ["" for i in range(len(class_labels)+1)]
14 | for index, c in enumerate(class_labels):
15 | lines[0] += "\t"
16 | lines[0] += c
17 | lines[index+1] += c
18 | for index, result in enumerate(matrix):
19 | for amount in result:
20 | lines[index+1] += "\t"
21 | lines[index+1] += str(amount)
22 | for line in lines:
23 | print line
24 |
25 | def initialize_confusion_matrix(num_labels):
26 | return [[0 for i in range(num_labels)] for y in range(num_labels)]
27 |
28 |
29 | '''
30 | counts = count_vectorizer.fit_transform(data['text'].values)
31 | bigram_counts = bigram_vectorizer.fit_transform(data['text'].values)
32 | tfidf_counts = tfidf_vectorizer.fit_transform(data['text'].values)
33 | '''
34 |
35 | labels = ["baby", "tool", "home", "pet", "food"]
36 |
37 | count_vectorizer = CountVectorizer(min_df=1)
38 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
39 | tfidf_vectorizer = TfidfVectorizer(min_df=1)
40 |
41 | classifier = MultinomialNB()
42 |
43 | pipeline = Pipeline([
44 | ('count_vectorizer', bigram_vectorizer),
45 | ('classifier', classifier)
46 | ])
47 |
48 | reviews = []
49 | for label in labels:
50 | filename = "train_%s.json" % label
51 | with open(filename, 'r') as f:
52 | for line in f:
53 | reviews.append({'text': json.loads(line)["reviewText"], 'class': label})
54 |
55 | data = DataFrame(reviews)
56 | data = data.reindex(numpy.random.permutation(data.index))
57 |
58 | pipeline.fit(data['text'].values, data['class'].values)
59 |
60 | test_reviews = []
61 | for index, label in enumerate(labels):
62 | filename = "test_%s.json" % label
63 | with open(filename, 'r') as f:
64 | for line in f:
65 | test_reviews.append({'text': json.loads(line)["reviewText"], 'class': label})
66 |
67 | test_examples = [review['text'] for review in test_reviews]
68 | test_labels = [review['class'] for review in test_reviews]
69 |
70 | #print pipeline.score(test_examples)
71 | guesses = pipeline.predict(test_examples)
72 |
73 | print accuracy_score(test_labels, guesses)
74 | print confusion_matrix(test_labels, guesses, labels=labels)
75 |
76 |
--------------------------------------------------------------------------------
/tourstats/analyze.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 | from sklearn import linear_model
4 | import os
5 | from bokeh.plotting import figure, output_file, show, vplot
6 | from collections import Iterable, Sequence
7 |
8 | stat = 'Driving Distance'
9 | folder_path = 'stats_csv/%s' % (stat)
10 |
11 | key = 'AVG.'
12 |
13 | years = []
14 | yearly_data = []
15 | year_hash = {}
16 | for filename in os.listdir(folder_path):
17 | with open(folder_path + '/' + filename, 'rb') as csvfile:
18 | year = filename.split('.')[0]
19 | years.append(year)
20 | reader = csv.DictReader(csvfile)
21 | fieldnames = reader.fieldnames
22 |
23 | avgs = [float(row[key]) for row in reader]
24 | year_hash[year] = avgs
25 | yearly_data.append(avgs)
26 |
27 | int_years = [int(year) for year in years]
28 |
29 | yda = np.array(yearly_data)
30 |
31 | p = figure(tools="save", title="Max, Avg, Min Driving Distance Over Time")
32 | p.line(int_years, [np.average(asdf) for asdf in yda], line_color="red")#, fill_color="red", line_color="green", line_width=3, )
33 | p.line(int_years, [np.min(asdf) for asdf in yda], line_color="blue")#, fill_color="red", line_color="green", line_width=3, )
34 | p.line(int_years, [np.max(asdf) for asdf in yda], line_color="green")#, fill_color="red", line_color="green", line_width=3, )
35 | output_file("driving_distance.html", title="Max, Avg, Min Driving Distance Over Time")
36 | show(vplot(p))
37 |
38 | '''
39 | filename = '2015.csv'
40 | ind = []
41 | dep = []
42 | names = []
43 | with open(filename, 'rb') as csvfile:
44 | reader = csv.reader(csvfile)
45 | headings = reader.next()[1:-1] #headings
46 | for row in reader:
47 | names.append(row[0])
48 | ind.append(map(float, row[1:-3]))
49 | dep.append(float(row[-2]))
50 |
51 | npind = np.array(ind)
52 | npdep = np.array(dep)
53 |
54 | regr = linear_model.LinearRegression(normalize=True)
55 |
56 | regr.fit(npind, npdep)
57 |
58 | for name, coeff in zip(headings, regr.coef_):
59 | print "%s: %s" % (name, coeff)
60 |
61 | print("Residual sum of squares: %.2f"
62 | % np.mean((regr.predict(npind) - npdep) ** 2))
63 |
64 | for name, stats, money in zip(names, ind, dep):
65 | predicted = '{:20,.2f}'.format(np.dot(stats, regr.coef_))
66 | print "%s: %s, %s" % (name, predicted, '{:20,.2f}'.format(money))
67 |
68 | import csv
69 | from bokeh.plotting import figure, output_file, show, vplot
70 | years = range(2002,2016)
71 | years = [2002, 2015]
72 | for year in years:
73 | filename = "%s.csv" % year
74 | with open(filename, 'rb') as csvfile:
75 | reader = csv.DictReader(csvfile)
76 | fieldnames = reader.fieldnames
77 | distances = [float(row['driving_distance']) for row in reader if row['percentage_of_yardage_covered_by_tee_shots']]
78 | a = np.array(distances)
79 |
80 | hist, edges = np.histogram(a, density=True, bins=100)
81 |
82 | x = np.linspace(np.amin(a)-5, np.amax(a)+5, 1000)
83 | mu = np.mean(a)
84 | sigma = np.std(a)
85 | pdf = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x-mu)**2 / (2*sigma**2))
86 |
87 | p1 = figure(title="%s Driving Distance" % (year),tools="save", background_fill_color="#E8DDCB")
88 | p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="#036564", line_color="#033649")
89 | p1.line(x, pdf, line_color="#D95B43", line_width=8, alpha=0.7, legend="PDF")
90 |
91 | p1.legend.location = "top_left"
92 | p1.xaxis.axis_label = 'Driving Distance'
93 | p1.yaxis.axis_label = 'Pr(x)'
94 |
95 | output_file("%s_driving_distance.html" % (year), title="%s Driving Distance" % (year))
96 | show(vplot(p1))
97 |
98 |
99 | '''
100 |
--------------------------------------------------------------------------------
/tourstats/distance_vs_putts.csv:
--------------------------------------------------------------------------------
1 | Brian Gay,1.719,270.2
2 | Marc Leishman,1.723,288.3
3 | Rob Oppenheim,1.746,272.2
4 | Scott Stallings,1.798,287.3
5 | Chad Campbell,1.788,281.5
6 | Ian Poulter,1.835,276.2
7 | Cameron Percy,1.832,274.6
8 | Emiliano Grillo,1.779,286.2
9 | Smylie Kaufman,1.725,295.0
10 | Paul Casey,1.786,285.4
11 | Rickie Fowler,1.753,293.7
12 | Jason Dufner,1.756,282.7
13 | Ryo Ishikawa,1.723,297.4
14 | Charles Howell III,1.739,294.6
15 | Vaughn Taylor,1.715,272.9
16 | Jon Curran,1.738,267.1
17 | Matt Every,1.772,281.0
18 | Russell Knox,1.786,281.5
19 | Steven Bowditch,1.750,288.5
20 | Martin Piller,1.780,283.7
21 | Ben Crane,1.711,275.0
22 | David Lingmerth,1.729,284.1
23 | Bryce Molder,1.758,276.9
24 | Jim Herman,1.781,284.0
25 | D.H. Lee,1.762,276.4
26 | Chesson Hadley,1.812,288.3
27 | Robert Streb,1.808,285.4
28 | Matt Kuchar,1.725,281.2
29 | Mark Wilson,1.807,269.9
30 | Adam Hadwin,1.708,283.5
31 | Peter Malnati,1.735,277.3
32 | Billy Horschel,1.778,287.2
33 | Bud Cauley,1.762,282.6
34 | Jhonattan Vegas,1.791,288.1
35 | Rory Sabbatini,1.735,287.6
36 | D.A. Points,1.732,270.1
37 | Wes Roach,1.783,284.5
38 | Hiroshi Iwata,1.751,282.5
39 | Jonas Blixt,1.755,279.9
40 | Mark Hubbard,1.761,278.9
41 | Tyrone Van Aswegen,1.767,280.7
42 | Hideki Matsuyama,1.717,289.1
43 | Andres Gonzales,1.793,284.2
44 | Michael Kim,1.752,280.1
45 | Scott Brown,1.785,283.6
46 | Cameron Beckman,1.901,264.5
47 | Patrick Rodgers,1.791,295.6
48 | Danny Lee,1.741,283.2
49 | Stewart Cink,1.800,283.5
50 | Kyle Reifers,1.746,280.9
51 | David Toms,1.791,265.2
52 | William McGirt,1.727,277.5
53 | Charley Hoffman,1.769,296.5
54 | Will Wilcox,1.770,286.8
55 | Colt Knost,1.765,271.7
56 | Webb Simpson,1.746,288.9
57 | Brett Stegmaier,1.782,286.0
58 | Lucas Glover,1.763,283.8
59 | J.B. Holmes,1.761,306.0
60 | Patton Kizzire,1.728,283.5
61 | Brian Stuard,1.790,266.2
62 | Rory McIlroy,1.773,297.2
63 | Camilo Villegas,1.772,283.4
64 | Graeme McDowell,1.737,275.8
65 | Erik Compton,1.775,281.3
66 | Bill Haas,1.782,280.0
67 | Jonathan Byrd,1.763,275.1
68 | Ollie Schniederjans,1.802,295.5
69 | Tyler Aldridge,1.776,280.6
70 | Michael Putnam,1.825,278.8
71 | John Huh,1.741,274.1
72 | Vijay Singh,1.773,279.9
73 | Ken Duke,1.829,270.3
74 | Cameron Tringale,1.730,279.4
75 | Nick Watney,1.759,290.0
76 | Justin Thomas,1.742,293.9
77 | Nick Taylor,1.764,290.6
78 | Justin Rose,1.735,289.5
79 | Kyle Stanley,1.769,282.4
80 | Steve Wheatcroft,1.760,281.4
81 | Brendon de Jonge,1.784,283.1
82 | Cameron Smith,1.745,276.6
83 | Charlie Beljan,1.910,295.1
84 | Stuart Appleby,1.781,276.0
85 | Dustin Johnson,1.722,305.5
86 | Zach Johnson,1.755,281.8
87 | Charl Schwartzel,1.788,288.5
88 | Billy Hurley III,1.785,263.5
89 | Fabian Gomez,1.781,284.4
90 | Sung Kang,1.785,277.5
91 | Rhein Gibson,1.848,284.7
92 | Jason Day,1.765,297.4
93 | Andrew Loupe,1.701,296.4
94 | Luke Donald,1.781,270.8
95 | Alex Cejka,1.740,278.1
96 | Lucas Lee,1.818,265.7
97 | Justin Hicks,1.771,277.5
98 | Sam Saunders,1.810,284.8
99 | Tim Herron,1.825,269.1
100 | Brandt Snedeker,1.718,289.8
101 | Ryan Moore,1.715,281.0
102 | Justin Leonard,1.747,270.7
103 | Steve Stricker,1.661,268.9
104 | Tim Clark,1.796,270.2
105 | Hudson Swafford,1.786,292.6
106 | Carlos Ortiz,1.736,285.9
107 | George McNeill,1.744,277.8
108 | Jeff Overton,1.764,281.6
109 | John Senden,1.770,283.1
110 | Jimmy Walker,1.751,291.6
111 | Ben Martin,1.730,280.6
112 | Dicky Pride,1.807,268.5
113 | Hunter Stewart,1.782,283.3
114 | Brice Garnett,1.757,274.2
115 | Robert Garrigus,1.800,290.2
116 | Aaron Baddeley,1.722,281.7
117 | Jason Bohn,1.711,280.3
118 | James Hahn,1.759,286.4
119 | David Hearn,1.746,275.9
120 | Sean O'Hair,1.732,287.3
121 | Andrew Landry,1.819,278.2
122 | Shawn Stefani,1.824,287.3
123 | Graham DeLaet,1.741,288.7
124 | Brian Davis,1.783,263.8
125 | Chris Stroud,1.762,281.0
126 | Roberto Castro,1.774,279.9
127 | Russell Henley,1.762,290.2
128 | Blake Adams,1.805,270.3
129 | Andres Romero,1.727,278.4
130 | Phil Mickelson,1.718,287.0
131 | Martin Laird,1.751,290.8
132 | Derek Fathauer,1.755,282.3
133 | Retief Goosen,1.784,282.0
134 | Chris Kirk,1.824,279.7
135 | Davis Love III,1.773,283.8
136 | Morgan Hoffmann,1.754,289.2
137 | Will MacKenzie,1.783,279.1
138 | K.J. Choi,1.764,271.2
139 | Jordan Spieth,1.659,287.5
140 | Abraham Ancer,1.807,272.5
141 | John Merrick,1.804,276.1
142 | Dawie van der Walt,1.775,282.2
143 | Kevin Na,1.716,277.9
144 | Troy Merritt,1.743,283.2
145 | Sergio Garcia,1.746,282.4
146 | Whee Kim,1.847,282.4
147 | Brendan Steele,1.742,291.6
148 | Daniel Berger,1.764,290.7
149 | Boo Weekley,1.799,283.8
150 | Jason Kokrak,1.793,297.1
151 | Kevin Kisner,1.691,288.1
152 | J.J. Henry,1.811,283.9
153 | Darron Stiles,1.771,256.4
154 | Kelly Kraft,1.770,280.4
155 | Rod Pampling,1.845,273.7
156 | Johnson Wagner,1.788,280.0
157 | Chez Reavie,1.764,281.8
158 | Robert Allenby,1.851,274.9
159 | Francesco Molinari,1.751,279.2
160 | Jerry Kelly,1.787,273.2
161 | Gary Woodland,1.759,301.5
162 | Si Woo Kim,1.766,287.8
163 | Michael Thompson,1.747,284.2
164 | Steve Marino,1.806,283.6
165 | Scott Langley,1.820,277.0
166 | Thomas Aiken,1.834,275.5
167 | Alex Prugh,1.853,285.0
168 | Ricky Barnes,1.741,279.7
169 | Geoff Ogilvy,1.798,286.8
170 | Brooks Koepka,1.729,298.6
171 | Daniel Summerhays,1.728,284.2
172 | Scott Pinckney,1.814,291.2
173 | Ernie Els,1.801,283.3
174 | Jarrod Lyle,1.726,274.2
175 | Brian Harman,1.760,285.1
176 | Kevin Streelman,1.754,283.9
177 | Keegan Bradley,1.833,292.2
178 | Blayne Barber,1.795,279.2
179 | Hunter Mahan,1.759,293.3
180 | Derek Ernst,1.842,284.4
181 | Miguel Angel Carballo,1.774,284.3
182 | Zac Blair,1.776,271.0
183 | Seung-Yul Noh,1.730,288.7
184 | D.J. Trahan,1.762,282.4
185 | Brendon Todd,1.774,274.3
186 | Shane Lowry,1.780,288.3
187 | Freddie Jacobson,1.717,275.9
188 | Ryan Palmer,1.748,298.6
189 | Tim Wilkinson,1.773,275.6
190 | Chad Collins,1.775,273.8
191 | Harris English,1.778,290.8
192 | Tom Hoge,1.780,281.6
193 | Kevin Chappell,1.760,285.3
194 | Pat Perez,1.772,280.2
195 | Luke List,1.796,294.2
196 | Greg Owen,1.820,290.1
197 | Bronson Burgoon,1.791,285.6
198 | Matt Jones,1.752,289.8
199 | Shane Bertsch,1.781,275.5
200 | Andy Sullivan,1.806,276.9
201 | Adam Scott,1.726,289.5
202 | Jamie Lovemark,1.774,297.0
203 | Angel Cabrera,1.787,290.6
204 | Scott Piercy,1.765,294.1
205 | Nicholas Thompson,1.837,270.2
206 | Padraig Harrington,1.779,280.4
207 | Henrik Norlander,1.780,276.1
208 | Harold Varner III,1.779,291.8
209 | Tony Finau,1.784,300.9
210 | Patrick Reed,1.743,288.1
211 | Carl Pettersson,1.734,280.4
212 | Branden Grace,1.701,282.6
213 | Bo Van Pelt,1.871,278.5
214 | Bubba Watson,1.787,307.5
215 | Spencer Levin,1.732,277.8
216 | Jason Gore,1.730,281.2
217 | Anirban Lahiri,1.700,286.0
218 |
--------------------------------------------------------------------------------
/tourstats/driving_distance.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Driving Distance over the years
7 |
8 |
9 |
10 |
11 |
14 |
15 |
16 |
17 |
18 |
19 |
27 |
28 |
--------------------------------------------------------------------------------
/tourstats/driving_vs_putts.py:
--------------------------------------------------------------------------------
1 | from models import Player, Stat, StatLine
2 |
3 | from sqlalchemy.orm import sessionmaker
4 | from sqlalchemy import create_engine, or_
5 |
6 | engine = create_engine('postgresql://pgatour_user:pgatour_user_password@localhost:5432/pgatour')
7 | Session = sessionmaker(bind=engine)
8 | session = Session()
9 |
10 | stat_names = set([
11 | #'Driving Distance',
12 | 'Putting Average',
13 | 'Total Putting',
14 | 'Greens in Regulation Percentage',
15 | 'Driving Accuracy Percentage',
16 | 'Proximity to Hole',
17 | 'Birdie Average',
18 | 'Scrambling',
19 | 'Scoring Average'
20 | ])
21 |
22 | stats = session.query(Stat.id, Stat.name).filter(or_(*[Stat.name == v for v in stat_names]))
23 | stats_info = [(stat.id, stat.name) for stat in stats]
24 |
25 | from sqlalchemy import text
26 |
27 | sql_text_train = '''
28 | select players.id,
29 | players.name,
30 | max(case when stat_lines.stat_id=330 then stat_lines.raw else null end) as putting_average,
31 | max(case when stat_lines.stat_id=157 then stat_lines.raw else null end) as driving_distance,
32 | max(case when stat_lines.stat_id=250 then stat_lines.raw else null end) as gir,
33 | max(case when stat_lines.stat_id=156 then stat_lines.raw else null end) as driving_accuracy,
34 | max(case when stat_lines.stat_id=382 then stat_lines.raw else null end) as scoring_average
35 | from players
36 | join stat_lines on stat_lines.player_id = players.id
37 | join stats on stat_lines.stat_id=stats.id
38 | where stat_lines.year=2012 and (stats.id=157 or stats.id=330 or stats.id=382 or stats.id=250 or stats.id=156) and stat_lines.raw is not null
39 | group by players.name,players.id;
40 | '''
41 |
42 | select_clauses = []
43 | where_clauses = []
44 | for stat_info in stats_info:
45 | stat_id = stat_info[0]
46 | stat_name = stat_info[1].lower().replace(' ','_')
47 | select_string = ", max(case when stat_lines.stat_id=%s then stat_lines.raw else null end) as %s" % (stat_id, stat_name)
48 | where_string = "stats.id=%s " % (stat_id)
49 | select_clauses.append(select_string)
50 | where_clauses.append(where_string)
51 |
52 | underscored_stat_names = [stat_name.lower().replace(' ','_') for stat_name in stat_names if stat_name != 'Scoring Average']
53 |
54 | sql_text = 'select players.id, players.name'
55 | for select_clause in select_clauses:
56 | sql_text += select_clause
57 | sql_text += '''
58 | from players
59 | join stat_lines on stat_lines.player_id = players.id
60 | join stats on stat_lines.stat_id=stats.id
61 | where
62 | stat_lines.year=%s
63 | and (
64 | '''
65 | for index, where_clause in enumerate(where_clauses):
66 | if index != 0:
67 | sql_text += 'or '
68 | sql_text += where_clause
69 | sql_text += '''
70 | )
71 | and stat_lines.raw is not null
72 | group by players.name, players.id;
73 | '''
74 |
75 | import pandas as pd
76 | import statsmodels.api as sm
77 | from sklearn import linear_model, preprocessing
78 | import numpy as np
79 | import sys
80 | current_module = sys.modules[__name__]
81 |
82 | sql_text_train = sql_text % '2012'
83 | sql_text_pred = sql_text % '2013'
84 |
85 | driving_accuracy_percentage_clean = lambda x: float(x) * 0.01 * 14
86 | greens_in_regulation_percentage_clean = lambda x: float(x) * 0.01 * 18
87 | greens_or_fringe_in_regulation_clean = lambda x: float(x) * 0.01 * 18
88 | putting_average_clean = lambda x: float(x) * 18
89 |
90 | def proximity_to_hole_clean(val):
91 | distances = str(val).split("'")
92 | inches = int(distances[0]) * 12 + int(distances[1][1:-1])
93 | return inches
94 |
95 | df = pd.read_sql_query(sql_text_train, engine)
96 | df = df[df.scoring_average.notnull()]
97 | for underscored_stat_name in underscored_stat_names:
98 | try:
99 | cleaning_function = getattr(current_module, underscored_stat_name+'_clean')
100 | df[underscored_stat_name] = df[underscored_stat_name].map(cleaning_function)
101 | except AttributeError:
102 | pass
103 |
104 | X_train = df[underscored_stat_names].astype(np.float)
105 | X_train = sm.add_constant(X_train)
106 | y = df['scoring_average'].astype(np.float)
107 |
108 | res = sm.OLS(y,X_train).fit()
109 | print res.summary()
110 | ytrain = res.predict(X_train)
111 |
112 | #prediction time
113 | df_pred = pd.read_sql_query(sql_text_pred, engine)
114 | df_pred = df_pred[df_pred.scoring_average.notnull()]
115 | for underscored_stat_name in underscored_stat_names:
116 | try:
117 | cleaning_function = getattr(current_module, underscored_stat_name+'_clean')
118 | df_pred[underscored_stat_name] = df_pred[underscored_stat_name].map(cleaning_function)
119 | except AttributeError:
120 | pass
121 |
122 | X_pred = df_pred[underscored_stat_names].astype(np.float)
123 | X_pred = sm.add_constant(X_pred)
124 | y_actual = df_pred['scoring_average'].astype(np.float)
125 |
126 | ypred = res.predict(X_pred)
127 |
128 | import matplotlib.pyplot as plt
129 | fig, ax = plt.subplots()
130 | #ax.scatter(df['putting_average'].astype(np.float), df['scoring_average'].astype(np.float))
131 | #ax.scatter(y_actual, ypred)
132 | ax.scatter(ytrain, y)
133 |
134 | for index, row in df_pred['scoring_average'].iteritems():
135 | name = df_pred.loc[index]['name']
136 | if y_actual[index] + 1 < ypred[index] or y_actual[index] - 1 > ypred[index]:
137 | pass
138 | # ax.annotate(name, (y_actual[index],ypred[index]))
139 |
140 | plt.show()
141 | '''
142 | import csv
143 | import matplotlib.pyplot as plt
144 | filename = "distance_vs_putts.csv"
145 | df = pd.read_csv(filename, index_col=0)
146 |
147 | data = {}
148 | names = []
149 | distance = []
150 | putts = []
151 | with open(filename, 'rb') as csvfile:
152 | reader = csv.reader(csvfile)
153 | for row in reader:
154 | data[row[0]] = [row[1:3]]
155 | names.append(row[0])
156 | putts.append(row[1])
157 | distance.append(row[2])
158 |
159 | fig, ax = plt.subplots()
160 | ax.scatter(distance, putts)
161 |
162 | for i, name in enumerate(names):
163 | ax.annotate(name, (distance[i],putts[i]))
164 |
165 | plt.show()
166 | '''
167 |
--------------------------------------------------------------------------------
/tourstats/gather.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | from collections import defaultdict
4 | import csv
5 | import urllib
6 | import os
7 |
8 | import gevent
9 | from gevent import monkey
10 | monkey.patch_all() #gather_html() below uses gevent, so these imports need to be active
11 |
12 |
13 | url_stub = "http://www.pgatour.com/stats/stat.%s.%s.html" #stat id, year
14 |
15 | def feet_string_to_inches(string):
16 | ''' 29'1" for example, turns it into inches '''
17 | splits = map(float, string[:-1].split("'"))
18 | return splits[0] * 12.0 + splits[1]
19 |
20 | def to_dollas(string):
21 | return float(string[1:].replace(',', ''))
22 |
23 | category_url_stub = 'http://www.pgatour.com/stats/categories.%s.html'
24 | category_labels = ['RPTS_INQ', 'ROTT_INQ', 'RAPP_INQ', 'RARG_INQ', 'RPUT_INQ', 'RSCR_INQ', 'RSTR_INQ', 'RMNY_INQ']
25 | pga_tour_base_url = "http://www.pgatour.com"
26 | def gather_pages(url, filename):
27 | print filename
28 | urllib.urlretrieve(url, filename)
29 |
30 | def gather_html():
31 | stat_ids = []
32 | for category in category_labels:
33 | category_url = category_url_stub % (category)
34 | page = requests.get(category_url)
35 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
36 | for table in html.find_all("div", class_="table-content"):
37 | for link in table.find_all("a"):
38 | stat_ids.append(link['href'].split('.')[1])
39 | starting_year = 2015 #page in order to see which years we have info for
40 | print stat_ids
41 | for stat_id in stat_ids:
42 | url = url_stub % (stat_id, starting_year)
43 | page = requests.get(url)
44 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
45 | stat = html.find("div", class_="parsys mainParsys section").find('h3').text
46 | print stat
47 | directory = "stats_html/%s" % stat.replace('/', ' ') #replace '/' so stat names don't create extra subdirectories
48 | if not os.path.exists(directory):
49 | os.makedirs(directory)
50 | years = []
51 | for option in html.find("select", class_="statistics-details-select").find_all("option"):
52 | year = option['value']
53 | if year not in years:
54 | years.append(year)
55 | url_filenames = []
56 | for year in years:
57 | url = url_stub % (stat_id, year)
58 | filename = "%s/%s.html" % (directory, year)
59 | if not os.path.isfile(filename): #this check saves time if you've already downloaded the page
60 | url_filenames.append((url, filename))
61 | jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
62 | gevent.joinall(jobs)
63 |
64 | gather_html()
65 |
66 | '''
67 | for folder in os.listdir("stats_html"):
68 | path = "stats_html/%s" % folder
69 | if os.path.isdir(path):
70 | for file in os.listdir(path):
71 | if file[0] == '.':
72 | continue
73 | csv_lines = []
74 | file_path = path + "/" + file
75 | csv_dir = "stats_csv/" + folder
76 | if not os.path.exists(csv_dir):
77 | os.makedirs(csv_dir)
78 | csv_file_path = csv_dir + "/" + file.split('.')[0] + '.csv'
79 | print csv_file_path
80 | if os.path.isfile(csv_file_path):
81 | continue
82 | with open(file_path, 'r') as ff:
83 | f = ff.read()
84 | html = BeautifulSoup(f.replace('\n',''), 'html.parser')
85 | table = html.find('table', class_='table-styled')
86 | headings = [t.text for t in table.find('thead').find_all('td')]
87 | csv_lines.append(headings)
88 | for tr in table.find('tbody').find_all('tr'):
89 | info = [td.text.replace(u'\xa0', u' ').strip() for td in tr.find_all('td')]
90 | csv_lines.append(info)
91 | #write the array to csv
92 | with open(csv_file_path, 'wb') as csvfile:
93 | writer = spamwriter = csv.writer(csvfile, delimiter=',')
94 | for row in csv_lines:
95 | writer.writerow(row)
96 |
97 |
98 |
99 | column_keys = ['%', 'AVG']
100 | inputs = [
101 | {'name': 'driving_distance', 'sid': 101, 'conversion': float},
102 | {'name': 'driving_accuracy', 'sid': 102, 'conversion': float},
103 | {'name': 'greens_in_regulation', 'sid': 103, 'conversion': float},
104 | {'name': 'greens_or_fringe_in_regulation', 'sid': '02437', 'conversion': float},
105 | {'name': 'proximity_to_hole', 'sid': 331, 'conversion': feet_string_to_inches},
106 | {'name': 'scrambling', 'sid': 130, 'conversion': float},
107 | {'name': 'putts_per_round', 'sid': 119, 'conversion': float},
108 | {'name': 'percentage_of_yardage_covered_by_tee_shots', 'sid': '02341', 'conversion': float},
109 | {'name': 'strokes_gained_tee_to_green', 'sid': '02674', 'conversion': float},
110 | {'name': 'fairway_proximity', 'sid': 431, 'conversion': feet_string_to_inches},
111 | {'name': 'rough_proximity', 'sid': 437, 'conversion': feet_string_to_inches},
112 | {'name': 'proximity_to_hole_around_green', 'sid': 374, 'conversion': feet_string_to_inches},
113 | {'name': 'three_putt_avoidance', 'sid': 426, 'conversion': float},
114 | {'name': 'one_putt_percentage', 'sid': 413, 'conversion': float},
115 | {'name': 'total_putting', 'sid': '02428', 'conversion': float},
116 |
117 | {'name': 'scoring_average', 'sid': 120, 'conversion': float},
118 | {'name': 'scoring_average_actual', 'sid': 108, 'conversion': float},
119 | {'name': 'money_leaders', 'sid': 109, 'conversion': to_dollas}
120 | ]
121 |
122 | player_stats = defaultdict(dict)
123 | years = range(2014, 1999, -1)
124 | for year in years:
125 | print year
126 | for source in inputs:
127 | print source['name']
128 | url = url_stub % (source['sid'], year)
129 | page = requests.get(url)
130 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
131 | for row in html.find("table", id="statsTable").find('tbody').find_all('tr'):
132 | stat_line = [info.text for info in row.find_all('td')]
133 | player = str(stat_line[2].replace(u'\xa0', u' ').strip())
134 | stat = source['conversion'](stat_line[4])
135 | player_stats[player][source['name']] = stat
136 |
137 | filename = "%s.csv" % year
138 | with open(filename, 'w') as csvfile:
139 | fieldnames = ['name'] + [s['name'] for s in inputs]
140 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
141 | writer.writeheader()
142 | for name, stats in player_stats.iteritems():
143 | if stats.get('scoring_average') == None:
144 | continue
145 | stats['name'] = name
146 | writer.writerow(stats)
147 |
148 | '''
149 |
--------------------------------------------------------------------------------
/tourstats/models.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy import Column, Integer, String, ForeignKey
3 | from sqlalchemy.orm import relationship
4 |
5 | Base = declarative_base()
6 |
7 | class Player(Base):
8 | __tablename__ = 'players'
9 | id = Column('id', Integer, primary_key=True)
10 | name = Column('name', String)
11 | stat_lines = relationship("StatLine")
12 |
13 | class Stat(Base):
14 | __tablename__ = 'stats'
15 | id = Column('id', Integer, primary_key=True)
16 | name = Column('name', String)
17 | stat_lines = relationship("StatLine")
18 |
19 | class StatLine(Base):
20 | __tablename__ = 'stat_lines'
21 | id = Column('id', Integer, primary_key=True)
22 | player_id = Column('player_id', Integer, ForeignKey("players.id"))
23 | player = relationship('Player')
24 | stat_id = Column('stat_id', Integer, ForeignKey("stats.id"))
25 | stat = relationship('Stat')
26 | raw = Column('raw', String)
27 | events = Column('events', Integer)
28 | year = Column('year', Integer)
29 |
--------------------------------------------------------------------------------
/tourstats/models.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jackschultz/bigishdata/e16cb67daa5196f06a140877ba108c1aea58d995/tourstats/models.pyc
--------------------------------------------------------------------------------
/tourstats/seed.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy.orm import sessionmaker
3 |
4 | from sqlalchemy import create_engine
5 | from sqlalchemy import Column, Integer, String, ForeignKey
6 | from sqlalchemy.orm import relationship
7 |
8 | engine = create_engine('postgresql://pgatour_user:pgatour_user_password@localhost:5432/pgatour')
9 | Session = sessionmaker(bind=engine)
10 | session = Session()
11 |
12 | Base = declarative_base()
13 |
14 | class Player(Base):
15 | __tablename__ = 'players'
16 | id = Column('id', Integer, primary_key=True)
17 | name = Column('name', String)
18 | stat_lines = relationship("StatLine")
19 |
20 | class Stat(Base):
21 | __tablename__ = 'stats'
22 | id = Column('id', Integer, primary_key=True)
23 | name = Column('name', String)
24 | stat_lines = relationship("StatLine")
25 |
26 | class StatLine(Base):
27 | __tablename__ = 'stat_lines'
28 | id = Column('id', Integer, primary_key=True)
29 | player_id = Column('player_id', Integer, ForeignKey("players.id"))
30 | player = relationship('Player')
31 | stat_id = Column('stat_id', Integer, ForeignKey("stats.id"))
32 | stat = relationship('Stat')
33 | raw = Column('raw', String)
34 | events = Column('events', Integer)
35 | year = Column('year', Integer)
36 |
37 | import os
38 | import csv
39 |
40 | '''
41 | players = set()
42 | def add_players_from_file(filepath):
43 | with open(filepath, 'rb') as csvfile:
44 | reader = csv.reader(csvfile)
45 | for row in reader:
46 | player_name = row[2]
47 | #some players have an asterisk at the end of their name
48 | #want to remove this for player insertion
49 | if len(player_name) > 0 and player_name[-1] == "*":
50 | player_name = player_name[0:-2]
51 | players.add(player_name)
52 |
53 | for subdir, dirs, files in os.walk('stats_csv'):
54 | for dir in dirs:
55 | for subdir, dirs, files in os.walk("stats_csv/%s" % dir):
56 | for file in files:
57 | filepath = "stats_csv/%s/%s" % (dir, file)
58 | add_players_from_file(filepath)
59 |
60 | for player_name in players:
61 | if session.query(Player).filter_by(name=player_name).count() == 0:
62 | p = Player(name=player_name)
63 | session.add(p)
64 |
65 | for subdir, dirs, files in os.walk('stats_csv'):
66 | for dir in dirs:
67 | if session.query(Stat).filter_by(name=dir).count() == 0:
68 | print dir
69 | s = Stat(name=dir)
70 | session.add(s)
71 | session.commit()
72 | session.close() #for good measure
73 |
74 |
75 |
76 | def acknowledge_or_create_stat_line(data, stat, year):
77 | for row in data:
78 | if len(row) >= 5:
79 | player_name = row[2]
80 | if len(player_name) > 0 and player_name[-1] == "*":
81 | player_name = player_name[0:-2]
82 | player = session.query(Player).filter_by(name=player_name).first()
83 | stat_line = session.query(StatLine).filter_by(player=player, stat=stat, year=year).first()
84 | if not stat_line:
85 | try:
86 | events = int(row[3])
87 | except ValueError:
88 | events = 0
89 | raw = row[4]
90 | stat_line = StatLine(player=player, stat=stat, year=year, events=events, raw=raw)
91 | session.add(stat_line)
92 |
93 | def process_file(filename, stat, year):
94 | with open(filename, 'rb') as csvfile:
95 | reader = csv.reader(csvfile)
96 | next(reader)
97 | stat_count = session.query(StatLine).filter_by(stat=stat, year=year).count()
98 | data = list(reader) #only do this because I know reader is about 200. Bigger data sets can have issues!
99 | file_stat_count = len(data)
100 | print "%s, stat_count: %s, file_stat_count: %s" % (filename, stat_count, file_stat_count)
101 | if stat_count != file_stat_count:
102 | acknowledge_or_create_stat_line(data, stat, year)
103 | session.commit()
104 | return filename
105 |
106 | from multiprocessing import Pool
107 | pool = Pool()
108 |
109 | for subdir, dirs, files in os.walk('stats_csv'):
110 | for dir in dirs:
111 | stat = session.query(Stat).filter_by(name=dir).first()
112 | for subdir, dirs, files in os.walk("stats_csv/%s" % dir):
113 | for file in files:
114 | year = int(file[0:-4]) #chopping off the csv
115 | filepath = "stats_csv/%s/%s" % (dir, file)
116 | pool.apply_async(process_file, [filepath, stat, year])
117 |
118 | pool.close()
119 | pool.join()
120 | '''
121 |
122 | phil = session.query(Player).filter_by(name='Phil Mickelson').first()
123 | stat = session.query(Stat).filter_by(name='Driving Distance').first()
124 | stat_lines = session.query(StatLine).filter_by(player=phil, stat=stat).order_by("year")
125 | for stat_line in stat_lines:
126 | print "%s: %s" % (stat_line.year, stat_line.raw)
127 |
--------------------------------------------------------------------------------