├── .gitignore
├── baby_names
│   └── name_search.py
├── naive_bayes
│   ├── app.py
│   ├── classify.py
│   ├── naive_bayes.py
│   ├── sample.py
│   ├── test_baby.json
│   ├── test_food.json
│   ├── test_home.json
│   ├── test_pet.json
│   ├── test_tool.json
│   ├── train_baby.json
│   ├── train_food.json
│   ├── train_home.json
│   ├── train_pet.json
│   └── train_tool.json
├── scraping
│   ├── data_scrapy.py
│   ├── helpers.py
│   ├── page.html
│   ├── requesting_html.py
│   ├── requirements.txt
│   ├── scraping_html.py
│   └── selenium_test.py
├── sklearn_classify
│   └── classify.py
└── tourstats
    ├── analyze.py
    ├── distance_vs_putts.csv
    ├── driving_distance.html
    ├── driving_vs_putts.py
    ├── gather.py
    ├── models.py
    ├── models.pyc
    └── seed.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | naive_bayes/data
3 | tourstats/stats*
4 | tourstats/pgatourstats.zip
5 |
6 |
7 | # We don't want to save the data files,
8 | # just the code to scrape them, if that's what you're looking for.
9 | *.json
10 | *.csv
11 |
12 | *.npy
13 | *.pkl
14 | *.html
15 | *.zip
16 | *.png
17 |
18 | *.pyc
19 | *.log
20 | scraping/pages
21 | scraping/texts
22 | scraping/scraping
23 |
--------------------------------------------------------------------------------
/baby_names/name_search.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from collections import Counter
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 | vowels = ('A', 'E', 'I', 'O', 'U')
7 | import string
8 | alphabet = [letter for letter in string.ascii_uppercase]
9 | boy = 'boy'
10 | girl = 'girl'
11 |
12 | def gather_names(gender):
13 | filename = "%s_names.html" % gender
14 | names = []
15 | with open(filename, 'r') as file:
16 | page = file.read()
17 | html = BeautifulSoup(page.replace('\n',''), 'html.parser')
18 | #remove tags with class=tm-embedded-post-container
19 | #so the ad isn't included in text
20 | for tag in html.find_all('div', class_="tm-embedded-post-container"):
21 | tag.decompose()
22 | for name_link in html.find_all("li", class_="p1"):
23 | name = name_link.text.upper()
24 | names.append(name)
25 | return names
26 |
27 | boy_names = gather_names(boy)
28 | girl_names = gather_names(girl)
29 |
30 | def gender_names(gender):
31 | if gender == boy:
32 | return boy_names
33 | elif gender == girl:
34 | return girl_names
35 |
36 | def calculate_replace_letter_matches(name_set, first_letter, exchange_letter):
37 | '''
38 | Returns list of sets with name matches
39 | '''
40 | name_matches = []
41 | for name in name_set:
42 | if first_letter in name:
43 | exchange_name = name.replace(first_letter, exchange_letter)
44 | temp_name_matches = [name]
45 | if exchange_name in name_set:
46 | temp_name_matches.append(exchange_name)
47 | if len(set(temp_name_matches)) > 1:
48 | name_matches.append(set(temp_name_matches))
49 |
50 | return name_matches
51 |
52 | def replace_single_letter(first_letter='I', exchange_letter='Y', show_matches=True):
53 | boy_name_set = set(boy_names)
54 | boy_name_matches = calculate_replace_letter_matches(boy_name_set, first_letter, exchange_letter)
55 | print 'Boy name matches: %s' % len(boy_name_matches)
56 | if show_matches:
57 | print boy_name_matches
58 |
59 | girl_name_set = set(girl_names)
60 | girl_name_matches = calculate_replace_letter_matches(girl_name_set, first_letter, exchange_letter)
61 | print 'Girl name matches: %s' % len(girl_name_matches)
62 | if show_matches:
63 | print girl_name_matches
64 |
65 | def npr_solver(gender):
66 | print "Vowel Consonant Consonant Starting names for %ss" % gender
67 | names = gender_names(gender)
68 | vowel_starters = []
69 | consonant_starters = []
70 | for name in names:
71 | first_letter = name[0]
72 | if first_letter in vowels:
73 | vowel_starters.append(name)
74 | else:
75 | consonant_starters.append(name)
76 |
77 | for vname in vowel_starters:
78 | cname_same = []
79 | for cname in consonant_starters:
80 | if vname[1:] == cname[1:]:
81 | cname_same.append(cname)
82 | if cname_same:
83 | print vname
84 | for match in cname_same:
85 | print match
86 |
87 | def rhyming_names(gender):
88 | print "Rhyming for %ss" % gender
89 | total_matches = []
90 | names = gender_names(gender)
91 | for name in names:
92 | name_same = []
93 | for name2 in names:
94 | if name[1:] == name2[1:] and name != name2:
95 | name_same.append(name2)
96 | if name_same:
97 | name_same.append(name)
98 | if set(name_same) not in total_matches:
99 | total_matches.append(set(name_same))
100 | print "Total %s matches: %s" % (gender, len(total_matches))
101 | for matches in total_matches:
102 | print list(matches),
103 | print #actual new line
104 |
105 | def vowel_consonant_beginning_proportion(gender):
106 | print "Vowel Consonant Beginning Ratio for %ss" % gender
107 | names = gender_names(gender)
108 | vowel_starters = []
109 | consonant_starters = []
110 |
111 | for name in names:
112 | first_letter = name[0]
113 | if first_letter in vowels:
114 | vowel_starters.append(name)
115 | else:
116 | consonant_starters.append(name)
117 |
118 | vowel_len = float(len(vowel_starters))
119 | consonant_len = float(len(consonant_starters))
120 | print vowel_len / (vowel_len + consonant_len)
121 |
122 | def name_letter_begin_or_end(gender, index='beginning'):
123 |     if index == 'beginning':
124 |         idx = 0  #first letter of the name
125 |     elif index == 'end':
126 |         idx = -1
127 |     else:
128 |         print 'index must be "beginning" or "end"'
129 |         return
130 | print "Name letter %s for %ss" % (index, gender)
131 | names = gender_names(gender)
132 |
133 | cnt = Counter()
134 | for name in names:
135 |         letter = name[idx]
136 | cnt[letter] += 1
137 | return cnt
138 |
139 | def name_lengths_counter(gender):
140 | names = gender_names(gender)
141 |
142 | cnt = Counter()
143 | for name in names:
144 | cnt[len(name)] += 1
145 | return cnt
146 |
147 | def name_lengths(gender):
148 | names = gender_names(gender)
149 | return [len(name) for name in names]
150 |
151 | def count_name_lengths():
152 | lengths = np.arange(15)
153 | boy_lengths = name_lengths_counter(boy)
154 | girl_lengths = name_lengths_counter(girl)
155 | boy_lengths_list = [boy_lengths[length] for length in lengths]
156 | girl_lengths_list = [girl_lengths[length] for length in lengths]
157 |
158 | boy_length_counts = name_lengths(boy)
159 | girl_length_counts = name_lengths(girl)
160 |
161 | print 'Boy length avg: %s' % np.mean(boy_length_counts)
162 | print 'Boy length std: %s' % np.std(boy_length_counts)
163 | print 'Girl length avg: %s' % np.mean(girl_length_counts)
164 | print 'Girl length std: %s' % np.std(girl_length_counts)
165 |
166 | #time to plot the bars
167 | fig, ax = plt.subplots()
168 |
169 | opacity = 0.4
170 | bar_width = 0.35
171 |
172 | rects1 = plt.bar(lengths, boy_lengths_list, bar_width,
173 | alpha=opacity,
174 | color='b',
175 | label='Boys')
176 |
177 | rects2 = plt.bar(lengths + bar_width, girl_lengths_list, bar_width,
178 | alpha=opacity,
179 | color='r',
180 | label='Girls')
181 |
182 | plt.xlabel('Lengths')
183 | plt.ylabel('Number of names of that length')
184 | plt.title('Lengths of boy and girl names')
185 | plt.legend()
186 |
187 | plt.tight_layout()
188 |
189 | plt.savefig('graphs/name_length_bar.png')
190 |
191 |
192 | def begin_end_names(gender, beginning_letter, ending_letter):
193 | names = gender_names(gender)
194 | matching_names = []
195 | for name in names:
196 | if name[0] == beginning_letter and name[-1] == ending_letter:
197 | matching_names.append(name)
198 | return matching_names
199 |
200 |
201 |
202 |
203 |
204 | def vowel_consonant_ending_proportion(gender):
205 | print "Vowel Consonant Ending Ratio for %ss" % gender
206 | names = gender_names(gender)
207 | vowel_enders = []
208 | consonant_enders = []
209 | for name in names:
210 | last_letter = name[-1]
211 | if last_letter in vowels:
212 | vowel_enders.append(name)
213 | else:
214 | consonant_enders.append(name)
215 |
216 | vowel_len = float(len(vowel_enders))
217 | consonant_len = float(len(consonant_enders))
218 | print vowel_len / (vowel_len + consonant_len)
219 |
220 | def count_vowels_consonants(gender, index):
221 | '''
222 | Gives counts for whether the letters at the indicies are vowels or consonants
223 | index = 1 for first letter, index = -1 for last letter.
224 | Other indicies work, but might cause error if index is longer than two since
225 | there are some two letter names!
226 | '''
227 | names = gender_names(gender)
228 | sizes = []
229 | cnt = Counter()
230 | for name in names:
231 | if name[index] in vowels:
232 | cnt['v'] += 1
233 | else:
234 | cnt['c'] += 1
235 | return cnt
236 |
237 | def print_percentages(gender, sizes, title):
238 | vowel_len = float(sizes[0])
239 | consonant_len = float(sizes[1])
240 | vowel_percentage = vowel_len / (vowel_len + consonant_len)
241 | consonant_percentage = consonant_len / (vowel_len + consonant_len)
242 | print title % gender
243 | print 'Vowel percentage: %s' % vowel_percentage
244 | print 'Consonant percentage: %s' % consonant_percentage
245 | print #for spacing
246 |
247 | def vowels_consonant_starts():
248 | '''
249 | Pie graph of the frequency of names that begin with vowels for both genders
250 | '''
251 | boy_counts = count_vowels_consonants(boy, 0)
252 | girl_counts = count_vowels_consonants(girl, 0)
253 |
254 |     #pie chart for the boy names first, then the girls
255 | title = "Percentage of %s names that start with vowels or consonants"
256 | labels = 'Vowels', 'Consonants'
257 | boy_fig, boy_ax = plt.subplots()
258 | sizes = [boy_counts['v'], boy_counts['c']]
259 | print_percentages(boy, sizes, title)
260 |
261 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
262 | plt.title(title % 'boy')
263 | plt.savefig('graphs/vowel_consonant_start_boy.png')
264 |
265 | sizes = [girl_counts['v'], girl_counts['c']]
266 | print_percentages(girl, sizes, title)
267 | girl_fig, girl_ax = plt.subplots()
268 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
269 | plt.title(title % 'girl')
270 |
271 | plt.savefig('graphs/vowel_consonant_start_girl.png')
272 |
273 | def vowels_consonant_ends():
274 | '''
275 |     Pie graph of the frequency of names that end with vowels for both genders
276 | '''
277 | boy_counts = count_vowels_consonants(boy, -1)
278 | girl_counts = count_vowels_consonants(girl, -1)
279 |
280 |     #pie chart for the boy names first, then the girls
281 |
282 | title = "Percentage of %s names that end with vowels or consonants"
283 | labels = 'Vowels', 'Consonants'
284 | boy_fig, boy_ax = plt.subplots()
285 |
286 | sizes = [boy_counts['v'], boy_counts['c']]
287 |     print_percentages(boy, sizes, title)
288 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
289 | plt.title(title % 'boy')
290 | plt.savefig('graphs/vowel_consonant_ends_boys.png')
291 |
292 | sizes = [girl_counts['v'], girl_counts['c']]
293 | print_percentages(girl, sizes, title)
294 | girl_fig, girl_ax = plt.subplots()
295 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
296 | plt.title(title % 'girl')
297 | plt.savefig('graphs/vowel_consonant_ends_girls.png')
298 |
299 | def vowel_endings():
300 | boy_counts = count_vowel_frequency(boy, -1)
301 | girl_counts = count_vowel_frequency(girl, -1)
302 |
303 |     #pie chart for the boy names first, then the girls
304 |
305 | title = "Percentage of vowels that %s names end with"
306 | labels = boy_counts.keys()
307 | boy_fig, boy_ax = plt.subplots()
308 | sizes = [boy_counts[vowel] for vowel in labels if vowel in boy_counts.keys()]
309 | print_percentages(boy, sizes, title)
310 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
311 | plt.title(title % 'boy')
312 | plt.savefig('graphs/vowel_endings_boys.png')
313 |
314 | girl_fig, girl_ax = plt.subplots()
315 | labels = girl_counts.keys()
316 | sizes = [girl_counts[vowel] for vowel in labels if vowel in girl_counts.keys()]
317 | print_percentages(girl, sizes, title)
318 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
319 | plt.title(title % 'girl')
320 | plt.savefig('graphs/vowel_endings_girls.png')
321 |
322 | def vowel_beginnings():
323 | boy_counts = count_vowel_frequency(boy, 0)
324 | girl_counts = count_vowel_frequency(girl, 0)
325 |
326 |     #pie chart for the boy names first, then the girls
327 |
328 | title = "Percentage of vowels that %s names begin with"
329 | labels = boy_counts.keys()
330 | boy_fig, boy_ax = plt.subplots()
331 | sizes = [boy_counts[vowel] for vowel in labels if vowel in boy_counts.keys()]
332 | boy_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
333 | plt.title(title % 'boy')
334 | plt.savefig('graphs/vowel_beginnings_boys.png')
335 |
336 | girl_fig, girl_ax = plt.subplots()
337 | labels = girl_counts.keys()
338 | sizes = [girl_counts[vowel] for vowel in labels if vowel in girl_counts.keys()]
339 | girl_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
340 | plt.title(title % 'girl')
341 | plt.savefig('graphs/vowel_beginnings_girls.png')
342 |
343 | def letter_frequency(index=0):
344 |
345 | boy_name_counter = Counter()
346 | for name in boy_names:
347 |         letter = name[index]
348 |         boy_name_counter[letter] += 1
349 |
350 | print boy_name_counter
351 |
352 | girl_name_counter = Counter()
353 | for name in girl_names:
354 |         letter = name[index]
355 |         girl_name_counter[letter] += 1
356 |
357 | print girl_name_counter
358 |
359 | boy_alphabet_count_list = []
360 | girl_alphabet_count_list = []
361 | for letter in alphabet:
362 | boy_alphabet_count_list.append(float(boy_name_counter[letter]))
363 | girl_alphabet_count_list.append(float(girl_name_counter[letter]))
364 |
365 | print boy_alphabet_count_list
366 | print girl_alphabet_count_list
367 |
368 | #time to plot the bars
369 | fig, ax = plt.subplots()
370 |
371 | opacity = 0.6
372 | bar_width = 0.35
373 |
374 | num_letters = np.arange(26)
375 | rects1 = plt.bar(num_letters, boy_alphabet_count_list, bar_width,
376 | align='center',
377 | alpha=opacity,
378 | color='b',
379 | label='Boys')
380 |
381 | rects2 = plt.bar(num_letters + bar_width, girl_alphabet_count_list, bar_width,
382 | align='center',
383 | alpha=0.8,
384 | color='palevioletred',
385 | label='Girls')
386 |
387 | plot_title = 'Number of names that begin with certain letters'
388 | xtick_pos = [let + (bar_width / 2) for let in num_letters]
389 | plt.xticks(xtick_pos, alphabet)
390 | plt.xlabel('Letters')
391 | plt.ylabel('Number of names that begin with letter')
392 | plt.title(plot_title)
393 | plt.legend()
394 | plt.tight_layout()
395 | #plt.show()
396 | plt.savefig('graphs/letter_frequency.png')
397 |
398 |
399 | def count_vowel_frequency(gender, index):
400 | names = gender_names(gender)
401 | sizes = []
402 | cnt = Counter()
403 | for name in names:
404 | index_letter = name[index]
405 | if index_letter in vowels:
406 | cnt[index_letter] += 1
407 | return cnt
408 |
409 | def common_names(gender):
410 | '''
411 | Gathers the letters that the names start with, and then count the combos of all the names that begin and end with the matching letters.
412 | '''
413 | begin_gender_counts = name_letter_begin_or_end(gender, index='beginning')
414 | end_gender_counts = name_letter_begin_or_end(gender, index='end')
415 |
416 | for bletter in begin_gender_counts:
417 | for eletter in end_gender_counts:
418 | match_names= begin_end_names(gender, bletter, eletter)
419 |
420 | for name in match_names:
421 | print name
422 |
423 |
424 | if __name__ == '__main__':
425 | #npr_solver(boy)
426 | #npr_solver(girl)
427 | #vowels_consonant_starts()
428 | #vowels_consonant_ends()
429 | #vowel_endings()
430 | #vowel_beginnings()
431 | #letter_frequency()
432 | #letter_frequency(index=0)
433 | #count_name_lengths()
434 | #replace_single_letter(first_letter='I', exchange_letter='Y')
435 | #replace_single_letter(first_letter='IE', exchange_letter='Y')
436 | #replace_single_letter(first_letter='EE', exchange_letter='Y')
437 | #replace_single_letter(first_letter='A', exchange_letter='Y')
438 | #replace_single_letter(first_letter='C', exchange_letter='K')
439 | #replace_single_letter(first_letter='CK', exchange_letter='K')
440 | #replace_single_letter(first_letter='HN', exchange_letter='N')
441 | #replace_single_letter(first_letter='G', exchange_letter='J')
442 | #rhyming_names(boy)
443 | #rhyming_names(girl)
444 | '''
445 | for index, l1 in enumerate(alphabet):
446 | for l2 in alphabet[index:]:
447 | print "Flip %s and %s" % (l1, l2)
448 | replace_single_letter(first_letter=l1, exchange_letter=l2, show_matches=True)
449 | for match in alphabet:
450 | print "Flip %s and %s" % ('K', match)
451 | replace_single_letter(first_letter='K', exchange_letter=match, show_matches=True)
452 | '''
453 | pass # in case you don't uncomment a test you want to run, we need correct syntax
454 |
--------------------------------------------------------------------------------
/naive_bayes/app.py:
--------------------------------------------------------------------------------
1 | from flask import jsonify, request, Flask
2 | from sklearn.externals import joblib
3 |
4 | print "Loading Pickled Pipeline"
5 | fitted_pipeline = joblib.load('classifier.pkl')
6 |
7 | app = Flask(__name__)
8 |
9 | @app.route('/', methods=['POST'])
10 | def predict():
11 | text = request.form.get('text')
12 | guess = fitted_pipeline.predict([text])[0] #pipeline returns array
13 | results = {"class": guess}
14 | return jsonify(results)
15 |
16 | if __name__ == '__main__':
17 | app.run()
18 |
19 |
--------------------------------------------------------------------------------
/naive_bayes/classify.py:
--------------------------------------------------------------------------------
1 | import json
2 | from sklearn.feature_extraction.text import CountVectorizer
3 | from sklearn.feature_extraction.text import TfidfTransformer
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from sklearn.feature_extraction.text import HashingVectorizer
6 | from pandas import DataFrame
7 | import numpy
8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
9 | from sklearn.pipeline import Pipeline
10 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
11 | from sklearn.cross_validation import KFold
12 | from sklearn.svm import SVC
13 |
14 |
15 | LABELS = []
16 | LABELS.append("baby")
17 | LABELS.append("tool")
18 | LABELS.append("home")
19 | LABELS.append("pet")
20 | LABELS.append("food")
21 | LABELS.append("automotive")
22 | LABELS.append("instant_video")
23 | LABELS.append("beauty")
24 | LABELS.append("cds_vinyl")
25 | LABELS.append("clothes")
26 | LABELS.append("digital_music")
27 | LABELS.append("cell_phones")
28 | LABELS.append("electronics")
29 | LABELS.append("kindle")
30 | LABELS.append("movies_tv")
31 | LABELS.append("instruments")
32 | LABELS.append("office")
33 | LABELS.append("patio")
34 | LABELS.append("health")
35 | LABELS.append("sports")
36 | LABELS.append("toys")
37 | LABELS.append("video_games")
38 | LABELS.append("books")
39 |
40 | def read_review_data(num_classes):
41 | print "Begin reading in data"
42 | labels = LABELS[:num_classes]
43 | reviews = []
44 | for label in labels:
45 | train_filename = "train_%s.json" % label
46 |         test_filename = "test_%s.json" % label
47 | filenames = [train_filename, test_filename]
48 | for filename in filenames:
49 | with open(filename, 'r') as f:
50 | for line in f:
51 | text = json.loads(line)["reviewText"]
52 | reviews.append({'text': text, 'class': label})
53 |
54 | data = DataFrame(reviews)
55 | data = data.reindex(numpy.random.permutation(data.index))
56 |
57 | #to evaluate the length of review
58 | data["word_count"] = [len(text.split(" ")) for text in data["text"]]
59 |
60 | NUM_TRAIN_SAMPLES = int(len(data) * 0.8)
61 |
62 | train_data = data[:NUM_TRAIN_SAMPLES]
63 | test_data = data[NUM_TRAIN_SAMPLES:]
64 | print "End reading in data"
65 |
66 | return (train_data, test_data, labels)
67 |
68 | def test_fitted_pipeline(fitted_pipeline, test_data, labels, description=""):
69 | actual = test_data['class'].values
70 | print "Predicting %s" % description
71 | predictions = fitted_pipeline.predict(test_data['text'].values)
72 | score = accuracy_score(actual, predictions)
73 | cmat = confusion_matrix(actual, predictions, labels)
74 | print
75 | print description or "Results"
76 | print score
77 | print labels
78 | print cmat
79 |
80 | def fit_pipeline(pipeline, train_data, description=""):
81 | print "Training %s Classifier" % description
82 | pipeline.fit(train_data['text'].values, train_data['class'].values)
83 | return pipeline
84 |
85 | def test_pipeline(pipeline, train_data, test_data, labels, description=""):
86 | fitted_pipeline = fit_pipeline(pipeline, train_data, description=description)
87 | test_fitted_pipeline(fitted_pipeline, test_data, labels, description=description)
88 |
89 | def evaluate_n_grams(num_classes=5):
90 | train_data, test_data, labels = read_review_data(num_classes)
91 | classifier = MultinomialNB()
92 |
93 | unigram_vectorizer = CountVectorizer(stop_words='english')
94 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
95 | trigram_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
96 | fourgram_vectorizer = CountVectorizer(ngram_range=(1, 4), stop_words='english')
97 |
98 |
99 | unigram_pipeline = Pipeline([
100 | ('count_vectorizer', unigram_vectorizer),
101 | ('classifier' , classifier)
102 | ])
103 |
104 | bigram_pipeline = Pipeline([
105 | ('count_vectorizer', bigram_vectorizer),
106 | ('classifier' , classifier)
107 | ])
108 |
109 | trigram_pipeline = Pipeline([
110 | ('count_vectorizer', trigram_vectorizer),
111 | ('classifier' , classifier)
112 | ])
113 |
114 | fourgram_pipeline = Pipeline([
115 | ('count_vectorizer', fourgram_vectorizer),
116 | ('classifier' , classifier)
117 | ])
118 |
119 |     test_pipeline(unigram_pipeline, train_data, test_data, labels, description="Unigrams")
120 |     test_pipeline(bigram_pipeline, train_data, test_data, labels, description="Bigrams")
121 |     test_pipeline(trigram_pipeline, train_data, test_data, labels, description="Trigrams")
122 |     test_pipeline(fourgram_pipeline, train_data, test_data, labels, description="Fourgrams")
123 |
124 | def evaluate_classifier_type(num_classes=5):
125 | train_data, test_data, labels = read_review_data(num_classes)
126 |
127 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
128 |
129 | mn_classifier = MultinomialNB(fit_prior=False)
130 | b_classifier = BernoulliNB()
131 |
132 | mn_pipeline = Pipeline([
133 | ('count_vectorizer', bigram_vectorizer),
134 | ('classifier' , mn_classifier)
135 | ])
136 |
137 | b_pipeline = Pipeline([
138 | ('count_vectorizer', bigram_vectorizer),
139 | ('classifier' , b_classifier)
140 | ])
141 |
142 | test_pipeline(mn_pipeline, train_data, test_data, labels, description="Multinomial")
143 | test_pipeline(b_pipeline, train_data, test_data, labels, description="Bernoulli")
144 |
145 | def evaluate_tfidf(num_classes=5):
146 | train_data, test_data, labels = read_review_data(num_classes)
147 | labels = labels[:num_classes]
148 | classifier = MultinomialNB()
149 |
150 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
151 | tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
152 | tfidf_transformer = TfidfTransformer()
153 |
154 | no_tfidf_pipeline = Pipeline([
155 | ('count_vectorizer', bigram_vectorizer),
156 | ('classifier' , classifier)
157 | ])
158 |
159 | tfidf_vectorizer_pipeline = Pipeline([
160 | ('count_vectorizer', tfidf_vectorizer),
161 | ('classifier' , classifier)
162 | ])
163 |
164 | tfidf_transformer_pipeline = Pipeline([
165 | ('count_vectorizer', bigram_vectorizer),
166 | ('tfidf_transformer' , tfidf_transformer),
167 | ('classifier' , classifier)
168 | ])
169 |
170 | test_pipeline(no_tfidf_pipeline, train_data, test_data, labels, description="No TFIDF")
171 |
172 | test_pipeline(tfidf_vectorizer_pipeline, train_data, test_data, labels, description="TFIDF Vectorizer")
173 |
174 | test_pipeline(tfidf_transformer_pipeline, train_data, test_data, labels, description="TFIDF Transformer")
175 |
176 | def evaluate_training_counts(train_data, test_data, num_classes=5):
177 | pass
178 |
179 | def evaluate_standard(num_classes=5):
180 | train_data, test_data, labels = read_review_data(num_classes)
181 |
182 | four_gram_vectorizer = CountVectorizer(ngram_range=(1, 4), stop_words='english')
183 | classifier = MultinomialNB(fit_prior=False)
184 |
185 | pipeline = Pipeline([
186 | ('count_vectorizer', four_gram_vectorizer),
187 | ('classifier' , classifier)
188 | ])
189 |
190 | test_pipeline(pipeline, train_data, test_data, labels, description="Standard")
191 |
192 | def evaluate_lengths(num_classes=5):
193 | train_data, test_data, labels = read_review_data(num_classes)
194 |
195 | vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
196 | classifier = MultinomialNB(fit_prior=False)
197 |
198 | pipeline = Pipeline([
199 | ('count_vectorizer', vectorizer),
200 | ('classifier' , classifier)
201 | ])
202 |
203 |     shortest_test_data = test_data[test_data["word_count"] < 20]
204 |     short_test_data = test_data[(test_data["word_count"] >= 20) & (test_data["word_count"] <= 50)]
205 |     med_test_data = test_data[(test_data["word_count"] > 50) & (test_data["word_count"] <= 100)]
206 |     long_test_data = test_data[test_data["word_count"] > 100]
207 |
208 | fitted_pipeline = fit_pipeline(pipeline, train_data)
209 |
210 |     test_fitted_pipeline(fitted_pipeline, test_data, labels, description="Standard")
211 |     print
212 |     print "Num shortest data: %s" % str(len(shortest_test_data))
213 |     test_fitted_pipeline(fitted_pipeline, shortest_test_data, labels, description="Under 20 Words")
214 |     print
215 |     print "Num short data: %s" % str(len(short_test_data))
216 |     test_fitted_pipeline(fitted_pipeline, short_test_data, labels, description="Between 20 and 50 Words")
217 |     print
218 |     print "Num medium data: %s" % str(len(med_test_data))
219 |     test_fitted_pipeline(fitted_pipeline, med_test_data, labels, description="Between 50 and 100 Words")
220 |     print
221 |     print "Num long data: %s" % str(len(long_test_data))
222 |     test_fitted_pipeline(fitted_pipeline, long_test_data, labels, description="Over 100 Words")
223 |
224 |
225 | def pickle_pipeline(pipeline, num_classes=5):
226 | train_data, test_data, labels = read_review_data(num_classes)
227 | from sklearn.externals import joblib
228 | fitted_pipeline = fit_pipeline(pipeline, train_data)
229 | print "Pickling Pipeline"
230 | joblib.dump(fitted_pipeline, 'classifier.pkl')
231 |
232 |
233 | def use_pickled_pipeline(num_classes=26):
234 | _, test_data, labels = read_review_data(num_classes)
235 | from sklearn.externals import joblib
236 | print "Loading Pickled Pipeline"
237 | fitted_pipeline = joblib.load('classifier.pkl')
238 |
239 | test_fitted_pipeline(fitted_pipeline, test_data, labels, description="From Pickle")
240 |
241 | pass
242 | '''
243 |
244 |
245 |
246 | import matplotlib.pyplot as plt
247 |
248 | bins = [10 * (i) for i in range(50)]
249 | percents = [0.5506607929515418, 0.8571428571428571, 0.89151434091246839, 0.92522522522522521, 0.92804878048780493, 0.9469924812030075, 0.95398230088495573, 0.9448568398727466, 0.95388502842703726, 0.95697329376854601, 0.96498719043552517, 0.96037735849056605, 0.96003996003996006, 0.96465222348916757, 0.96681096681096679, 0.95469798657718119, 0.94086021505376349, 0.94837476099426388, 0.95259593679458243, 0.95022624434389136, 0.94750000000000001, 0.96625766871165641, 0.96491228070175439, 0.96180555555555558, 0.98084291187739459, 0.96442687747035571, 1.0, 0.95979899497487442, 0.90217391304347827, 0.96575342465753422, 0.93442622950819676, 0.94244604316546765, 0.97058823529411764, 0.94444444444444442, 0.98019801980198018, 0.91752577319587625, 0.95652173913043481, 0.98913043478260865, 1.0, 0.93670886075949367, 0.93333333333333335, 0.94805194805194803, 0.9642857142857143, 1.0, 0.98148148148148151, 0.92500000000000004, 0.90697674418604646, 0.93877551020408168, 1.0, 1.0]
250 |
251 |
252 | '''
253 |
254 | '''
255 | import matplotlib.pyplot as plt
256 |
257 | plt.figure()
258 | plt.title('Correct Max Probabilities')
259 | plt.hist(correct, 100)
260 | plt.show()
261 | plt.figure()
262 | plt.title('Incorrect Max Probabilities')
263 | plt.hist(incorrect, 100)
264 | plt.show()
265 |
266 | #quit()
267 | '''
268 |
269 | '''
270 | count_vectorizer = CountVectorizer(min_df=1, stop_words='english')
271 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
272 | trigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
273 | fourgram_vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1, stop_words='english')
274 |
275 | tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
276 | #hashing_vectorizer = HashingVectorizer(n_features=100, non_negative=True)
277 | classifier = MultinomialNB()
278 | g_classifier = GaussianNB()
279 | b_classifier = BernoulliNB()
280 |
281 | pipeline = Pipeline([
282 | ('count_vectorizer', count_vectorizer),
283 | ('classifier' , classifier)
284 | ])
285 |
286 | bigram_pipeline = Pipeline([
287 | ('count_vectorizer', bigram_vectorizer),
288 | ('tfidf_transformer', TfidfTransformer()),
289 | ('classifier' , classifier)
290 | ])
291 |
292 | trigram_pipeline = Pipeline([
293 | ('count_vectorizer', trigram_vectorizer),
294 | ('classifier' , classifier)
295 | ])
296 |
297 | fourgram_pipeline = Pipeline([
298 | ('count_vectorizer', fourgram_vectorizer),
299 | ('classifier' , classifier)
300 | ])
301 |
302 |
303 | g_pipeline = Pipeline([
304 | ('count_vectorizer', bigram_vectorizer),
305 | ('classifier' , g_classifier)
306 | ])
307 |
308 | b_pipeline = Pipeline([
309 | ('count_vectorizer', bigram_vectorizer),
310 | ('classifier' , b_classifier)
311 | ])
312 |
313 |
314 | bigram_pipeline.fit(train_data['text'].values, train_data['class'].values)
315 |
316 |
317 | bigram_predictions = bigram_pipeline.predict(test_data['text'].values)
318 | bigram_score = accuracy_score(actual, bigram_predictions)
319 | bigram_cmat = confusion_matrix(actual, bigram_predictions, labels)
320 |
321 | incorrect_indicies = []
322 | correct_indicies = []
323 | for index, (act, pred) in enumerate(zip(actual, bigram_predictions)):
324 | if act != pred:
325 | incorrect_indicies.append(index)
326 | else:
327 | correct_indicies.append(index)
328 |
329 | incorrect = test_data.iloc[incorrect_indicies]
330 | correct = test_data.iloc[correct_indicies]
331 |
332 | for index, ic in incorrect.iterrows():
333 | text = ic["text"]
334 | c = ic["class"]
335 | probs = bigram_pipeline.predict_proba([text])[0]
336 | if max(probs) > 0.5:
337 | guessed = bigram_pipeline.predict([text])
338 | print text
339 | print "Actual: %s, Guessed: %s" % (c, guessed)
340 |
341 | import pdb;pdb.set_trace()
342 | asdf = bigram_pipeline.predict_proba(incorrect["text"].values)
343 | qwer = bigram_pipeline.predict_proba(correct["text"].values)
344 | zxcv = [float("%.3f" % max(arr)) for arr in qwer]
345 |
346 |
347 | print
348 | print bigram_score
349 | print labels
350 | print bigram_cmat
351 |
352 |
353 | #test_data = test_data[test_data["lengths"] > 1000]
354 |
355 | #train_data = train_data[train_data["lengths"] > 100]
356 |
357 | quit()
358 |
359 | bins = [10 * (i) for i in range(50)]
360 | scores = []
361 | for b in bins:
362 | td = test_data[(test_data["lengths"] > b) & (test_data["lengths"] < (b+10)) ]
363 | actual = td['class'].values
364 | predictions = pipeline.predict(td['text'].values)
365 |
366 | score = accuracy_score(actual, predictions)
367 | scores.append(score)
368 | #cmat = confusion_matrix(actual, predictions, labels)
369 | print "Word Count: %s, Doc Count: %s, Score: %s" % (str(b), str(len(td)), '%.5f' % score)
370 |
371 | print bins
372 | print scores
373 |
374 |
375 | '''
376 |
377 | '''
378 | import matplotlib.pyplot as plt
379 |
380 | incorrect_indicies = []
381 | correct_indicies = []
382 | for index, (act, pred) in enumerate(zip(actual, predictions)):
383 | if act != pred:
384 | incorrect_indicies.append(index)
385 | else:
386 | correct_indicies.append(index)
387 |
388 | incorrect = test_data.iloc[incorrect_indicies]
389 | correct = test_data.iloc[correct_indicies]
390 |
391 | print len(correct)
392 | print len(incorrect)
393 |
394 | #[len(correct[correct.lengths < bins[i+1] & correct.lengths > bins[i]]) for i in range(len(bins)-1)]
395 | bins = [50 * (i) for i in range(100)]
396 | correct_counts = [len(correct[correct.lengths < i]) for i in bins]
397 | incorrect_counts = [len(incorrect[incorrect.lengths < i]) for i in bins]
398 |
399 | print "Correct"
400 | ccounts = [0 if index == 0 else icc - correct_counts[index-1] for index, icc in enumerate(correct_counts)]
401 | print "Incorrect"
402 | iccounts = [0 if index == 0 else icc - incorrect_counts[index-1] for index, icc in enumerate(incorrect_counts)]
403 | plt.figure()
404 | plt.subplot()
405 | plt.plot(bins, ccounts)
406 | plt.subplot()
407 | plt.plot(bins, iccounts)
408 | plt.show()
409 |
410 | ["{0:.0f}%".format(c / float(sum(ccounts))) for c in ccounts]
411 |
412 | import pdb;pdb.set_trace()
413 |
414 |
415 | fig = plt.figure()
416 | incorrect["lengths"].plot.hist(bins=50)
417 | fig.suptitle('Incorrect text lengths', fontsize=14, fontweight='bold')
418 | plt.show()
419 |
420 | fig = plt.figure()
421 | correct["lengths"].plot.hist(bins=50)
422 | fig.suptitle('Incorrect text lengths', fontsize=14, fontweight='bold')
423 | plt.show()
424 |
425 | confusion = numpy.array([[0 for i in range(len(labels))] for y in range(len(labels))])
426 |
427 | k_fold = KFold(n=len(data), n_folds=6)
428 | scores = []
429 |
430 | for train_indices, test_indices in k_fold:
431 | train_text = data.iloc[train_indices]['text'].values
432 | train_y = data.iloc[train_indices]['class'].values
433 |
434 | test_text = data.iloc[test_indices]['text'].values
435 | test_y = data.iloc[test_indices]['class'].values
436 |
437 | pipeline.fit(train_text, train_y)
438 | predictions = pipeline.predict(test_text)
439 |
440 | confusion += confusion_matrix(test_y, predictions)
441 | score = accuracy_score(test_y, predictions)
442 | scores.append(score)
443 | print numpy.average(scores)
444 | print labels
445 | print confusion
446 |
447 | '''
448 |
449 | if __name__ == "__main__":
450 |
451 | #evaluate_standard(num_classes=26)
452 | #evaluate_classifier_type(num_classes=26)
453 |     #evaluate_n_grams(num_classes=3)
454 | #evaluate_tfidf(num_classes=5)
455 | #evaluate_lengths(num_classes=10)
456 |
457 | #vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
458 | #classifier = MultinomialNB(fit_prior=False)
459 |
460 | #pipeline = Pipeline([
461 | # ('count_vectorizer', vectorizer),
462 | # ('classifier' , classifier)
463 | #])
464 | #pickle_pipeline(pipeline, num_classes=26)
465 | use_pickled_pipeline()
466 | pass
467 |
468 |
--------------------------------------------------------------------------------
/naive_bayes/naive_bayes.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | from collections import Counter
4 | import string
5 | from nltk.corpus import stopwords
6 | import nltk
7 |
8 | STOP_WORDS = set(stopwords.words('english'))
9 | STOP_WORDS.add('')
10 |
11 | def print_confusion_matrix(matrix, class_labels):
12 | lines = ["" for i in range(len(class_labels)+1)]
13 | for index, c in enumerate(class_labels):
14 | lines[0] += "\t"
15 | lines[0] += c
16 | lines[index+1] += c
17 | for index, result in enumerate(matrix):
18 | for amount in result:
19 | lines[index+1] += "\t"
20 | lines[index+1] += str(amount)
21 | for line in lines:
22 | print line
23 |
24 | def initialize_confusion_matrix(num_labels):
25 | return [[0 for i in range(num_labels)] for y in range(num_labels)]
26 |
27 | def read_reviews(filename):
28 | reviews = []
29 | with open(filename, 'r') as f:
30 | for line in f:
31 | reviews.append(json.loads(line))
32 | return reviews
33 |
34 | def review_texts_from_reviews(reviews):
35 | return [review["reviewText"] for review in reviews]
36 |
37 | def get_review_texts(filename):
38 | reviews = read_reviews(filename)
39 | return review_texts_from_reviews(reviews)
40 |
41 | def clean_review(review):
42 | exclude = set(string.punctuation)
43 | review = ''.join(ch for ch in review if ch not in exclude)
44 | split_sentence = review.lower().split(" ")
45 | clean = [word for word in split_sentence if word not in STOP_WORDS]
46 | return clean
47 |
48 | def counters_from_file(filename):
49 | reviews = read_reviews(filename)
50 | texts = [review["reviewText"] for review in reviews]
51 | tokens = [clean_review(review_text) for review_text in texts]
52 | flattened_tokens = [val for sublist in tokens for val in sublist]
53 | counter = Counter(flattened_tokens)
54 | return counter
55 |
56 | def line_count_from_file(filename):
57 | return sum(1 for line in open(filename))
58 |
59 | def naive_bayes(class_labels, use_nltk=False):
60 |     if use_nltk:
61 |         confusion_matrix = naive_bayes_nltk(class_labels)
62 |     else:
63 |         confusion_matrix = naive_bayes_self(class_labels)
64 |     return confusion_matrix
65 |
66 | def conditional_prob(word, counters, total_vocab_count):
67 | word_count = counters[word]
68 | class_total_word_count = sum(counters.values())
69 | cond_prob = float((word_count + 1)) / (class_total_word_count + total_vocab_count)
70 | return cond_prob
71 |
72 | def naive_bayes_self(class_labels):
73 | counters = []
74 | doc_counts = []
75 | for label in class_labels:
76 | filename = "train_%s.json" % label
77 | doc_counts.append(line_count_from_file(filename))
78 | counter = counters_from_file(filename)
79 | counters.append(counter)
80 |
81 | combined_bag = Counter()
82 | for counter in counters:
83 | combined_bag += counter
84 | combined_vocab_count = len(combined_bag.keys())
85 |
86 | probabilities = [float(doc_count) / sum(doc_counts) for doc_count in doc_counts]
87 | correct = 0
88 | incorrect = 0
89 |     confusion_matrix = initialize_confusion_matrix(len(class_labels))
90 |
91 | for index, class_name in enumerate(class_labels):
92 | filename = "test_%s.json" % class_name
93 | texts = get_review_texts(filename)
94 | for text in texts:
95 | tokens = clean_review(text)
96 | scores = []
97 | for cindex, bag in enumerate(counters): #for each class
98 |                 score = math.log(probabilities[cindex])  #log of the class prior
99 |                 for word in tokens:
100 |                     #for each word, we need the probability of that word given the class / bag
101 | cond_prob = conditional_prob(word, bag, combined_vocab_count)
102 | score += math.log(cond_prob)
103 | scores.append(score)
104 | max_index, max_value = max(enumerate(scores), key=lambda p: p[1])
105 | confusion_matrix[index][max_index] += 1
106 |
107 | if index == max_index:
108 | correct += 1
109 | else:
110 | incorrect += 1
111 |
112 | print (correct / float(correct + incorrect))
113 | return confusion_matrix
114 |
115 | def naive_bayes_nltk(class_labels):
116 | #note, training set needs to be in form of
117 | #train_set = [
118 | #({'I': 3, 'like': 1, 'this': 1, 'product': 2}, 'class_name_1')
119 | #({'This': 2, 'is': 1, 'really': 1, 'great': 2}, 'class_name_1')
120 | #...
121 | #({'Big': 1, 'fan': 1, 'of': 1, 'this': 1}, 'class_name_X')
122 | #]
123 | train_set = []
124 | for class_name in class_labels:
125 | filename = "train_%s.json" % class_name
126 | texts = get_review_texts(filename)
127 | for text in texts:
128 | tokens = clean_review(text)
129 | counter = Counter(tokens)
130 | train_set.append((dict(counter), class_name))
131 |
132 | classifier = nltk.NaiveBayesClassifier.train(train_set)
133 |
134 | correct = 0
135 | incorrect = 0
136 |     confusion_matrix = initialize_confusion_matrix(len(class_labels))
137 |
138 | for index, class_name in enumerate(class_labels):
139 | filename = "test_%s.json" % class_name
140 | reviews = read_reviews(filename)
141 | texts = [review["reviewText"] for review in reviews]
142 | for text in texts:
143 | tokens = clean_review(text)
144 | counter = dict(Counter(tokens))
145 | guess = classifier.classify(counter)
146 | lindex = class_labels.index(guess)
147 | confusion_matrix[index][lindex] += 1
148 |
149 | if guess == class_name:
150 | correct += 1
151 | else:
152 | incorrect += 1
153 |
154 | print (correct / float(correct + incorrect))
155 | classifier.show_most_informative_features()
156 | return confusion_matrix
157 |
158 | if __name__ == "__main__":
159 |
160 | class_labels = ['baby', 'tool']
161 |
162 |     confusion_matrix = naive_bayes(class_labels)  #or naive_bayes(class_labels, use_nltk=True)
163 | print_confusion_matrix(confusion_matrix, class_labels)
164 |
--------------------------------------------------------------------------------
/naive_bayes/sample.py:
--------------------------------------------------------------------------------
1 | import json
2 | import linecache
3 | import random
4 |
5 | NUM_TOTAL_SAMPLES = 10000
6 | NUM_TRAIN_SAMPLES = 8000
7 | NUM_TEST_SAMPLES = 2000
8 |
9 | apps_count = 752937
10 | apps_filename = "reviews_Apps_for_Android_5.json"
11 |
12 | automotive_count = 20473
13 | automotive_filename = "reviews_Automotive_5.json"
14 |
15 | baby_count = 160792
16 | baby_filename = "reviews_Baby_5.json"
17 |
18 | books_count = 8898041
19 | books_filename = "reviews_Books_5.json"
20 |
21 | cell_phones_count = 194439
22 | cell_phones_filename = "reviews_Cell_Phones_and_Accessories_5.json"
23 |
24 | tool_count = 134476
25 | tool_filename = "reviews_Tools_and_Home_Improvement_5.json"
26 |
27 | food_count = 151254
28 | food_filename = "reviews_Grocery_and_Gourmet_Food_5.json"
29 |
30 | pet_count = 157836
31 | pet_filename = "reviews_Pet_Supplies_5.json"
32 |
33 | home_count = 551682
34 | home_filename = "reviews_Home_and_Kitchen_5.json"
35 |
36 | automotive_count = 20473
37 | automotive = "reviews_Automotive_5.json"
38 |
39 | instant_video_count = 37126
40 | instant_video_filename = "reviews_Amazon_Instant_Video_5.json"
41 |
42 | beauty_count = 198502
43 | beauty_filename = "reviews_Beauty_5.json"
44 |
45 | cds_vinyl_count = 1097592
46 | cds_vinyl_filename = "reviews_CDs_and_Vinyl_5.json"
47 |
48 | health_count = 346355
49 | health_filename = "reviews_Health_and_Personal_Care_5.json"
50 |
51 | clothes_count = 278677
52 | clothes_filename = "reviews_Clothing_Shoes_and_Jewelry_5.json"
53 |
54 | digital_music_count = 64706
55 | digital_music_filename = "reviews_Digital_Music_5.json"
56 |
57 | electronics_count = 1689188
58 | electronics_filename = "reviews_Electronics_5.json"
59 |
60 | kindle_count = 982619
61 | kindle_filename = "reviews_Kindle_Store_5.json"
62 |
63 | movies_tv_count = 1697533
64 | movies_tv_filename = "reviews_Movies_and_TV_5.json"
65 |
66 | instruments_count = 10261
67 | instruments_filename = "reviews_Musical_Instruments_5.json"
68 |
69 | office_count = 53258
70 | office_filename = "reviews_Office_Products_5.json"
71 |
72 | patio_count = 13272
73 | patio_filename = "reviews_Patio_Lawn_and_Garden_5.json"
74 |
75 | sports_count = 296337
76 | sports_filename = "reviews_Sports_and_Outdoors_5.json"
77 |
78 | toys_count = 167597
79 | toys_filename = "reviews_Toys_and_Games_5.json"
80 |
81 | video_games_count = 231780
82 | video_games_filename = "reviews_Video_Games_5.json"
83 |
84 | infos = []
85 | infos.append({"class_name": "apps", "count": apps_count, "filename": apps_filename})
86 | '''
87 | infos.append({"class_name": "baby", "count": baby_count, "filename": baby_filename})
88 | infos.append({"class_name": "tool", "count": tool_count, "filename": tool_filename})
89 | infos.append({"class_name": "food", "count": food_count, "filename": food_filename})
90 | infos.append({"class_name": "pet", "count": pet_count, "filename": pet_filename})
91 | infos.append({"class_name": "home", "count": home_count, "filename": home_filename})
92 | infos.append({"class_name": "automotive", "count": automotive_count, "filename": automotive_filename})
93 | infos.append({"class_name": "instant_video", "count": instant_video_count, "filename": instant_video_filename})
94 | infos.append({"class_name": "beauty", "count": beauty_count, "filename": beauty_filename})
95 | infos.append({"class_name": "cds_vinyl", "count": cds_vinyl_count, "filename": cds_vinyl_filename})
96 | infos.append({"class_name": "clothes", "count": clothes_count, "filename": clothes_filename})
97 | infos.append({"class_name": "digital_music", "count": digital_music_count, "filename": digital_music_filename})
98 | infos.append({"class_name": "cell_phones", "count": cell_phones_count, "filename": cell_phones_filename})
99 | infos.append({"class_name": "electronics", "count": electronics_count, "filename": electronics_filename})
100 | infos.append({"class_name": "kindle", "count": kindle_count, "filename": kindle_filename})
101 | infos.append({"class_name": "movies_tv", "count": movies_tv_count, "filename": movies_tv_filename})
102 | infos.append({"class_name": "instruments", "count": instruments_count, "filename": instruments_filename})
103 | infos.append({"class_name": "office", "count": office_count, "filename": office_filename})
104 | infos.append({"class_name": "patio", "count": patio_count, "filename": patio_filename})
105 | infos.append({"class_name": "health", "count": health_count, "filename": health_filename})
106 | infos.append({"class_name": "sports", "count": sports_count, "filename": sports_filename})
107 | infos.append({"class_name": "toys", "count": toys_count, "filename": toys_filename})
108 | infos.append({"class_name": "video_games", "count": video_games_count, "filename": video_games_filename})
109 | infos.append({"class_name": "books", "count": books_count, "filename": books_filename})
110 | '''
111 |
112 | for info in infos:
113 | filename = "data/%s" % info["filename"]
114 | count = info["count"]
115 | class_name = info["class_name"]
116 | train_filename = "train_%s.json" % class_name
117 | test_filename = "test_%s.json" % class_name
118 |
119 | print class_name
120 |
121 | all_lines = random.sample(range(1,count), NUM_TOTAL_SAMPLES)
122 | train_lines = set(all_lines[0:NUM_TRAIN_SAMPLES])
123 | test_lines = set(all_lines[NUM_TRAIN_SAMPLES:])
124 |
125 | train_reviews = [eval(linecache.getline(filename, i)) for i in train_lines]
126 | test_reviews = [eval(linecache.getline(filename, i)) for i in test_lines]
127 |
128 | with open(train_filename, 'w') as f:
129 | for review in train_reviews:
130 | f.write(json.dumps(review))
131 | f.write('\n')
132 |
133 | with open(test_filename, 'w') as f:
134 | for review in test_reviews:
135 | f.write(json.dumps(review))
136 | f.write('\n')
137 |
138 |
--------------------------------------------------------------------------------
/scraping/data_scrapy.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | import scrapy
4 | from scrapy.selector import Selector
5 |
6 | class DataSpider(scrapy.Spider):
7 | name = "data"
8 | start_urls = [
9 | 'https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/'
10 | ]
11 |
12 | desired_tags = (u'p', u'h1', u'h3', u'pre')
13 | text = {}
14 |
15 | def words_from_tags(self, tag, response):
16 | total = []
17 | div = response.xpath("//div[contains(@class, 'entry-content')]")
18 | for para in div.xpath(".//%s" % tag):
19 | combined = []
20 | for words in para.xpath('./descendant-or-self::*/text()'):
21 | combined.append(words.extract())
22 | total.append(' '.join(combined))
23 | return total
24 |
25 | def parse(self, response):
26 | selector = Selector(response=response)
27 | for tag in self.desired_tags:
28 | self.text[tag] = self.words_from_tags(tag, response)
29 | helpers.write_data('scrapy', self.text)
30 | yield self.text #how scrapy returns the json object you created
31 |
--------------------------------------------------------------------------------
/scraping/helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | def write_html(lib_name, page):
4 | if not os.path.exists("pages"):
5 | os.makedirs("pages")
6 | file_name = "pages/" + lib_name + "_page.html"
7 | with open(file_name, 'w') as f:
8 | f.write(page)
9 |
10 | def write_data(lib, text_array):
11 | if not os.path.exists("texts"):
12 | os.makedirs("texts")
13 | lib_dir = "texts/" + lib
14 | if not os.path.exists(lib_dir):
15 | os.makedirs(lib_dir)
16 | for key, values in text_array.iteritems():
17 | filename = lib_dir + '/' + key + '.txt'
18 | with open(filename, 'w') as f:
19 | for value in values:
20 | if value is not None:
21 | f.write(value.encode('UTF-8') + '\n')
22 |
--------------------------------------------------------------------------------
/scraping/page.html:
--------------------------------------------------------------------------------
14 | General Tips for Web Scraping with Python | Big-Ish Data
15 |
16 |
17 |
20 |
21 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
55 |
59 |
73 |
74 |
75 |
76 |
79 |
80 |
85 |
86 |
87 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
198 |
199 |
200 |
The great majority of the projects about machine learning or data analysis I write about here on Bigish-Data have an initial step of scraping data from websites. I also get a bunch of contact emails asking me either for the data I've scraped myself or for help getting the code to work for themselves. Because of that, I figured I should write something here about the process of web scraping!
201 |
There are plenty of other things to talk about when scraping, such as specifics on how to grab the data from a particular site, which Python libraries to use and how to use them, how to write code that scrapes the data as a daily job, where exactly to look to figure out how to get the data from random sites, etc. But since there are tons of other specific tutorials online, I'm going to talk about overall thoughts on how to scrape. There are three parts of this post – how to grab the data, how to save the data, and how to be nice.
202 |
As is the case with everything, programming-wise, if you’re looking to learn scraping, you can’t just read tutorials and think to yourself that you know how to program. Pick a project, practice grabbing the data, and then write a blog post about what you learned .
203 |
There definitely are tons of different thoughts on scraping, but these are the ones that I've learned from doing it for a while. If you have questions, comments, or want to call me out, feel free to comment or get in contact!
204 |
Grabbing the Data
205 |
The first step for scraping data from websites is to figure out where the sites keep their data, and what method they use to display the data in the browser. For this part of your project, I'll suggest writing it in a file named gather.py, which should perform all these tasks.
206 |
207 |
That being said, there are a few things you'll need to look for to see how to most easily get the data.
208 |
Check if the site has an API First
209 |
A ton of sites with interesting data have APIs for programmers to grab the data and write posts about the interesting-ness of the site. Genius does this very nicely, except for the song lyrics of course.
210 |
And also if the site has an API, that means that they’re totally alright with programmers using their data, though pretty much every site doesn’t allow you to use its data to make money. Read their requirements and rules for using the site’s data, and if your project is allowed, API is the way to go.
211 |
Figure out the URLs of all the data
212 |
If there is no API, that means you’re going to have to figure out the urls where the site displays all the data you need.
213 |
A common pattern you'll see is that the data is displayed at URLs that use IDs for the objects. If you've done web development in something like Rails, you'll know exactly how that works. In this case, there is probably an index page that has links to all the different pages you're trying to scrape, so you'll have two scraping steps. And like I've said, each site is different, but just know that these are possible requirements to get all the data you want.
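A minimal sketch of how that can look, assuming a hypothetical site with an index page of links and sequential object IDs (the URL and the class name below are made up):

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://example.com"  #hypothetical site

def item_urls_from_index(index_url):
    #grab the index page and pull out the link to each object's page
    page = requests.get(index_url).text
    html = BeautifulSoup(page, 'html.parser')
    links = html.find_all('a', class_='item-link')  #made-up class name
    return [BASE_URL + link['href'] for link in links]

def item_urls_from_ids(id_range):
    #if the site just uses sequential IDs, the URLs can be built directly
    return ["%s/items/%d" % (BASE_URL, item_id) for item_id in id_range]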
214 |
Check JSON loading of data
215 |
If the site doesn’t have an API and you’re still going to want the data, check to see if the page that shows the data you’re looking for is by using JSON. If the page loads and it takes a second or a flash for the text to show up, it’s probably using that JSON.
216 |
From there, right click the web page and click “Inspect” on Chrome to get the Developer Tools window to open, reload the web page, and check the Sources tab to see pages that end in .json. You’ll be able to see the URL it came from, then open a new tab and paste that URL and you’ll be able to see the JSON with your data!
217 |
Here's a quick example of how stats.nba.com generates its pages. If you just look at the HTML returned, you'll see that it's only AngularJS, meaning you can't use the HTML to scrape the data. You'll need to find the JS URL that loads that data.
218 |
219 |
Looking at the Network tab, I find a specific file being requested over the network.
220 |
Then, by reloading the page and checking the files under the Network tab, I find the url that generates the data for the page. As you can see, it’s just a JS variable that has all the data for the players.
221 |
222 |
223 |
I won’t list the URL specifically here, but there are ways to change it to grab the data that you’re looking for.
224 |
Fall back to HTML scraping
225 |
If the site you’re looking for data from doesn’t have an API or use JSON to load the data, you’re going to fall back to grabbing the HTML pages. Which is the only technique that people think of when imagining web scraping!
226 |
Like the JSON data, you’re going to have to use the Inspect feature of Chrome’s development tools, but in this case right click on the text that you’re trying to grab and analyze the classes and ids in order to grab that data.
227 |
For example, if you’re looking to scrape a WordPress blog to do something like sentiment analysis of the posts, you’ll want to do something like this:
228 |
229 |
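A rough sketch of that, along the lines of the entry-content handling in data_scrapy.py in this repo:

import requests
from bs4 import BeautifulSoup

url = "https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/"
page = requests.get(url).text
html = BeautifulSoup(page, 'html.parser')

#WordPress keeps the body of a post in a div with class "entry-content"
post = html.find('div', class_='entry-content')
paragraphs = [p.get_text() for p in post.find_all('p')]
text = '\n'.join(paragraphs)
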
As for other sites, I won't go into exactly how that's done because the classes and ids vary, but odds are the page will be structured similarly, with specific classes and ids for the data part of the page. Practice HTML scraping on a couple of sites and you'll see how that part works.
230 |
Saving the Data
231 |
After you have the data saved using gather.py, you'll need to write code that pulls the information you want out of it. In case you didn't guess, a good name for that file is scrape.py.
232 |
With this file, you'll want to write the code that takes what you saved, grabs the data out of it, and structures it. How you save that data also depends on the type of scraping job you're writing.
233 |
CSV is probably a fine type of database initially
234 |
There are two different types of scraping projects. The first is grabbing data that is consistent and doesn't change much over time, like the PGA Tour stats I've scraped, which change week by week for the current year but obviously don't change when grabbing stats for every year in the past. Another example of this is getting lyrics from Genius. Lyrics don't change, and if you're looking for other information about the songs, that doesn't change much over time either.
235 |
If you’re getting this kind of data, don’t worry about setting up a DB to save the data . All types of this have a limited amount and frankly, test files are also quick to analyze the data.
236 |
Database is useful if you have data that keeps coming
237 |
On the other hand, if the data you’re looking to scrape is updated continuously, you’re probably going to want a DB to store the data, especially if you have a service (Heroku, Amazon, etc.) that runs your scraping code at certain times.
238 |
Another use for the DB is if you’re looking to scrape the data and then make a website that displays the data. Something like a script that checks Reddit comments to see how many Amazon products are mentioned and then displays them online.
239 |
And obviously the benefit of storing the data in a database rather than local files is that querying and comparing the data you scraped is much easier than having to load all your files into variables and then analyze them. Like everything I've mentioned here, the right method depends on the site, the data, and the information you're trying to gather.
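A minimal sketch of that with sqlite3 from the standard library (the table and columns are made up):

import sqlite3

conn = sqlite3.connect('stats.db')
conn.execute("CREATE TABLE IF NOT EXISTS stats (player TEXT, year INTEGER, avg_distance REAL)")
conn.execute("INSERT INTO stats VALUES (?, ?, ?)", ('Example Player', 2016, 290.4))
conn.commit()

#comparing scraped data becomes a single query instead of loading files into variables
rows = conn.execute("SELECT player, avg_distance FROM stats WHERE year = 2016").fetchall()
conn.close()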
240 |
Be Nice
241 |
Scrape with a request header that has your name and email so the company knows it’s you.
242 |
Some sites will get mad at you if you’re scraping their data, and even when the sites aren’t “nice”, you don’t want to do something “illegal”. An example of a way to identify yourself:
243 |
import requests
244 |
headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com',
           'from': 'contact@bigishdata.com'}
html = requests.get(url, headers=headers)
248 |
You can look up other options for the request headers, but make sure they’re consistent and that people who look at their server logs know who you are, just in case they want to get in contact.
249 |
Make sure you don’t keep hitting the servers!!!!
250 |
When you’re writing and running gather.py, make sure you’re testing it in a way that doesn’t continuously hit the servers to gather the data! That goes for both the JSON and the HTML cases. As for APIs, you’ll also want to make sure you’re not hitting the endpoints time and time again, especially since they track who’s calling them and most only allow a certain number of requests per time period.
251 |
Then when you’re running scrape.py, don’t hit their servers at all. That script should only deal with the data you already saved from their site.
252 |
Basically, the only time you should continuously hit their servers is when you’re running your final code that gets and saves the data files from the site.
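One simple way to keep yourself honest is to save every page to a file the first time you request it and check for that file before hitting their servers again, something like this sketch (get_page is just a made-up helper name):

import os
import requests

headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com'}

def get_page(url, filename):
    #only hit their servers if we haven't saved this page yet
    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            return f.read()
    page = requests.get(url, headers=headers)
    html = page.text.encode('UTF-8')
    with open(filename, 'w') as f:
        f.write(html)
    return html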
253 |
Gevent
254 |
Now if you need to scrape data from a bunch of different web pages, Gevent is the Python library to use for running request jobs concurrently, so you’ll be able to hit the API, grab the JSON, or grab the HTML pages more quickly. For the most part, the slowest part of the code is hitting their servers and then waiting for the file to be returned.
255 |
import gevent
from gevent import monkey
monkey.patch_all()
... #set the urls that you'll get the data from
jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
gevent.joinall(jobs)
261 |
Again, as long as you’re not going so fast that you hammer their servers by asking for thousands of pages at once, feel free to use Gevent, especially since most sites are handling far more than 50 requests at a time anyway.
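If you want to be extra careful, gevent’s Pool lets you cap how many requests are in flight at once; gather_pages and url_filenames here are the same as in the snippet above:

import gevent
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

pool = Pool(10) #at most 10 requests running at a time
jobs = [pool.spawn(gather_pages, url, filename) for url, filename in url_filenames]
gevent.joinall(jobs)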
262 |
Practice, practice, practice
263 |
With all that said, and as is the case with everything, if you want to web scrape, you gotta practice. Reading tutorial after tutorial is interesting and does teach you things, but if you want to learn, write the code yourself and search for the tutorials that help solve your bugs.
264 |
And remember, be nice when grabbing the data.
265 |
--------------------------------------------------------------------------------
/scraping/requesting_html.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | url = "https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/"
4 | params = {} #used for ? values
5 | headers = {'user-agent' : 'Jack Schultz, bigishdata.com, contact@bigishdata.com'}
6 |
7 |
8 | import urllib2
9 | import urllib
10 | data = urllib.urlencode(params) if params else None #data must be None to keep this a GET request
11 | req = urllib2.Request(url, data, headers)
12 | fstring = urllib2.urlopen(req).read()
13 | helpers.write_html('urllib2', fstring)
14 |
15 |
16 | import requests
17 | page = requests.get(url, headers=headers)
18 | fstring = page.text
19 | helpers.write_html('requests', fstring.encode('UTF-8'))
20 |
21 |
22 | import httplib
23 | #note that the url here is split into the base and the path
24 | conn = httplib.HTTPConnection("bigishdata.com")
25 | conn.request("GET", "/2017/05/11/general-tips-for-web-scraping-with-python/")
26 | response = conn.getresponse()
27 | helpers.write_html('httplib', response.read())
28 | conn.close()
29 |
--------------------------------------------------------------------------------
/scraping/requirements.txt:
--------------------------------------------------------------------------------
1 | PyDispatcher==2.0.5
2 | Scrapy==1.4.0
3 | Twisted==17.1.0
4 | beautifulsoup4==4.6.0
5 | certifi==2017.4.17
6 | chardet==3.0.3
7 | cssselect==1.0.1
8 | idna==2.5
9 | lxml==3.7.3
10 | parsel==1.2.0
11 | pyOpenSSL==17.0.0
12 | queuelib==1.4.2
13 | requests==2.17.3
14 | service-identity==17.0.0
15 | six==1.10.0
16 | urllib3==1.21.1
17 | virtualenv==15.1.0
18 | w3lib==1.17.0
19 | wsgiref==0.1.2
20 |
--------------------------------------------------------------------------------
/scraping/scraping_html.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | with open('page.html', 'r') as f:
4 | page_string = f.read()
5 |
6 |
7 | ##
8 | ## BeautifulSoup
9 | ##
10 | from bs4 import BeautifulSoup as bs
11 | soup = bs(page_string, "html.parser")
12 | article = soup.find('div', {'class' : 'entry-content'})
13 |
14 | text = {}
15 | text['p'] = []
16 | text['h1'] = []
17 | text['h3'] = []
18 | text['pre'] = []
19 | text['imgsrc'] = []
20 | for tag in article.contents:
21 | #multiple if statements here to make it easier to read
22 | if tag is not None and tag.name is not None:
23 | if tag.name == "p":
24 | text['p'].append(tag.text)
25 | elif tag.name == 'h1':
26 | text['h1'].append(tag.text)
27 | elif tag.name == 'h3':
28 | text['h3'].append(tag.text)
29 | elif tag.name == 'pre':
30 | text['pre'].append(tag.text)
31 | for tag in article.findAll('img'):
32 | text['imgsrc'].append(tag['src'])
33 | helpers.write_data('bs', text)
34 |
35 | ##
36 | ## LXML
37 | ##
38 | import lxml.html
39 | page = lxml.html.fromstring(page_string)
40 | post = page.find_class('entry-content')[0] #0 since only one tag with that class
41 |
42 | text = {}
43 | text['p'] = []
44 | text['h1'] = []
45 | text['h3'] = []
46 | text['pre'] = []
47 | text['imgsrc'] = []
48 | #text_content is needed to get all of the text within the tag, not just the text on the top level
49 | for tag in post.findall('p'):
50 | text['p'].append(tag.text_content())
51 | for img in tag.findall('img'): #images in paragraphs, so need to check here
52 | text['imgsrc'].append(img.attrib['src'])
53 | for tag in post.findall('h1'):
54 | text['h1'].append(tag.text_content())
55 | for tag in post.findall('h3'):
56 | text['h3'].append(tag.text_content())
57 | for tag in post.findall('pre'):
58 | text['pre'].append(tag.text_content())
59 | helpers.write_data('lxml', text)
60 |
61 |
62 |
63 | ##
64 | ## HTMLParser
65 | ##
66 | from HTMLParser import HTMLParser
67 | import urllib
68 |
69 | desired_tags = (u'p', u'h1', u'h3', u'pre', u'img')
70 | class BigIshDataParser(HTMLParser):
71 | def __init__(self):
72 | HTMLParser.__init__(self)
73 | self.inside_entry_content = 0
74 | self.current_tag = None
75 | self.current_text = []
76 | self.overall_text = {}
77 | self.overall_text['p'] = []
78 | self.overall_text['h1'] = []
79 | self.overall_text['h3'] = []
80 | self.overall_text['pre'] = []
81 | self.overall_text['img'] = []
82 |
83 | def handle_starttag(self, tag, attributes):
84 | if self.inside_entry_content and tag in desired_tags:
85 | self.current_tag = tag
86 | if tag == 'div':
87 | for name, value in attributes:
88 | if name == 'class' and value == 'entry-content': #if this is correct div
89 | self.inside_entry_content += 1
90 | return #don't keep going through the attributes since there could be a ton of them
91 | if tag == 'img' and self.inside_entry_content: #need to deal with images here since they're only a start tag
92 | for attr in attributes:
93 | if attr[0] == 'src':
94 | self.overall_text['img'].append(attr[1])
95 | break
96 |
97 | def handle_endtag(self, tag):
98 | if tag == 'div' and self.inside_entry_content:
99 | self.inside_entry_content -= 1 #moving on down the divs
100 | if tag == self.current_tag:
101 | tstring = ''.join(self.current_text)
102 | self.overall_text[self.current_tag].append(tstring)
103 | self.current_text = []
104 | self.current_tag = None
105 |
106 | def handle_data(self, data):
107 | if self.inside_entry_content:
108 | self.current_text.append(data)
109 |
110 | p = BigIshDataParser()
111 | page_string = p.unescape(page_string.decode('UTF-8'))
112 | p.feed(page_string)
113 | helpers.write_data('htmlparser', p.overall_text)
114 | p.close()
115 |
116 |
117 |
--------------------------------------------------------------------------------
/scraping/selenium_test.py:
--------------------------------------------------------------------------------
1 | import helpers
2 |
3 | from selenium import webdriver
4 | from selenium.webdriver.common.keys import Keys
5 |
6 | url = 'https://bigishdata.com/2017/05/11/general-tips-for-web-scraping-with-python/'
7 |
8 | driver = webdriver.PhantomJS()
9 | driver.get(url)
10 | elem = driver.find_element_by_class_name('entry-content')
11 |
12 | text = {}
13 | desired_tags = (u'p', u'h1', u'h3', u'pre')
14 | for tag in desired_tags:
15 | tags = elem.find_elements_by_tag_name(tag)
16 | text[tag] = []
17 | for data in tags:
18 | text[tag].append(data.text)
19 |
20 | helpers.write_data('selenium', text)
21 |
--------------------------------------------------------------------------------
/sklearn_classify/classify.py:
--------------------------------------------------------------------------------
1 | import json
2 | from sklearn.feature_extraction.text import CountVectorizer
3 | from sklearn.feature_extraction.text import TfidfTransformer
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from pandas import DataFrame
6 | import numpy
7 | from sklearn.naive_bayes import MultinomialNB
8 | from sklearn.pipeline import Pipeline
9 | from sklearn.metrics import confusion_matrix, accuracy_score
10 |
11 |
12 | def print_confusion_matrix(matrix, class_labels):
13 | lines = ["" for i in range(len(class_labels)+1)]
14 | for index, c in enumerate(class_labels):
15 | lines[0] += "\t"
16 | lines[0] += c
17 | lines[index+1] += c
18 | for index, result in enumerate(matrix):
19 | for amount in result:
20 | lines[index+1] += "\t"
21 | lines[index+1] += str(amount)
22 | for line in lines:
23 | print line
24 |
25 | def initialize_confusion_matrix(num_labels):
26 | return [[0 for i in range(num_labels)] for y in range(num_labels)]
27 |
28 |
29 | '''
30 | counts = count_vectorizer.fit_transform(data['text'].values)
31 | bigram_counts = bigram_vectorizer.fit_transform(data['text'].values)
32 | tfidf_counts = tfidf_vectorizer.fit_transform(data['text'].values)
33 | '''
34 |
35 | labels = ["baby", "tool", "home", "pet", "food"]
36 |
37 | count_vectorizer = CountVectorizer(min_df=1)
38 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
39 | tfidf_vectorizer = TfidfVectorizer(min_df=1)
40 |
41 | classifier = MultinomialNB()
42 |
43 | pipeline = Pipeline([
44 | ('count_vectorizer', bigram_vectorizer),
45 | ('classifier', classifier)
46 | ])
47 |
48 | reviews = []
49 | for label in labels:
50 | filename = "train_%s.json" % label
51 | with open(filename, 'r') as f:
52 | for line in f:
53 | reviews.append({'text': json.loads(line)["reviewText"], 'class': label})
54 |
55 | data = DataFrame(reviews)
56 | data = data.reindex(numpy.random.permutation(data.index))
57 |
58 | pipeline.fit(data['text'].values, data['class'].values)
59 |
60 | test_reviews = []
61 | for index, label in enumerate(labels):
62 | filename = "test_%s.json" % label
63 | with open(filename, 'r') as f:
64 | for line in f:
65 | test_reviews.append({'text': json.loads(line)["reviewText"], 'class': label})
66 |
67 | test_examples = [review['text'] for review in test_reviews]
68 | test_labels = [review['class'] for review in test_reviews]
69 |
70 | #print pipeline.score(test_examples)
71 | guesses = pipeline.predict(test_examples)
72 |
73 | print accuracy_score(test_labels, guesses)
74 | print confusion_matrix(test_labels, guesses, labels=labels)
75 |
76 |
--------------------------------------------------------------------------------
/tourstats/analyze.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 | from sklearn import linear_model
4 | import os
5 | from bokeh.plotting import figure, output_file, show, vplot
6 | from collections import Iterable, Sequence
7 |
8 | stat = 'Driving Distance'
9 | folder_path = 'stats_csv/%s' % (stat)
10 |
11 | key = 'AVG.'
12 |
13 | years = []
14 | yearly_data = []
15 | year_hash = {}
16 | for filename in os.listdir(folder_path):
17 | with open(folder_path + '/' + filename, 'rb') as csvfile:
18 | year = filename.split('.')[0]
19 | years.append(year)
20 | reader = csv.DictReader(csvfile)
21 | fieldnames = reader.fieldnames
22 |
23 | avgs = [float(row[key]) for row in reader]
24 | year_hash[year] = avgs
25 | yearly_data.append(avgs)
26 |
27 | int_years = [int(year) for year in years]
28 |
29 | yda = np.array(yearly_data)
30 |
31 | p = figure(tools="save", title="Max, Avg, Min Driving Distance Over Time")
32 | p.line(int_years, [np.average(asdf) for asdf in yda], line_color="red")#, fill_color="red", line_color="green", line_width=3, )
33 | p.line(int_years, [np.min(asdf) for asdf in yda], line_color="blue")#, fill_color="red", line_color="green", line_width=3, )
34 | p.line(int_years, [np.max(asdf) for asdf in yda], line_color="green")#, fill_color="red", line_color="green", line_width=3, )
35 | output_file("driving_distance.html", title="Max, Avg, Min Driving Distance Over Time")
36 | show(vplot(p))
37 |
38 | '''
39 | filename = '2015.csv'
40 | ind = []
41 | dep = []
42 | names = []
43 | with open(filename, 'rb') as csvfile:
44 | reader = csv.reader(csvfile)
45 | headings = reader.next()[1:-1] #headings
46 | for row in reader:
47 | names.append(row[0])
48 | ind.append(map(float, row[1:-3]))
49 | dep.append(float(row[-2]))
50 |
51 | npind = np.array(ind)
52 | npdep = np.array(dep)
53 |
54 | regr = linear_model.LinearRegression(normalize=True)
55 |
56 | regr.fit(npind, npdep)
57 |
58 | for name, coeff in zip(headings, regr.coef_):
59 | print "%s: %s" % (name, coeff)
60 |
61 | print("Residual sum of squares: %.2f"
62 | % np.mean((regr.predict(npind) - npdep) ** 2))
63 |
64 | for name, stats, money in zip(names, ind, dep):
65 | predicted = '{:20,.2f}'.format(np.dot(stats, regr.coef_))
66 | print "%s: %s, %s" % (name, predicted, '{:20,.2f}'.format(money))
67 |
68 | import csv
69 | from bokeh.plotting import figure, output_file, show, vplot
70 | years = range(2002,2016)
71 | years = [2002, 2015]
72 | for year in years:
73 | filename = "%s.csv" % year
74 | with open(filename, 'rb') as csvfile:
75 | reader = csv.DictReader(csvfile)
76 | fieldnames = reader.fieldnames
77 | distances = [float(row['driving_distance']) for row in reader if row['percentage_of_yardage_covered_by_tee_shots']]
78 | a = np.array(distances)
79 |
80 | hist, edges = np.histogram(a, density=True, bins=100)
81 |
82 | x = np.linspace(np.amin(a)-5, np.amax(a)+5, 1000)
83 | mu = np.mean(a)
84 | sigma = np.std(a)
85 | pdf = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x-mu)**2 / (2*sigma**2))
86 |
87 | p1 = figure(title="%s Driving Distance" % (year),tools="save", background_fill_color="#E8DDCB")
88 | p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="#036564", line_color="#033649")
89 | p1.line(x, pdf, line_color="#D95B43", line_width=8, alpha=0.7, legend="PDF")
90 |
91 | p1.legend.location = "top_left"
92 | p1.xaxis.axis_label = 'Driving Distance'
93 | p1.yaxis.axis_label = 'Pr(x)'
94 |
95 | output_file("%s_driving_distance.html" % (year), title="%s Driving Distance" % (year))
96 | show(vplot(p1))
97 |
98 |
99 | '''
100 |
--------------------------------------------------------------------------------
/tourstats/distance_vs_putts.csv:
--------------------------------------------------------------------------------
1 | Brian Gay,1.719,270.2
2 | Marc Leishman,1.723,288.3
3 | Rob Oppenheim,1.746,272.2
4 | Scott Stallings,1.798,287.3
5 | Chad Campbell,1.788,281.5
6 | Ian Poulter,1.835,276.2
7 | Cameron Percy,1.832,274.6
8 | Emiliano Grillo,1.779,286.2
9 | Smylie Kaufman,1.725,295.0
10 | Paul Casey,1.786,285.4
11 | Rickie Fowler,1.753,293.7
12 | Jason Dufner,1.756,282.7
13 | Ryo Ishikawa,1.723,297.4
14 | Charles Howell III,1.739,294.6
15 | Vaughn Taylor,1.715,272.9
16 | Jon Curran,1.738,267.1
17 | Matt Every,1.772,281.0
18 | Russell Knox,1.786,281.5
19 | Steven Bowditch,1.750,288.5
20 | Martin Piller,1.780,283.7
21 | Ben Crane,1.711,275.0
22 | David Lingmerth,1.729,284.1
23 | Bryce Molder,1.758,276.9
24 | Jim Herman,1.781,284.0
25 | D.H. Lee,1.762,276.4
26 | Chesson Hadley,1.812,288.3
27 | Robert Streb,1.808,285.4
28 | Matt Kuchar,1.725,281.2
29 | Mark Wilson,1.807,269.9
30 | Adam Hadwin,1.708,283.5
31 | Peter Malnati,1.735,277.3
32 | Billy Horschel,1.778,287.2
33 | Bud Cauley,1.762,282.6
34 | Jhonattan Vegas,1.791,288.1
35 | Rory Sabbatini,1.735,287.6
36 | D.A. Points,1.732,270.1
37 | Wes Roach,1.783,284.5
38 | Hiroshi Iwata,1.751,282.5
39 | Jonas Blixt,1.755,279.9
40 | Mark Hubbard,1.761,278.9
41 | Tyrone Van Aswegen,1.767,280.7
42 | Hideki Matsuyama,1.717,289.1
43 | Andres Gonzales,1.793,284.2
44 | Michael Kim,1.752,280.1
45 | Scott Brown,1.785,283.6
46 | Cameron Beckman,1.901,264.5
47 | Patrick Rodgers,1.791,295.6
48 | Danny Lee,1.741,283.2
49 | Stewart Cink,1.800,283.5
50 | Kyle Reifers,1.746,280.9
51 | David Toms,1.791,265.2
52 | William McGirt,1.727,277.5
53 | Charley Hoffman,1.769,296.5
54 | Will Wilcox,1.770,286.8
55 | Colt Knost,1.765,271.7
56 | Webb Simpson,1.746,288.9
57 | Brett Stegmaier,1.782,286.0
58 | Lucas Glover,1.763,283.8
59 | J.B. Holmes,1.761,306.0
60 | Patton Kizzire,1.728,283.5
61 | Brian Stuard,1.790,266.2
62 | Rory McIlroy,1.773,297.2
63 | Camilo Villegas,1.772,283.4
64 | Graeme McDowell,1.737,275.8
65 | Erik Compton,1.775,281.3
66 | Bill Haas,1.782,280.0
67 | Jonathan Byrd,1.763,275.1
68 | Ollie Schniederjans,1.802,295.5
69 | Tyler Aldridge,1.776,280.6
70 | Michael Putnam,1.825,278.8
71 | John Huh,1.741,274.1
72 | Vijay Singh,1.773,279.9
73 | Ken Duke,1.829,270.3
74 | Cameron Tringale,1.730,279.4
75 | Nick Watney,1.759,290.0
76 | Justin Thomas,1.742,293.9
77 | Nick Taylor,1.764,290.6
78 | Justin Rose,1.735,289.5
79 | Kyle Stanley,1.769,282.4
80 | Steve Wheatcroft,1.760,281.4
81 | Brendon de Jonge,1.784,283.1
82 | Cameron Smith,1.745,276.6
83 | Charlie Beljan,1.910,295.1
84 | Stuart Appleby,1.781,276.0
85 | Dustin Johnson,1.722,305.5
86 | Zach Johnson,1.755,281.8
87 | Charl Schwartzel,1.788,288.5
88 | Billy Hurley III,1.785,263.5
89 | Fabian Gomez,1.781,284.4
90 | Sung Kang,1.785,277.5
91 | Rhein Gibson,1.848,284.7
92 | Jason Day,1.765,297.4
93 | Andrew Loupe,1.701,296.4
94 | Luke Donald,1.781,270.8
95 | Alex Cejka,1.740,278.1
96 | Lucas Lee,1.818,265.7
97 | Justin Hicks,1.771,277.5
98 | Sam Saunders,1.810,284.8
99 | Tim Herron,1.825,269.1
100 | Brandt Snedeker,1.718,289.8
101 | Ryan Moore,1.715,281.0
102 | Justin Leonard,1.747,270.7
103 | Steve Stricker,1.661,268.9
104 | Tim Clark,1.796,270.2
105 | Hudson Swafford,1.786,292.6
106 | Carlos Ortiz,1.736,285.9
107 | George McNeill,1.744,277.8
108 | Jeff Overton,1.764,281.6
109 | John Senden,1.770,283.1
110 | Jimmy Walker,1.751,291.6
111 | Ben Martin,1.730,280.6
112 | Dicky Pride,1.807,268.5
113 | Hunter Stewart,1.782,283.3
114 | Brice Garnett,1.757,274.2
115 | Robert Garrigus,1.800,290.2
116 | Aaron Baddeley,1.722,281.7
117 | Jason Bohn,1.711,280.3
118 | James Hahn,1.759,286.4
119 | David Hearn,1.746,275.9
120 | Sean O'Hair,1.732,287.3
121 | Andrew Landry,1.819,278.2
122 | Shawn Stefani,1.824,287.3
123 | Graham DeLaet,1.741,288.7
124 | Brian Davis,1.783,263.8
125 | Chris Stroud,1.762,281.0
126 | Roberto Castro,1.774,279.9
127 | Russell Henley,1.762,290.2
128 | Blake Adams,1.805,270.3
129 | Andres Romero,1.727,278.4
130 | Phil Mickelson,1.718,287.0
131 | Martin Laird,1.751,290.8
132 | Derek Fathauer,1.755,282.3
133 | Retief Goosen,1.784,282.0
134 | Chris Kirk,1.824,279.7
135 | Davis Love III,1.773,283.8
136 | Morgan Hoffmann,1.754,289.2
137 | Will MacKenzie,1.783,279.1
138 | K.J. Choi,1.764,271.2
139 | Jordan Spieth,1.659,287.5
140 | Abraham Ancer,1.807,272.5
141 | John Merrick,1.804,276.1
142 | Dawie van der Walt,1.775,282.2
143 | Kevin Na,1.716,277.9
144 | Troy Merritt,1.743,283.2
145 | Sergio Garcia,1.746,282.4
146 | Whee Kim,1.847,282.4
147 | Brendan Steele,1.742,291.6
148 | Daniel Berger,1.764,290.7
149 | Boo Weekley,1.799,283.8
150 | Jason Kokrak,1.793,297.1
151 | Kevin Kisner,1.691,288.1
152 | J.J. Henry,1.811,283.9
153 | Darron Stiles,1.771,256.4
154 | Kelly Kraft,1.770,280.4
155 | Rod Pampling,1.845,273.7
156 | Johnson Wagner,1.788,280.0
157 | Chez Reavie,1.764,281.8
158 | Robert Allenby,1.851,274.9
159 | Francesco Molinari,1.751,279.2
160 | Jerry Kelly,1.787,273.2
161 | Gary Woodland,1.759,301.5
162 | Si Woo Kim,1.766,287.8
163 | Michael Thompson,1.747,284.2
164 | Steve Marino,1.806,283.6
165 | Scott Langley,1.820,277.0
166 | Thomas Aiken,1.834,275.5
167 | Alex Prugh,1.853,285.0
168 | Ricky Barnes,1.741,279.7
169 | Geoff Ogilvy,1.798,286.8
170 | Brooks Koepka,1.729,298.6
171 | Daniel Summerhays,1.728,284.2
172 | Scott Pinckney,1.814,291.2
173 | Ernie Els,1.801,283.3
174 | Jarrod Lyle,1.726,274.2
175 | Brian Harman,1.760,285.1
176 | Kevin Streelman,1.754,283.9
177 | Keegan Bradley,1.833,292.2
178 | Blayne Barber,1.795,279.2
179 | Hunter Mahan,1.759,293.3
180 | Derek Ernst,1.842,284.4
181 | Miguel Angel Carballo,1.774,284.3
182 | Zac Blair,1.776,271.0
183 | Seung-Yul Noh,1.730,288.7
184 | D.J. Trahan,1.762,282.4
185 | Brendon Todd,1.774,274.3
186 | Shane Lowry,1.780,288.3
187 | Freddie Jacobson,1.717,275.9
188 | Ryan Palmer,1.748,298.6
189 | Tim Wilkinson,1.773,275.6
190 | Chad Collins,1.775,273.8
191 | Harris English,1.778,290.8
192 | Tom Hoge,1.780,281.6
193 | Kevin Chappell,1.760,285.3
194 | Pat Perez,1.772,280.2
195 | Luke List,1.796,294.2
196 | Greg Owen,1.820,290.1
197 | Bronson Burgoon,1.791,285.6
198 | Matt Jones,1.752,289.8
199 | Shane Bertsch,1.781,275.5
200 | Andy Sullivan,1.806,276.9
201 | Adam Scott,1.726,289.5
202 | Jamie Lovemark,1.774,297.0
203 | Angel Cabrera,1.787,290.6
204 | Scott Piercy,1.765,294.1
205 | Nicholas Thompson,1.837,270.2
206 | Padraig Harrington,1.779,280.4
207 | Henrik Norlander,1.780,276.1
208 | Harold Varner III,1.779,291.8
209 | Tony Finau,1.784,300.9
210 | Patrick Reed,1.743,288.1
211 | Carl Pettersson,1.734,280.4
212 | Branden Grace,1.701,282.6
213 | Bo Van Pelt,1.871,278.5
214 | Bubba Watson,1.787,307.5
215 | Spencer Levin,1.732,277.8
216 | Jason Gore,1.730,281.2
217 | Anirban Lahiri,1.700,286.0
218 |
--------------------------------------------------------------------------------
/tourstats/driving_distance.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Driving Distance over the years
7 |
8 |
9 |
10 |
11 |
14 |
15 |
16 |
17 |
18 |
19 |
27 |
28 |
--------------------------------------------------------------------------------
/tourstats/driving_vs_putts.py:
--------------------------------------------------------------------------------
1 | from models import Player, Stat, StatLine
2 |
3 | from sqlalchemy.orm import sessionmaker
4 | from sqlalchemy import create_engine, or_
5 |
6 | engine = create_engine('postgresql://pgatour_user:pgatour_user_password@localhost:5432/pgatour')
7 | Session = sessionmaker(bind=engine)
8 | session = Session()
9 |
10 | stat_names = set([
11 | #'Driving Distance',
12 | 'Putting Average',
13 | 'Total Putting',
14 | 'Greens in Regulation Percentage',
15 | 'Driving Accuracy Percentage',
16 | 'Proximity to Hole',
17 | 'Birdie Average',
18 | 'Scrambling',
19 | 'Scoring Average'
20 | ])
21 |
22 | stats = session.query(Stat.id, Stat.name).filter(or_(*[Stat.name == v for v in stat_names]))
23 | stats_info = [(stat.id, stat.name) for stat in stats]
24 |
25 | from sqlalchemy import text
26 |
27 | sql_text_train = '''
28 | select players.id,
29 | players.name,
30 | max(case when stat_lines.stat_id=330 then stat_lines.raw else null end) as putting_average,
31 | max(case when stat_lines.stat_id=157 then stat_lines.raw else null end) as driving_distance,
32 | max(case when stat_lines.stat_id=250 then stat_lines.raw else null end) as gir,
33 | max(case when stat_lines.stat_id=156 then stat_lines.raw else null end) as driving_accuracy,
34 | max(case when stat_lines.stat_id=382 then stat_lines.raw else null end) as scoring_average
35 | from players
36 | join stat_lines on stat_lines.player_id = players.id
37 | join stats on stat_lines.stat_id=stats.id
38 | where stat_lines.year=2012 and (stats.id=157 or stats.id=330 or stats.id=382 or stats.id=250 or stats.id=156) and stat_lines.raw is not null
39 | group by players.name,players.id;
40 | '''
41 |
42 | select_clauses = []
43 | where_clauses = []
44 | for stat_info in stats_info:
45 | stat_id = stat_info[0]
46 | stat_name = stat_info[1].lower().replace(' ','_')
47 | select_string = ", max(case when stat_lines.stat_id=%s then stat_lines.raw else null end) as %s" % (stat_id, stat_name)
48 | where_string = "stats.id=%s " % (stat_id)
49 | select_clauses.append(select_string)
50 | where_clauses.append(where_string)
51 |
52 | underscored_stat_names = [stat_name.lower().replace(' ','_') for stat_name in stat_names if stat_name != 'Scoring Average']
53 |
54 | sql_text = 'select players.id, players.name'
55 | for select_clause in select_clauses:
56 | sql_text += select_clause
57 | sql_text += '''
58 | from players
59 | join stat_lines on stat_lines.player_id = players.id
60 | join stats on stat_lines.stat_id=stats.id
61 | where
62 | stat_lines.year=%s
63 | and (
64 | '''
65 | for index, where_clause in enumerate(where_clauses):
66 | if index != 0:
67 | sql_text += 'or '
68 | sql_text += where_clause
69 | sql_text += '''
70 | )
71 | and stat_lines.raw is not null
72 | group by players.name, players.id;
73 | '''
74 |
75 | import pandas as pd
76 | import statsmodels.api as sm
77 | from sklearn import linear_model, preprocessing
78 | import numpy as np
79 | import sys
80 | current_module = sys.modules[__name__]
81 |
82 | sql_text_train = sql_text % '2012'
83 | sql_text_pred = sql_text % '2013'
84 |
85 | driving_accuracy_percentage_clean = lambda x: float(x) * 0.01 * 14
86 | greens_in_regulation_percentage_clean = lambda x: float(x) * 0.01 * 18
87 | greens_or_fringe_in_regulation_clean = lambda x: float(x) * 0.01 * 18
88 | putting_average_clean = lambda x: float(x) * 18
89 |
90 | def proximity_to_hole_clean(val):
91 | distances = str(val).split("'")
92 | inches = int(distances[0]) * 12 + int(distances[1][1:-1])
93 | return inches
94 |
95 | df = pd.read_sql_query(sql_text_train, engine)
96 | df = df[df.scoring_average.notnull()]
97 | for underscored_stat_name in underscored_stat_names:
98 | try:
99 | cleaning_function = getattr(current_module, underscored_stat_name+'_clean')
100 | df[underscored_stat_name] = df[underscored_stat_name].map(cleaning_function)
101 | except AttributeError:
102 | pass
103 |
104 | X_train = df[underscored_stat_names].astype(np.float)
105 | X_train = sm.add_constant(X_train)
106 | y = df['scoring_average'].astype(np.float)
107 |
108 | res = sm.OLS(y,X_train).fit()
109 | print res.summary()
110 | ytrain = res.predict(X_train)
111 |
112 | #prediction time
113 | df_pred = pd.read_sql_query(sql_text_pred, engine)
114 | df_pred = df_pred[df_pred.scoring_average.notnull()]
115 | for underscored_stat_name in underscored_stat_names:
116 | try:
117 | cleaning_function = getattr(current_module, underscored_stat_name+'_clean')
118 | df_pred[underscored_stat_name] = df_pred[underscored_stat_name].map(cleaning_function)
119 | except AttributeError:
120 | pass
121 |
122 | X_pred = df_pred[underscored_stat_names].astype(np.float)
123 | X_pred = sm.add_constant(X_pred)
124 | y_actual = df_pred['scoring_average'].astype(np.float)
125 |
126 | ypred = res.predict(X_pred)
127 |
128 | import matplotlib.pyplot as plt
129 | fig, ax = plt.subplots()
130 | #ax.scatter(df['putting_average'].astype(np.float), df['scoring_average'].astype(np.float))
131 | #ax.scatter(y_actual, ypred)
132 | ax.scatter(ytrain, y)
133 |
134 | for index, row in df_pred['scoring_average'].iteritems():
135 | name = df_pred.loc[index]['name']
136 | if y_actual[index] + 1 < ypred[index] or y_actual[index] - 1 > ypred[index]:
137 | pass
138 | # ax.annotate(name, (y_actual[index],ypred[index]))
139 |
140 | plt.show()
141 | '''
142 | import csv
143 | import matplotlib.pyplot as plt
144 | filename = "distance_vs_putts.csv"
145 | df = pd.read_csv(filename, index_col=0)
146 |
147 | data = {}
148 | names = []
149 | distance = []
150 | putts = []
151 | with open(filename, 'rb') as csvfile:
152 | reader = csv.reader(csvfile)
153 | for row in reader:
154 | data[row[0]] = [row[1:3]]
155 | names.append(row[0])
156 | putts.append(row[1])
157 | distance.append(row[2])
158 |
159 | fig, ax = plt.subplots()
160 | ax.scatter(distance, putts)
161 |
162 | for i, name in enumerate(names):
163 | ax.annotate(name, (distance[i],putts[i]))
164 |
165 | plt.show()
166 | '''
167 |
--------------------------------------------------------------------------------
/tourstats/gather.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | from collections import defaultdict
4 | import csv
5 | import urllib
6 | import os
7 |
8 | import gevent
9 | from gevent import monkey
10 | monkey.patch_all() #gather_html() below uses gevent, so these imports need to be active
11 |
12 |
13 | url_stub = "http://www.pgatour.com/stats/stat.%s.%s.html" #stat id, year
14 |
15 | def feet_string_to_inches(string):
16 | ''' 29'1" for example, turns it into inches '''
17 | splits = map(float, string[:-1].split("'"))
18 | return splits[0] * 12.0 + splits[1]
19 |
20 | def to_dollas(string):
21 | return float(string[1:].replace(',', ''))
22 |
23 | category_url_stub = 'http://www.pgatour.com/stats/categories.%s.html'
24 | category_labels = ['RPTS_INQ', 'ROTT_INQ', 'RAPP_INQ', 'RARG_INQ', 'RPUT_INQ', 'RSCR_INQ', 'RSTR_INQ', 'RMNY_INQ']
25 | pga_tour_base_url = "http://www.pgatour.com"
26 | def gather_pages(url, filename):
27 | print filename
28 | urllib.urlretrieve(url, filename)
29 |
30 | def gather_html():
31 | stat_ids = []
32 | for category in category_labels:
33 | category_url = category_url_stub % (category)
34 | page = requests.get(category_url)
35 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
36 | for table in html.find_all("div", class_="table-content"):
37 | for link in table.find_all("a"):
38 | stat_ids.append(link['href'].split('.')[1])
39 | starting_year = 2015 #page in order to see which years we have info for
40 | print stat_ids
41 | for stat_id in stat_ids:
42 | url = url_stub % (stat_id, starting_year)
43 | page = requests.get(url)
44 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
45 | stat = html.find("div", class_="parsys mainParsys section").find('h3').text
46 | print stat
47 | directory = "stats_html/%s" % stat.replace('/', ' ') #replace '/' so stat names don't create extra subdirectories
48 | if not os.path.exists(directory):
49 | os.makedirs(directory)
50 | years = []
51 | for option in html.find("select", class_="statistics-details-select").find_all("option"):
52 | year = option['value']
53 | if year not in years:
54 | years.append(year)
55 | url_filenames = []
56 | for year in years:
57 | url = url_stub % (stat_id, year)
58 | filename = "%s/%s.html" % (directory, year)
59 | if not os.path.isfile(filename): #this check saves time if you've already downloaded the page
60 | url_filenames.append((url, filename))
61 | jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
62 | gevent.joinall(jobs)
63 |
64 | gather_html()
65 |
66 | '''
67 | for folder in os.listdir("stats_html"):
68 | path = "stats_html/%s" % folder
69 | if os.path.isdir(path):
70 | for file in os.listdir(path):
71 | if file[0] == '.':
72 | continue
73 | csv_lines = []
74 | file_path = path + "/" + file
75 | csv_dir = "stats_csv/" + folder
76 | if not os.path.exists(csv_dir):
77 | os.makedirs(csv_dir)
78 | csv_file_path = csv_dir + "/" + file.split('.')[0] + '.csv'
79 | print csv_file_path
80 | if os.path.isfile(csv_file_path):
81 | continue
82 | with open(file_path, 'r') as ff:
83 | f = ff.read()
84 | html = BeautifulSoup(f.replace('\n',''), 'html.parser')
85 | table = html.find('table', class_='table-styled')
86 | headings = [t.text for t in table.find('thead').find_all('td')]
87 | csv_lines.append(headings)
88 | for tr in table.find('tbody').find_all('tr'):
89 | info = [td.text.replace(u'\xa0', u' ').strip() for td in tr.find_all('td')]
90 | csv_lines.append(info)
91 | #write the array to csv
92 | with open(csv_file_path, 'wb') as csvfile:
93 | writer = spamwriter = csv.writer(csvfile, delimiter=',')
94 | for row in csv_lines:
95 | writer.writerow(row)
96 |
97 |
98 |
99 | column_keys = ['%', 'AVG']
100 | inputs = [
101 | {'name': 'driving_distance', 'sid': 101, 'conversion': float},
102 | {'name': 'driving_accuracy', 'sid': 102, 'conversion': float},
103 | {'name': 'greens_in_regulation', 'sid': 103, 'conversion': float},
104 | {'name': 'greens_or_fringe_in_regulation', 'sid': '02437', 'conversion': float},
105 | {'name': 'proximity_to_hole', 'sid': 331, 'conversion': feet_string_to_inches},
106 | {'name': 'scrambling', 'sid': 130, 'conversion': float},
107 | {'name': 'putts_per_round', 'sid': 119, 'conversion': float},
108 | {'name': 'percentage_of_yardage_covered_by_tee_shots', 'sid': '02341', 'conversion': float},
109 | {'name': 'strokes_gained_tee_to_green', 'sid': '02674', 'conversion': float},
110 | {'name': 'fairway_proximity', 'sid': 431, 'conversion': feet_string_to_inches},
111 | {'name': 'rough_proximity', 'sid': 437, 'conversion': feet_string_to_inches},
112 | {'name': 'proximity_to_hole_around_green', 'sid': 374, 'conversion': feet_string_to_inches},
113 | {'name': 'three_putt_avoidance', 'sid': 426, 'conversion': float},
114 | {'name': 'one_putt_percentage', 'sid': 413, 'conversion': float},
115 | {'name': 'total_putting', 'sid': '02428', 'conversion': float},
116 |
117 | {'name': 'scoring_average', 'sid': 120, 'conversion': float},
118 | {'name': 'scoring_average_actual', 'sid': 108, 'conversion': float},
119 | {'name': 'money_leaders', 'sid': 109, 'conversion': to_dollas}
120 | ]
121 |
122 | player_stats = defaultdict(dict)
123 | years = range(2014, 1999, -1)
124 | for year in years:
125 | print year
126 | for source in inputs:
127 | print source['name']
128 | url = url_stub % (source['sid'], year)
129 | page = requests.get(url)
130 | html = BeautifulSoup(page.text.replace('\n',''), 'html.parser')
131 | for row in html.find("table", id="statsTable").find('tbody').find_all('tr'):
132 | stat_line = [info.text for info in row.find_all('td')]
133 | player = str(stat_line[2].replace(u'\xa0', u' ').strip())
134 | stat = source['conversion'](stat_line[4])
135 | player_stats[player][source['name']] = stat
136 |
137 | filename = "%s.csv" % year
138 | with open(filename, 'w') as csvfile:
139 | fieldnames = ['name'] + [s['name'] for s in inputs]
140 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
141 | writer.writeheader()
142 | for name, stats in player_stats.iteritems():
143 | if stats.get('scoring_average') == None:
144 | continue
145 | stats['name'] = name
146 | writer.writerow(stats)
147 |
148 | '''
149 |
--------------------------------------------------------------------------------
/tourstats/models.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy import Column, Integer, String, ForeignKey
3 | from sqlalchemy.orm import relationship
4 |
5 | Base = declarative_base()
6 |
7 | class Player(Base):
8 | __tablename__ = 'players'
9 | id = Column('id', Integer, primary_key=True)
10 | name = Column('name', String)
11 | stat_lines = relationship("StatLine")
12 |
13 | class Stat(Base):
14 | __tablename__ = 'stats'
15 | id = Column('id', Integer, primary_key=True)
16 | name = Column('name', String)
17 | stat_lines = relationship("StatLine")
18 |
19 | class StatLine(Base):
20 | __tablename__ = 'stat_lines'
21 | id = Column('id', Integer, primary_key=True)
22 | player_id = Column('player_id', Integer, ForeignKey("players.id"))
23 | player = relationship('Player')
24 | stat_id = Column('stat_id', Integer, ForeignKey("stats.id"))
25 | stat = relationship('Stat')
26 | raw = Column('raw', String)
27 | events = Column('events', Integer)
28 | year = Column('year', Integer)
29 |
--------------------------------------------------------------------------------
/tourstats/models.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jackschultz/bigishdata/e16cb67daa5196f06a140877ba108c1aea58d995/tourstats/models.pyc
--------------------------------------------------------------------------------
/tourstats/seed.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy.orm import sessionmaker
3 |
4 | from sqlalchemy import create_engine
5 | from sqlalchemy import Column, Integer, String, ForeignKey
6 | from sqlalchemy.orm import relationship
7 |
8 | engine = create_engine('postgresql://pgatour_user:pgatour_user_password@localhost:5432/pgatour')
9 | Session = sessionmaker(bind=engine)
10 | session = Session()
11 |
12 | Base = declarative_base()
13 |
14 | class Player(Base):
15 | __tablename__ = 'players'
16 | id = Column('id', Integer, primary_key=True)
17 | name = Column('name', String)
18 | stat_lines = relationship("StatLine")
19 |
20 | class Stat(Base):
21 | __tablename__ = 'stats'
22 | id = Column('id', Integer, primary_key=True)
23 | name = Column('name', String)
24 | stat_lines = relationship("StatLine")
25 |
26 | class StatLine(Base):
27 | __tablename__ = 'stat_lines'
28 | id = Column('id', Integer, primary_key=True)
29 | player_id = Column('player_id', Integer, ForeignKey("players.id"))
30 | player = relationship('Player')
31 | stat_id = Column('stat_id', Integer, ForeignKey("stats.id"))
32 | stat = relationship('Stat')
33 | raw = Column('raw', String)
34 | events = Column('events', Integer)
35 | year = Column('year', Integer)
36 |
37 | import os
38 | import csv
39 |
40 | '''
41 | players = set()
42 | def add_players_from_file(filepath):
43 | with open(filepath, 'rb') as csvfile:
44 | reader = csv.reader(csvfile)
45 | for row in reader:
46 | player_name = row[2]
47 | #some players have an asterisk at the end of their name
48 | #want to remove this for player insertion
49 | if len(player_name) > 0 and player_name[-1] == "*":
50 | player_name = player_name[0:-2]
51 | players.add(player_name)
52 |
53 | for subdir, dirs, files in os.walk('stats_csv'):
54 | for dir in dirs:
55 | for subdir, dirs, files in os.walk("stats_csv/%s" % dir):
56 | for file in files:
57 | filepath = "stats_csv/%s/%s" % (dir, file)
58 | add_players_from_file(filepath)
59 |
60 | for player_name in players:
61 | if session.query(Player).filter_by(name=player_name).count() == 0:
62 | p = Player(name=player_name)
63 | session.add(p)
64 |
65 | for subdir, dirs, files in os.walk('stats_csv'):
66 | for dir in dirs:
67 | if session.query(Stat).filter_by(name=dir).count() == 0:
68 | print dir
69 | s = Stat(name=dir)
70 | session.add(s)
71 | session.commit()
72 | session.close() #for good measure
73 |
74 |
75 |
76 | def acknowledge_or_create_stat_line(data, stat, year):
77 | for row in data:
78 | if len(row) >= 5:
79 | player_name = row[2]
80 | if len(player_name) > 0 and player_name[-1] == "*":
81 | player_name = player_name[0:-2]
82 | player = session.query(Player).filter_by(name=player_name).first()
83 | stat_line = session.query(StatLine).filter_by(player=player, stat=stat, year=year).first()
84 | if not stat_line:
85 | try:
86 | events = int(row[3])
87 | except ValueError:
88 | events = 0
89 | raw = row[4]
90 | stat_line = StatLine(player=player, stat=stat, year=year, events=events, raw=raw)
91 | session.add(stat_line)
92 |
93 | def process_file(filename, stat, year):
94 | with open(filename, 'rb') as csvfile:
95 | reader = csv.reader(csvfile)
96 | next(reader)
97 | stat_count = session.query(StatLine).filter_by(stat=stat, year=year).count()
98 | data = list(reader) #only do this because I know reader is about 200. Bigger data sets can have issues!
99 | file_stat_count = len(data)
100 | print "%s, stat_count: %s, file_stat_count: %s" % (filename, stat_count, file_stat_count)
101 | if stat_count != file_stat_count:
102 | acknowledge_or_create_stat_line(data, stat, year)
103 | session.commit()
104 | return filename
105 |
106 | from multiprocessing import Pool
107 | pool = Pool()
108 |
109 | for subdir, dirs, files in os.walk('stats_csv'):
110 | for dir in dirs:
111 | stat = session.query(Stat).filter_by(name=dir).first()
112 | for subdir, dirs, files in os.walk("stats_csv/%s" % dir):
113 | for file in files:
114 | year = int(file[0:-4]) #chopping off the csv
115 | filepath = "stats_csv/%s/%s" % (dir, file)
116 | pool.apply_async(process_file, [filepath, stat, year])
117 |
118 | pool.close()
119 | pool.join()
120 | '''
121 |
122 | phil = session.query(Player).filter_by(name='Phil Mickelson').first()
123 | stat = session.query(Stat).filter_by(name='Driving Distance').first()
124 | stat_lines = session.query(StatLine).filter_by(player=phil, stat=stat).order_by("year")
125 | for stat_line in stat_lines:
126 | print "%s: %s" % (stat_line.year, stat_line.raw)
127 |
--------------------------------------------------------------------------------