├── README.md
├── SmartStopList.py
├── main.py
├── product.py
├── productfinder.py
├── productfinder_helper.py
├── rake.py
├── saveproducts.py
└── test_html
    ├── amazon.html
    └── amazon2.html


/README.md:
--------------------------------------------------------------------------------
 1 | # python-common-crawl-amazon-example
 2 | Amazon PRODUCT FINDER using Python and Commoncrawl dataset
 3 | Scans common crawl infomation for products and
 4 | saves them to Amazon DynomoDB database.
 5 | By David Cedar https://www.cedar.net.au/using-python-and-common-crawl-to-find-products-from-amazon-com/
 6 | 
 7 | # Install
 8 |    boto3: https://github.com/boto/boto3. Make sure to add AWS Credentials on our machine, see docs.
 9 |    & other using pip.
10 | 
11 | # Test 
12 | Make sure code is working with local example - Run
13 |    python productfinder_helper.py
14 | # Run
15 |    python main --domain amazon.com
16 | 


--------------------------------------------------------------------------------
/SmartStopList.py:
--------------------------------------------------------------------------------
  1 | wordlist = [
  2 |     'a',
  3 |     "a's",
  4 |     'able',
  5 |     'about',
  6 |     'above',
  7 |     'according',
  8 |     'accordingly',
  9 |     'across',
 10 |     'actually',
 11 |     'after',
 12 |     'afterwards',
 13 |     'again',
 14 |     'against',
 15 |     "ain't",
 16 |     'all',
 17 |     'allow',
 18 |     'allows',
 19 |     'almost',
 20 |     'alone',
 21 |     'along',
 22 |     'already',
 23 |     'also',
 24 |     'although',
 25 |     'always',
 26 |     'am',
 27 |     'among',
 28 |     'amongst',
 29 |     'an',
 30 |     'and',
 31 |     'another',
 32 |     'any',
 33 |     'anybody',
 34 |     'anyhow',
 35 |     'anyone',
 36 |     'anything',
 37 |     'anyway',
 38 |     'anyways',
 39 |     'anywhere',
 40 |     'apart',
 41 |     'appear',
 42 |     'appreciate',
 43 |     'appropriate',
 44 |     'are',
 45 |     "aren't",
 46 |     'around',
 47 |     'as',
 48 |     'aside',
 49 |     'ask',
 50 |     'asking',
 51 |     'associated',
 52 |     'at',
 53 |     'available',
 54 |     'away',
 55 |     'awfully',
 56 |     'b',
 57 |     'be',
 58 |     'became',
 59 |     'because',
 60 |     'become',
 61 |     'becomes',
 62 |     'becoming',
 63 |     'been',
 64 |     'before',
 65 |     'beforehand',
 66 |     'behind',
 67 |     'being',
 68 |     'believe',
 69 |     'below',
 70 |     'beside',
 71 |     'besides',
 72 |     'best',
 73 |     'better',
 74 |     'between',
 75 |     'beyond',
 76 |     'both',
 77 |     'brief',
 78 |     'but',
 79 |     'by',
 80 |     'c',
 81 |     "c'mon",
 82 |     "c's",
 83 |     'came',
 84 |     'can',
 85 |     "can't",
 86 |     'cannot',
 87 |     'cant',
 88 |     'cause',
 89 |     'causes',
 90 |     'certain',
 91 |     'certainly',
 92 |     'changes',
 93 |     'clearly',
 94 |     'co',
 95 |     'com',
 96 |     'come',
 97 |     'comes',
 98 |     'concerning',
 99 |     'consequently',
100 |     'consider',
101 |     'considering',
102 |     'contain',
103 |     'containing',
104 |     'contains',
105 |     'corresponding',
106 |     'could',
107 |     "couldn't",
108 |     'course',
109 |     'currently',
110 |     'd',
111 |     'definitely',
112 |     'described',
113 |     'despite',
114 |     'did',
115 |     "didn't",
116 |     'different',
117 |     'do',
118 |     'does',
119 |     "doesn't",
120 |     'doing',
121 |     "don't",
122 |     'done',
123 |     'down',
124 |     'downwards',
125 |     'during',
126 |     'e',
127 |     'each',
128 |     'edu',
129 |     'eg',
130 |     'eight',
131 |     'either',
132 |     'else',
133 |     'elsewhere',
134 |     'enough',
135 |     'entirely',
136 |     'especially',
137 |     'et',
138 |     'etc',
139 |     'even',
140 |     'ever',
141 |     'every',
142 |     'everybody',
143 |     'everyone',
144 |     'everything',
145 |     'everywhere',
146 |     'ex',
147 |     'exactly',
148 |     'example',
149 |     'except',
150 |     'f',
151 |     'far',
152 |     'few',
153 |     'fifth',
154 |     'first',
155 |     'five',
156 |     'followed',
157 |     'following',
158 |     'follows',
159 |     'for',
160 |     'former',
161 |     'formerly',
162 |     'forth',
163 |     'four',
164 |     'from',
165 |     'further',
166 |     'furthermore',
167 |     'g',
168 |     'get',
169 |     'gets',
170 |     'getting',
171 |     'given',
172 |     'gives',
173 |     'go',
174 |     'goes',
175 |     'going',
176 |     'gone',
177 |     'got',
178 |     'gotten',
179 |     'greetings',
180 |     'h',
181 |     'had',
182 |     "hadn't",
183 |     'happens',
184 |     'hardly',
185 |     'has',
186 |     "hasn't",
187 |     'have',
188 |     "haven't",
189 |     'having',
190 |     'he',
191 |     "he's",
192 |     'hello',
193 |     'help',
194 |     'hence',
195 |     'her',
196 |     'here',
197 |     "here's",
198 |     'hereafter',
199 |     'hereby',
200 |     'herein',
201 |     'hereupon',
202 |     'hers',
203 |     'herself',
204 |     'hi',
205 |     'him',
206 |     'himself',
207 |     'his',
208 |     'hither',
209 |     'hopefully',
210 |     'how',
211 |     'howbeit',
212 |     'however',
213 |     'i',
214 |     "i'd",
215 |     "i'll",
216 |     "i'm",
217 |     "i've",
218 |     'ie',
219 |     'if',
220 |     'ignored',
221 |     'immediate',
222 |     'in',
223 |     'inasmuch',
224 |     'inc',
225 |     'indeed',
226 |     'indicate',
227 |     'indicated',
228 |     'indicates',
229 |     'inner',
230 |     'insofar',
231 |     'instead',
232 |     'into',
233 |     'inward',
234 |     'is',
235 |     "isn't",
236 |     'it',
237 |     "it'd",
238 |     "it'll",
239 |     "it's",
240 |     'its',
241 |     'itself',
242 |     'j',
243 |     'just',
244 |     'k',
245 |     'keep',
246 |     'keeps',
247 |     'kept',
248 |     'know',
249 |     'knows',
250 |     'known',
251 |     'l',
252 |     'last',
253 |     'lately',
254 |     'later',
255 |     'latter',
256 |     'latterly',
257 |     'least',
258 |     'less',
259 |     'lest',
260 |     'let',
261 |     "let's",
262 |     'like',
263 |     'liked',
264 |     'likely',
265 |     'little',
266 |     'look',
267 |     'looking',
268 |     'looks',
269 |     'ltd',
270 |     'm',
271 |     'mainly',
272 |     'many',
273 |     'may',
274 |     'maybe',
275 |     'me',
276 |     'mean',
277 |     'meanwhile',
278 |     'merely',
279 |     'might',
280 |     'more',
281 |     'moreover',
282 |     'most',
283 |     'mostly',
284 |     'much',
285 |     'must',
286 |     'my',
287 |     'myself',
288 |     'n',
289 |     'name',
290 |     'namely',
291 |     'nd',
292 |     'near',
293 |     'nearly',
294 |     'necessary',
295 |     'need',
296 |     'needs',
297 |     'neither',
298 |     'never',
299 |     'nevertheless',
300 |     'new',
301 |     'next',
302 |     'nine',
303 |     'no',
304 |     'nobody',
305 |     'non',
306 |     'none',
307 |     'noone',
308 |     'nor',
309 |     'normally',
310 |     'not',
311 |     'nothing',
312 |     'novel',
313 |     'now',
314 |     'nowhere',
315 |     'o',
316 |     'obviously',
317 |     'of',
318 |     'off',
319 |     'often',
320 |     'oh',
321 |     'ok',
322 |     'okay',
323 |     'old',
324 |     'on',
325 |     'once',
326 |     'one',
327 |     'ones',
328 |     'only',
329 |     'onto',
330 |     'or',
331 |     'other',
332 |     'others',
333 |     'otherwise',
334 |     'ought',
335 |     'our',
336 |     'ours',
337 |     'ourselves',
338 |     'out',
339 |     'outside',
340 |     'over',
341 |     'overall',
342 |     'own',
343 |     'p',
344 |     'particular',
345 |     'particularly',
346 |     'per',
347 |     'perhaps',
348 |     'placed',
349 |     'please',
350 |     'plus',
351 |     'possible',
352 |     'presumably',
353 |     'probably',
354 |     'provides',
355 |     'q',
356 |     'que',
357 |     'quite',
358 |     'qv',
359 |     'r',
360 |     'rather',
361 |     'rd',
362 |     're',
363 |     'really',
364 |     'reasonably',
365 |     'regarding',
366 |     'regardless',
367 |     'regards',
368 |     'relatively',
369 |     'respectively',
370 |     'right',
371 |     's',
372 |     'said',
373 |     'same',
374 |     'saw',
375 |     'say',
376 |     'saying',
377 |     'says',
378 |     'second',
379 |     'secondly',
380 |     'see',
381 |     'seeing',
382 |     'seem',
383 |     'seemed',
384 |     'seeming',
385 |     'seems',
386 |     'seen',
387 |     'self',
388 |     'selves',
389 |     'sensible',
390 |     'sent',
391 |     'serious',
392 |     'seriously',
393 |     'seven',
394 |     'several',
395 |     'shall',
396 |     'she',
397 |     'should',
398 |     "shouldn't",
399 |     'since',
400 |     'six',
401 |     'so',
402 |     'some',
403 |     'somebody',
404 |     'somehow',
405 |     'someone',
406 |     'something',
407 |     'sometime',
408 |     'sometimes',
409 |     'somewhat',
410 |     'somewhere',
411 |     'soon',
412 |     'sorry',
413 |     'specified',
414 |     'specify',
415 |     'specifying',
416 |     'still',
417 |     'sub',
418 |     'such',
419 |     'sup',
420 |     'sure',
421 |     't',
422 |     "t's",
423 |     'take',
424 |     'taken',
425 |     'tell',
426 |     'tends',
427 |     'th',
428 |     'than',
429 |     'thank',
430 |     'thanks',
431 |     'thanx',
432 |     'that',
433 |     "that's",
434 |     'thats',
435 |     'the',
436 |     'their',
437 |     'theirs',
438 |     'them',
439 |     'themselves',
440 |     'then',
441 |     'thence',
442 |     'there',
443 |     "there's",
444 |     'thereafter',
445 |     'thereby',
446 |     'therefore',
447 |     'therein',
448 |     'theres',
449 |     'thereupon',
450 |     'these',
451 |     'they',
452 |     "they'd",
453 |     "they'll",
454 |     "they're",
455 |     "they've",
456 |     'think',
457 |     'third',
458 |     'this',
459 |     'thorough',
460 |     'thoroughly',
461 |     'those',
462 |     'though',
463 |     'three',
464 |     'through',
465 |     'throughout',
466 |     'thru',
467 |     'thus',
468 |     'to',
469 |     'together',
470 |     'too',
471 |     'took',
472 |     'toward',
473 |     'towards',
474 |     'tried',
475 |     'tries',
476 |     'truly',
477 |     'try',
478 |     'trying',
479 |     'twice',
480 |     'two',
481 |     'u',
482 |     'un',
483 |     'under',
484 |     'unfortunately',
485 |     'unless',
486 |     'unlikely',
487 |     'until',
488 |     'unto',
489 |     'up',
490 |     'upon',
491 |     'us',
492 |     'use',
493 |     'used',
494 |     'useful',
495 |     'uses',
496 |     'using',
497 |     'usually',
498 |     'uucp',
499 |     'v',
500 |     'value',
501 |     'various',
502 |     'very',
503 |     'via',
504 |     'viz',
505 |     'vs',
506 |     'w',
507 |     'want',
508 |     'wants',
509 |     'was',
510 |     "wasn't",
511 |     'way',
512 |     'we',
513 |     "we'd",
514 |     "we'll",
515 |     "we're",
516 |     "we've",
517 |     'welcome',
518 |     'well',
519 |     'went',
520 |     'were',
521 |     "weren't",
522 |     'what',
523 |     "what's",
524 |     'whatever',
525 |     'when',
526 |     'whence',
527 |     'whenever',
528 |     'where',
529 |     "where's",
530 |     'whereafter',
531 |     'whereas',
532 |     'whereby',
533 |     'wherein',
534 |     'whereupon',
535 |     'wherever',
536 |     'whether',
537 |     'which',
538 |     'while',
539 |     'whither',
540 |     'who',
541 |     "who's",
542 |     'whoever',
543 |     'whole',
544 |     'whom',
545 |     'whose',
546 |     'why',
547 |     'will',
548 |     'willing',
549 |     'wish',
550 |     'with',
551 |     'within',
552 |     'without',
553 |     "won't",
554 |     'wonder',
555 |     'would',
556 |     'would',
557 |     "wouldn't",
558 |     'x',
559 |     'y',
560 |     'yes',
561 |     'yet',
562 |     'you',
563 |     "you'd",
564 |     "you'll",
565 |     "you're",
566 |     "you've",
567 |     'your',
568 |     'yours',
569 |     'yourself',
570 |     'yourselves',
571 |     'z',
572 |     'zero'
573 | ]
574 | 
575 | 
576 | def words():
577 |     return wordlist
578 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | ## Author: David Cedar(2017) https://www.cedar.net.au
  2 | 
  3 | # Super big thanks to Justin for the inspiration https://www.bellingcat.com/resources/2015/08/13/using-python-to-mine-common-crawl/
  4 | 
  5 | ## PRODUCT FINDER ##
  6 | # Scans common crawl infomation for products and
  7 | # saves them to Amazon DynomoDB database.
  8 | 
  9 | #Install
 10 | #   boto3: https://github.com/boto/boto3. Make sure to add AWS Credentials on our machine, see docs.
 11 | #   
 12 | 
 13 | #Test Make sure code is working with local example - Run
 14 | #   python productfinder_helper.py
 15 | #Run
 16 | #   python main --domain amazon.com
 17 | 
 18 | # Version 1.2
 19 | 
 20 | import requests
 21 | import argparse
 22 | import time
 23 | import json
 24 | import StringIO
 25 | import gzip
 26 | import boto3
 27 | from bs4 import BeautifulSoup
 28 | 
 29 | #Own
 30 | from product import Product
 31 | from saveproducts import SaveProducts
 32 | from productfinder import ProductFinder
 33 | 
 34 | import sys
 35 | reload(sys)
 36 | sys.setdefaultencoding('utf8')
 37 | 
 38 | # parse the command line arguments
 39 | ap = argparse.ArgumentParser()
 40 | ap.add_argument("-d","--domain", required=True, help="The domain to target ie. youtube.com")
 41 | args = vars(ap.parse_args())
 42 | 
 43 | domain = args['domain']
 44 | 
 45 | # list of available indices, "2014-52"
 46 | # index_list = ["2017-39", "2017-34", "2017-30", "2017-26", "2017-22", "2017-17"]
 47 | index_list = ["2017-39"]
 48 | 
 49 | 
 50 | ### -----------------------
 51 | ### Searches the Common Crawl Index for a domain.
 52 | ### -----------------------
 53 | def search_domain(domain):
 54 |     record_list = []
 55 |     print "[*] Trying target domain: %s" % domain
 56 |     
 57 |     for index in index_list:
 58 |         print "[*] Trying index %s" % index
 59 |         cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
 60 |         cc_url += "url=%s&matchType=domain&output=json" % domain
 61 |         
 62 |         response = requests.get(cc_url)
 63 |         
 64 |         if response.status_code == 200:
 65 |             records = response.content.splitlines()
 66 |             for record in records:
 67 |                 record_list.append(json.loads(record))  
 68 |             print "[*] Added %d results." % len(records)
 69 |     print "[*] Found a total of %d hits." % len(record_list)
 70 |     return record_list        
 71 | 
 72 | ### -----------------------
 73 | ###     Main Function
 74 | ### -----------------------
 75 | def main():
 76 |     print("Starting CommonCrawl Search")
 77 |     #Finds all relevant domins
 78 |     record_list = search_domain(domain)
 79 |     
 80 |     #Creating save object - Products are saved to Amazon DynamoDB
 81 |     savethread = SaveProducts().start()
 82 |     
 83 |     #Downloads page from CommconCrawl and Inspects, then Extracts infomation
 84 |     product_finder_1 = ProductFinder(record_list[0: int(len(record_list)/2)]).start(savethread)
 85 |     product_finder_2 = ProductFinder(record_list[int(len(record_list)/2): int(len(record_list))]).start(savethread)
 86 |     
 87 |     #Idle Main Thread
 88 |     while product_finder_1.check_status() != True and product_finder_2.check_status() != True:
 89 |         time.sleep(1)
 90 |         
 91 |     while savethread.alive(): 
 92 |         time.sleep(1)
 93 |         
 94 |     #Stop Threads    
 95 |     product_finder_1.stop()
 96 |     product_finder_2.stop()
 97 |     savethread.stop()
 98 | 
 99 | if __name__ == '__main__':
100 |     main()
101 |     #Fin
102 | 


--------------------------------------------------------------------------------
/product.py:
--------------------------------------------------------------------------------
 1 | ## Author: David Cedar(2017)
 2 | import json
 3 | import hashlib
 4 | from time import gmtime, strftime
 5 | 
 6 | # Version 1.2
 7 | 
 8 | ########  Product Class   ########
 9 | class Product:
10 |     title = "e"
11 |     brand = "e"
12 |     url = "e"
13 |     image_url = "e"
14 |     blob_small = "Unknown"
15 |     blob_large = "Unknown"
16 |     source_id = "asin"
17 |     source_domain = "amazon"
18 |     
19 |     ## Inti
20 |     def __init__(self, product=None ):
21 |         #Initialise Object with a Json array instead of using Setters.
22 |         if product != None:           
23 |             self.title = product.title
24 |             self.brand = product.brand
25 |             self.url = product.url
26 |             self.images = product.images
27 |             self.blob_small = product.blob_small
28 |             self.blob_large = product.blob_large
29 |             self.source_id = product.source_id
30 |             self.source_domain = product.source_domain
31 |         print("New Product object Initialised in memory")
32 |         
33 |     ## Setters and Getters    
34 |     def SetTitle(self, title):
35 |         self.title = title.strip()
36 | 
37 |     def SetBrand(self, brand):
38 |         self.brand = brand    
39 |     
40 |     def SetUrl(self, url):
41 |         self.url = url
42 |         
43 |     def SetImage(self, url):
44 |         if len(url) > 1:
45 |             self.image_url = url
46 |     
47 |     def SetSmallBlog(self, blob):
48 |         self.blob_small = blob
49 |     
50 |     def SetLargeBlob(self, blob):
51 |         self.blob_large = blob
52 |         
53 |     def SetSourceID(self, id):
54 |         #Strip removes white spaces and any other none standard chars
55 |         self.source_id = id.strip()
56 |     
57 |     def SetSourceDomain(self, domain):
58 |         self.source_domain = domain
59 |     
60 |     
61 |     ## Support 
62 |     def FormCompleted(self):
63 |         #TODO: Returns True if the fields have been filled in.
64 |         if len(self.title) > 1 and len(self.brand) > 1 and len(self.url) > 1 and len(self.source_id) > 1 and len(self.source_domain) > 1 :
65 |             return True
66 |         else:
67 |             return True
68 | 
69 |     def ReturnJson(self):
70 |         #Reutnrs Object infomation in form of a Json array
71 |         m = hashlib.md5()
72 |         m.update(self.source_id)
73 |         product = {
74 |             'uid':        m.hexdigest(), #Set as main index in DynamoDB
75 |             'title':      self.title,
76 |             'brand':      self.brand,
77 |             'url':        self.url,
78 |             'image_url':     self.image_url,
79 |             'small_keywords': self.blob_small,
80 |             'large_keywords': self.blob_large,
81 |             'sid':        self.source_id,
82 |             'domain':     self.source_domain,
83 |             'date':       strftime("%Y-%m-%d %H:%M:%S", gmtime())
84 |         }
85 |         return (product)
86 | 
87 |     def Print(self):
88 |         print("### Printing Product ###")
89 |         print(self.ReturnJson())
90 |         print("###        end       ###")


--------------------------------------------------------------------------------
/productfinder.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | from product import Product
 3 | from rake import Rake
 4 | import SmartStopList
 5 | import requests
 6 | import json
 7 | import re
 8 | import productfinder_helper
 9 | from threading import Thread
10 | 
11 | # Version 1.2
12 | ## Author: David Cedar(2017)
13 | 
14 | class ProductFinder:
15 |     
16 |     record_list = list()
17 |     save_thread = None
18 |     
19 |     def __init__(self, record_list):
20 |         self.record_list = record_list
21 |         #Helper
22 |         self.stopped = False
23 |         self.finished = False
24 |     
25 |     ###---------------------------------------------------
26 |     ###   Main handler function for the multi threading
27 |     ###--------------------------------------------------- 
28 |     def start(self, savethread):
29 |         print("[*] Starting Product Finder Thread")
30 |         self.save_thread = savethread
31 |         Thread(target=self.update, args=()).start()
32 |         return self
33 |     
34 |     ### Runs on Multi Thread
35 |     def update(self):
36 |         i = 0
37 |         for record in self.record_list:
38 |             i = i + 1
39 |             #URL Checkers. Bad: artist-redirect, %%%, 
40 |             if len(record['url']) > 23 and record['url'].count('%') < 5 and record['url'].count('artist-redirect') < 1:
41 |                 print("[{} of {}]".format(i, len(self.record_list)))
42 |                 #Ok to download and inspect
43 |                 html_content = productfinder_helper.download_page(record)
44 |                 print "[*] Retrieved {} bytes for {}".format(len(html_content), record['url'])
45 |                 #Collects all the pages to a list
46 |                 product, errs = productfinder_helper.extract_product(html_content, record['url'])
47 |                 if product: 
48 |                     self.save_thread.append(product)
49 |                     print("[Success Append]")
50 |                     if errs:
51 |                         print("[Errors:]")
52 |                         for err in errs:
53 |                             print(" *  {}".format(err))
54 |                 else:
55 |                     print("Failed to EXTRACT Product")
56 |                     
57 |             # if the thread indicator variable is set, stop the thread
58 |             # and resource camera resources
59 |             if self.stopped:
60 |                 return
61 |         self.finished = True
62 |         print "[*] Total external links discovered: %d" % len(products)
63 |         
64 |     def check_status(self):
65 |         return self.finished
66 | 
67 | 


--------------------------------------------------------------------------------
/productfinder_helper.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import argparse
  3 | import time
  4 | import json
  5 | import StringIO
  6 | import gzip
  7 | import boto3
  8 | from bs4 import BeautifulSoup
  9 | from product import Product
 10 | from rake import Rake
 11 | import SmartStopList
 12 | import json
 13 | import re
 14 | 
 15 | # Version 1.2
 16 | ## Author: David Cedar(2017)
 17 | #
 18 | # Downloads full page
 19 | #
 20 | def download_page(record):
 21 | 
 22 |     offset, length = int(record['offset']), int(record['length'])
 23 |     offset_end = offset + length - 1
 24 | 
 25 |     # We'll get the file via HTTPS so we don't need to worry about S3 credentials
 26 |     # Getting the file on S3 is equivalent however - you can request a Range
 27 |     prefix = 'https://commoncrawl.s3.amazonaws.com/'
 28 | 
 29 |     # We can then use the Range header to ask for just this set of bytes
 30 |     resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
 31 | 
 32 |     # The page is stored compressed (gzip) to save space
 33 |     # We can extract it using the GZIP library
 34 |     raw_data = StringIO.StringIO(resp.content)
 35 |     f = gzip.GzipFile(fileobj=raw_data)
 36 | 
 37 |     # What we have now is just the WARC response, formatted:
 38 |     data = f.read()
 39 | 
 40 |     response = ""
 41 | 
 42 |     if len(data):
 43 |         try:
 44 |             warc, header, response = data.strip().split('\r\n\r\n', 2)
 45 |         except:
 46 |             pass
 47 | 
 48 |     return response
 49 | 
 50 | #
 51 | # Helper function for Check_Page. Searchs a page for a table and loops through to find target.
 52 | #
 53 | def search_table(parsed, att, target):
 54 |     table_1 = parsed.find("table", attrs=att)
 55 |     if table_1 == None:
 56 |         #print("Failed to search table")
 57 |         return (False, None)
 58 |     table_1_rows = table_1.find_all('tr')
 59 |     found = False
 60 |     value = ""
 61 |     #Loop rows
 62 |     for row in table_1_rows:
 63 |         ths = row.find_all("th")
 64 |         tds = row.find_all("td")
 65 |         rn = ths[0].get_text()
 66 |         #Check th of table
 67 |         if target in rn:
 68 |             value = tds[0].get_text().strip()
 69 |             if len(value) > 2:
 70 |                 found = True  
 71 |     if found:
 72 |         return (True, value)
 73 |     else:
 74 |         return (False, None)
 75 | 
 76 | #
 77 | # Perform Precheck to see if page is a product
 78 | #
 79 | def check_page(parsed):
 80 |     parser = parsed
 81 | 
 82 |     #First Check of ASIN
 83 |     found, asin = search_table(parser, {"id": "productDetails_detailBullets_sections1"}, "ASIN")
 84 |     if found:
 85 |         return (True, asin)
 86 | 
 87 |     #Second Check of ASIN
 88 |     check_asin_2 = parser.find("b", text="ASIN:")    
 89 |     check_asin_3 = parser.find("b", text="ASIN: ")    
 90 | 
 91 |     if check_asin_2 == None and check_asin_3 == None:
 92 |         print("Page is Not a Product")
 93 |         return (False, None)
 94 |     else:
 95 |         if check_asin_2 != None:
 96 |             asin = check_asin_2.findParent().text[5:]
 97 |         if check_asin_3 != None:
 98 |             asin = check_asin_3.findParent().text[5:]
 99 |         #TODO: Add additional checks to confirm the page is definatly a product!
100 |         print("Page is a Product")
101 |         return (True, asin)
102 | 
103 | #
104 | # Extract Product from the single HTML page.   
105 | #
106 | def extract_product(html_content, url):
107 |     #String Buffer
108 |     string_buffer = "" 
109 |     errs = list()
110 | 
111 |     #Read page and read to extract product infomation
112 |     parser = BeautifulSoup(html_content, "html.parser")  
113 | 
114 |     #Check if the page is a product, if not skip page.
115 |     truth, asin = check_page(parser)
116 |     if not truth:
117 |         errs.append("Not product")
118 |         return (False, errs)
119 | 
120 |     #New Product as a object
121 |     product = Product()
122 |     #New Keyword rank
123 |     keyword = Rake(SmartStopList.words())
124 | 
125 | 
126 |     #Find URL
127 |     product.SetUrl(url)
128 | 
129 |     #Find Brand: Note: Some products have an image for the brand 
130 |     truth, string_buffer = search_table(parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
131 |     if truth:
132 |         product.SetBrand(string_buffer)
133 |     else:
134 |         string_buffer = parser.find("a", attrs={"id": "brand"})
135 |         if string_buffer != None:
136 |             product.SetBrand(string_buffer.get_text().strip())
137 |         else:
138 |             errs.append("Could not find Brand")
139 | 
140 |     #Find Title
141 |     string_buffer = parser.find("span", attrs={"id": "productTitle"})
142 |     if string_buffer != None:
143 |         product.SetTitle(string_buffer.get_text().strip())
144 |     else:
145 |         errs.append("Could not find Title")
146 |         return (False, errs) 
147 | 
148 |     #Find Image
149 |     string_buffer = parser.find("img", attrs={"id": "landingImage"})
150 |     if string_buffer != None:
151 |         string_buffer = string_buffer.get("data-old-hires")
152 |         if len(string_buffer) < 2:
153 |             string_buffer = parser.find("img", attrs={"id": "landingImage"}).get("data-a-dynamic-image")
154 |             m = re.search('https://(.+?).jpg', string_buffer)
155 |             if m:
156 |                 string_buffer = m.group(1)
157 |                 string_buffer = "https://{}.jpg".format(string_buffer)
158 |         #print ("Img Url: "+string_buffer)
159 |         product.SetImage(string_buffer)
160 |     else:
161 |         errs.append("Could not find Image")
162 | 
163 |     #Find Small Blob
164 |     #TODO: Need to perform keyword analysis
165 |     string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
166 |     if string_buffer != None:
167 |         string_buffer = string_buffer.find("ul")
168 |     try:        
169 |         string_buffer = string_buffer.find_all("li")
170 |         if string_buffer != None:
171 |             string_buffer_2 = ""
172 |             for span in string_buffer:
173 |                 string_buffer_3 = span.find("span")
174 |                 if string_buffer_3 != None:
175 |                     string_buffer_3 = string_buffer_3.get_text()
176 |                     try:
177 |                         string_buffer_2 = "{} {}".format(string_buffer_2, string_buffer_3.strip())
178 |                     except: 
179 |                         pass
180 |             saved_buffer = string_buffer_2.strip()
181 |             #Calculating Key Words
182 |             keywords_1 = keyword.run(saved_buffer)
183 |             product.SetSmallBlog(keywords_1)
184 |     except:	
185 |         errs.append("Error finding li")
186 | 
187 |     else:
188 |         errs.append("Could not find small section keywords")
189 | 
190 | 
191 |     #Find Large Blob
192 |     #TODO: Need to perform keyword analysis
193 |     string_buffer = parser.find("div", attrs={"id": "productDescription"})
194 |     if string_buffer != None:
195 |         string_buffer = string_buffer.find("p")
196 |     if string_buffer != None:	
197 |         string_buffer = string_buffer.get_text()
198 |         saved_buffer = string_buffer.strip()
199 |         #Calculating Key Words
200 |         keywords_2 = keyword.run(saved_buffer)
201 |         product.SetLargeBlob(keywords_2)
202 |     else:
203 |         errs.append("Could not find large section keywords")
204 | 
205 |     #Find ASIN
206 |     product.SetSourceID(asin)
207 | 
208 |     #TODO: Perform price save!
209 | 
210 |     #Append the product to large list of products
211 |     if product.FormCompleted():
212 |         return (product, errs)
213 |     else:
214 |         return (False, errs)
215 | 
216 | ### Example code running from html file 
217 | if __name__ == '__main__':
218 |     print("Script Starting")
219 |     html = open("test_html/amazon2.html")
220 |     url = "https://www.amazon.com/gp/product/B018YHS8BS/ref=s9u_cartx_gw_i3?ie=UTF8&fpl=fresh&pd_rd_i=B018YHS8BS&pd_rd_r=1ZPRY1Q53VY71P1MH3R1&pd_rd_w=E8D0B&pd_rd_wg=l88CZ&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=&pf_rd_r=EQZ2X5XE1BBK1J41FKVB&pf_rd_t=36701&pf_rd_p=eb9f3a57-8cdf-4fa3-a48e-183b5d4b6520&pf_rd_i=desktop"
221 |     products = list()
222 |     product, errs = extract_product(html, url)
223 |     if product:
224 |         products.append( product )
225 |         product.Print()
226 |         print("[Success Append]")
227 |     else:
228 |         print("Returned False")
229 |     if errs:
230 |         print("[Errors:]")
231 |         for err in errs:
232 |             print(" *  {}".format(err))    
233 |     print("Script Finished")
234 | 


--------------------------------------------------------------------------------
/rake.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | # Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm
  3 | # https://github.com/fabianvf/python-rake
  4 | # as described in:
  5 | # Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
  6 | # Automatic keyword extraction from indi-vidual documents.
  7 | # In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.
  8 | 
  9 | import re
 10 | import operator
 11 | import decimal
 12 | 
 13 | 
 14 | def is_number(s):
 15 |     try:
 16 |         float(s) if '.' in s else int(s)
 17 |         return True
 18 |     except ValueError:
 19 |         return False
 20 | 
 21 | 
 22 | def SmartStopList():
 23 |     from .stoplists import SmartStopList
 24 |     return SmartStopList.words()
 25 | 
 26 | 
 27 | def FoxStopList():
 28 |     from .stoplists import FoxStopList
 29 |     return FoxStopList.words()
 30 | 
 31 | 
 32 | def MySQLStopList():
 33 |     from .stoplists import MySQLStopList
 34 |     return MySQLStopList.words()
 35 | 
 36 | 
 37 | def NLTKStopList():
 38 |     from .stoplists import NLTKStopList
 39 |     return NLTKStopList.words()
 40 | 
 41 | 
 42 | def GoogleSearchStopList():
 43 |     from .stoplists import GoogleSearchStopList
 44 |     return GoogleSearchStopList.words()
 45 | 
 46 | 
 47 | def RanksNLLongStopList():
 48 |     from .stoplists import RanksNLLongStopList
 49 |     return RanksNLLongStopList.words()
 50 | 
 51 | 
 52 | def RanksNLStoplist():
 53 |     from .stoplists import RanksNLStoplist
 54 |     return RanksNLStoplist.words()
 55 | 
 56 | 
 57 | def load_stop_words(stop_word_file, regex):
 58 |     with open(stop_word_file) as stop_word_file:
 59 |         stop_words = re.split(regex, stop_word_file.read())
 60 |     return [word for word in stop_words if word not in ('', ' ')]  # filters empty string matches
 61 | 
 62 | 
 63 | def separate_words(text, min_word_return_size):
 64 |     """
 65 |     Utility function to return a list of all words that are have a length greater than a specified number of characters.
 66 |     @param text The text that must be split in to words.
 67 |     @param min_word_return_size The minimum no of characters a word must have to be included.
 68 |     """
 69 |     splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
 70 |     words = []
 71 |     for single_word in splitter.split(text):
 72 |         current_word = single_word.strip().lower()
 73 |         # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
 74 |         if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
 75 |             words.append(current_word)
 76 |     return words
 77 | 
 78 | 
 79 | def split_sentences(text):
 80 |     """
 81 |     Utility function to return a list of sentences.
 82 |     @param text The text that must be split in to sentences.
 83 |     """
 84 |     sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
 85 |     sentences = sentence_delimiters.split(text)
 86 |     return sentences
 87 | 
 88 | 
 89 | def build_stop_word_regex(stop_word_list):
 90 |     stop_word_regex_list = []
 91 |     for word in stop_word_list:
 92 |         word_regex = r'\b' + word + r'(?![\w-])'
 93 |         stop_word_regex_list.append(word_regex)
 94 |     return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
 95 | 
 96 | 
 97 | def generate_candidate_keywords(sentence_list, stopword_pattern):
 98 |     phrase_list = []
 99 |     for s in sentence_list:
100 |         tmp = re.sub(stopword_pattern, '|', s.strip())
101 |         phrases = tmp.split("|")
102 |         for phrase in phrases:
103 |             phrase = phrase.strip().lower()
104 |             if phrase != "":
105 |                 phrase_list.append(phrase)
106 |     return phrase_list
107 | 
108 | 
109 | def calculate_word_scores(phraseList):
110 |     word_frequency = {}
111 |     word_degree = {}
112 |     for phrase in phraseList:
113 |         word_list = separate_words(phrase, 0)
114 |         word_list_length = len(word_list)
115 |         word_list_degree = word_list_length - 1
116 |         for word in word_list:
117 |             word_frequency.setdefault(word, 0)
118 |             word_frequency[word] += 1
119 |             word_degree.setdefault(word, 0)
120 |             word_degree[word] += word_list_degree
121 |     for item in word_frequency:
122 |         word_degree[item] = word_degree[item] + word_frequency[item]
123 | 
124 |     # Calculate Word scores = deg(w)/frew(w)
125 |     word_score = {}
126 |     for item in word_frequency:
127 |         word_score.setdefault(item, 0)
128 |         word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)
129 |     return word_score
130 | 
131 | 
132 | def generate_candidate_keyword_scores(phrase_list, word_score):
133 |     keyword_candidates = {}
134 |     for phrase in phrase_list:
135 |         keyword_candidates.setdefault(phrase, 0)
136 |         word_list = separate_words(phrase, 0)
137 |         candidate_score = 0
138 |         for word in word_list:
139 |             candidate_score += word_score[word]
140 |         keyword_candidates[phrase] = "%.5g" % candidate_score
141 |     return keyword_candidates
142 | 
143 | 
144 | class Rake(object):
145 |     def __init__(self, stop_words, regex='[\W\n]+'):
146 |         #lets users call predefined stopwords easily in a platform agnostic manner or use their own list
147 |         if isinstance(stop_words, list):
148 |             self.__stop_words_pattern = build_stop_word_regex(stop_words)
149 |         else:
150 |             self.__stop_words_pattern = build_stop_word_regex(load_stop_words(stop_words, regex))
151 | 
152 |     def run(self, text):
153 |         sentence_list = split_sentences(text)
154 | 
155 |         phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
156 |         #print("\nPhrase_list: {}".format(phrase_list))
157 |         
158 |         word_scores = calculate_word_scores(phrase_list)
159 |         #print("\nWord_scores: {}".format(word_scores))
160 |         
161 |         keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)
162 |         #print("\nKeyword Candidates: {}".format(keyword_candidates))
163 |         
164 |         #sorted_keywords = sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True)
165 |         return keyword_candidates
166 | 


--------------------------------------------------------------------------------
/saveproducts.py:
--------------------------------------------------------------------------------
 1 | from threading import Thread
 2 | import boto3
 3 | 
 4 | # Version 1.2
 5 | ## Author: David Cedar(2017)
 6 | 
 7 | ### ------------------------------------
 8 | ###    Save Products to DynamoDB Class
 9 | ### ------------------------------------
10 | class SaveProducts:
11 |     
12 |     products_buffer = list()
13 |     
14 |     #Constructor function
15 |     def __init__ (self):
16 |         ### Save prodct into database
17 |         self.dynamodb = boto3.resource('dynamodb')
18 |         self.table = self.dynamodb.Table('productfinder_product_2')
19 |         #Helper
20 |         self.stopped = False
21 |     
22 |     ###---------------------------------------------------
23 |     ###   Main handler function for the multi threading
24 |     ###--------------------------------------------------- 
25 |     def start(self):
26 |         Thread(target=self.update, args=()).start()
27 |         return self
28 |     
29 |     ### Runs on Multi Thread
30 |     def update(self):
31 |         with self.table.batch_writer() as batch:
32 |             #Keep Running for Thread Life
33 |             while True:
34 |                 # keep looping infinitely until the thread is stopped
35 |                 if len(self.products_buffer) > 0:
36 |                     try:
37 |                         self.table.put_item(Item = self.products_buffer[0].ReturnJson()) #Save oldest product
38 |                         self.products_buffer.pop(0) #Remove oldest product
39 |                         print("[**] Successfully Uploaded Product")
40 |                         print("[*] Buffer Size: {}".format(len(self.products_buffer)))
41 |                     except:
42 |                         #Failed to save product into db.
43 |                         #TODO: Add err message
44 |                         print("[-] Upload Error")
45 |                         self.stopped = True
46 |                         
47 | 
48 |                 # if the thread indicator variable is set, stop the thread
49 |                 # and resource camera resources
50 |                 if self.stopped:
51 |                         return
52 |                     
53 |     def append(self, product):
54 |         # Append product into buffer
55 |         if product != None:
56 |             self.products_buffer.append(product)
57 |             print("yes")
58 |             
59 |     def alive(self):
60 |         if len(self.products_buffer) < 1: 
61 |             return False
62 |         else:
63 |             return True
64 |         
65 |     def stop(self):
66 |         # indicate that the thread should be stopped
67 |         self.stopped = True


--------------------------------------------------------------------------------