├── README.md ├── SmartStopList.py ├── main.py ├── product.py ├── productfinder.py ├── productfinder_helper.py ├── rake.py ├── saveproducts.py └── test_html ├── amazon.html └── amazon2.html /README.md: -------------------------------------------------------------------------------- 1 | # python-common-crawl-amazon-example 2 | Amazon PRODUCT FINDER using Python and Commoncrawl dataset 3 | Scans common crawl infomation for products and 4 | saves them to Amazon DynomoDB database. 5 | By David Cedar https://www.cedar.net.au/using-python-and-common-crawl-to-find-products-from-amazon-com/ 6 | 7 | # Install 8 | boto3: https://github.com/boto/boto3. Make sure to add AWS Credentials on our machine, see docs. 9 | & other using pip. 10 | 11 | # Test 12 | Make sure code is working with local example - Run 13 | python productfinder_helper.py 14 | # Run 15 | python main --domain amazon.com 16 | -------------------------------------------------------------------------------- /SmartStopList.py: -------------------------------------------------------------------------------- 1 | wordlist = [ 2 | 'a', 3 | "a's", 4 | 'able', 5 | 'about', 6 | 'above', 7 | 'according', 8 | 'accordingly', 9 | 'across', 10 | 'actually', 11 | 'after', 12 | 'afterwards', 13 | 'again', 14 | 'against', 15 | "ain't", 16 | 'all', 17 | 'allow', 18 | 'allows', 19 | 'almost', 20 | 'alone', 21 | 'along', 22 | 'already', 23 | 'also', 24 | 'although', 25 | 'always', 26 | 'am', 27 | 'among', 28 | 'amongst', 29 | 'an', 30 | 'and', 31 | 'another', 32 | 'any', 33 | 'anybody', 34 | 'anyhow', 35 | 'anyone', 36 | 'anything', 37 | 'anyway', 38 | 'anyways', 39 | 'anywhere', 40 | 'apart', 41 | 'appear', 42 | 'appreciate', 43 | 'appropriate', 44 | 'are', 45 | "aren't", 46 | 'around', 47 | 'as', 48 | 'aside', 49 | 'ask', 50 | 'asking', 51 | 'associated', 52 | 'at', 53 | 'available', 54 | 'away', 55 | 'awfully', 56 | 'b', 57 | 'be', 58 | 'became', 59 | 'because', 60 | 'become', 61 | 'becomes', 62 | 'becoming', 63 | 'been', 64 | 'before', 65 | 'beforehand', 66 | 'behind', 67 | 'being', 68 | 'believe', 69 | 'below', 70 | 'beside', 71 | 'besides', 72 | 'best', 73 | 'better', 74 | 'between', 75 | 'beyond', 76 | 'both', 77 | 'brief', 78 | 'but', 79 | 'by', 80 | 'c', 81 | "c'mon", 82 | "c's", 83 | 'came', 84 | 'can', 85 | "can't", 86 | 'cannot', 87 | 'cant', 88 | 'cause', 89 | 'causes', 90 | 'certain', 91 | 'certainly', 92 | 'changes', 93 | 'clearly', 94 | 'co', 95 | 'com', 96 | 'come', 97 | 'comes', 98 | 'concerning', 99 | 'consequently', 100 | 'consider', 101 | 'considering', 102 | 'contain', 103 | 'containing', 104 | 'contains', 105 | 'corresponding', 106 | 'could', 107 | "couldn't", 108 | 'course', 109 | 'currently', 110 | 'd', 111 | 'definitely', 112 | 'described', 113 | 'despite', 114 | 'did', 115 | "didn't", 116 | 'different', 117 | 'do', 118 | 'does', 119 | "doesn't", 120 | 'doing', 121 | "don't", 122 | 'done', 123 | 'down', 124 | 'downwards', 125 | 'during', 126 | 'e', 127 | 'each', 128 | 'edu', 129 | 'eg', 130 | 'eight', 131 | 'either', 132 | 'else', 133 | 'elsewhere', 134 | 'enough', 135 | 'entirely', 136 | 'especially', 137 | 'et', 138 | 'etc', 139 | 'even', 140 | 'ever', 141 | 'every', 142 | 'everybody', 143 | 'everyone', 144 | 'everything', 145 | 'everywhere', 146 | 'ex', 147 | 'exactly', 148 | 'example', 149 | 'except', 150 | 'f', 151 | 'far', 152 | 'few', 153 | 'fifth', 154 | 'first', 155 | 'five', 156 | 'followed', 157 | 'following', 158 | 'follows', 159 | 'for', 160 | 'former', 161 | 'formerly', 162 | 'forth', 163 | 'four', 164 | 'from', 165 | 'further', 166 | 'furthermore', 167 | 'g', 168 | 'get', 169 | 'gets', 170 | 'getting', 171 | 'given', 172 | 'gives', 173 | 'go', 174 | 'goes', 175 | 'going', 176 | 'gone', 177 | 'got', 178 | 'gotten', 179 | 'greetings', 180 | 'h', 181 | 'had', 182 | "hadn't", 183 | 'happens', 184 | 'hardly', 185 | 'has', 186 | "hasn't", 187 | 'have', 188 | "haven't", 189 | 'having', 190 | 'he', 191 | "he's", 192 | 'hello', 193 | 'help', 194 | 'hence', 195 | 'her', 196 | 'here', 197 | "here's", 198 | 'hereafter', 199 | 'hereby', 200 | 'herein', 201 | 'hereupon', 202 | 'hers', 203 | 'herself', 204 | 'hi', 205 | 'him', 206 | 'himself', 207 | 'his', 208 | 'hither', 209 | 'hopefully', 210 | 'how', 211 | 'howbeit', 212 | 'however', 213 | 'i', 214 | "i'd", 215 | "i'll", 216 | "i'm", 217 | "i've", 218 | 'ie', 219 | 'if', 220 | 'ignored', 221 | 'immediate', 222 | 'in', 223 | 'inasmuch', 224 | 'inc', 225 | 'indeed', 226 | 'indicate', 227 | 'indicated', 228 | 'indicates', 229 | 'inner', 230 | 'insofar', 231 | 'instead', 232 | 'into', 233 | 'inward', 234 | 'is', 235 | "isn't", 236 | 'it', 237 | "it'd", 238 | "it'll", 239 | "it's", 240 | 'its', 241 | 'itself', 242 | 'j', 243 | 'just', 244 | 'k', 245 | 'keep', 246 | 'keeps', 247 | 'kept', 248 | 'know', 249 | 'knows', 250 | 'known', 251 | 'l', 252 | 'last', 253 | 'lately', 254 | 'later', 255 | 'latter', 256 | 'latterly', 257 | 'least', 258 | 'less', 259 | 'lest', 260 | 'let', 261 | "let's", 262 | 'like', 263 | 'liked', 264 | 'likely', 265 | 'little', 266 | 'look', 267 | 'looking', 268 | 'looks', 269 | 'ltd', 270 | 'm', 271 | 'mainly', 272 | 'many', 273 | 'may', 274 | 'maybe', 275 | 'me', 276 | 'mean', 277 | 'meanwhile', 278 | 'merely', 279 | 'might', 280 | 'more', 281 | 'moreover', 282 | 'most', 283 | 'mostly', 284 | 'much', 285 | 'must', 286 | 'my', 287 | 'myself', 288 | 'n', 289 | 'name', 290 | 'namely', 291 | 'nd', 292 | 'near', 293 | 'nearly', 294 | 'necessary', 295 | 'need', 296 | 'needs', 297 | 'neither', 298 | 'never', 299 | 'nevertheless', 300 | 'new', 301 | 'next', 302 | 'nine', 303 | 'no', 304 | 'nobody', 305 | 'non', 306 | 'none', 307 | 'noone', 308 | 'nor', 309 | 'normally', 310 | 'not', 311 | 'nothing', 312 | 'novel', 313 | 'now', 314 | 'nowhere', 315 | 'o', 316 | 'obviously', 317 | 'of', 318 | 'off', 319 | 'often', 320 | 'oh', 321 | 'ok', 322 | 'okay', 323 | 'old', 324 | 'on', 325 | 'once', 326 | 'one', 327 | 'ones', 328 | 'only', 329 | 'onto', 330 | 'or', 331 | 'other', 332 | 'others', 333 | 'otherwise', 334 | 'ought', 335 | 'our', 336 | 'ours', 337 | 'ourselves', 338 | 'out', 339 | 'outside', 340 | 'over', 341 | 'overall', 342 | 'own', 343 | 'p', 344 | 'particular', 345 | 'particularly', 346 | 'per', 347 | 'perhaps', 348 | 'placed', 349 | 'please', 350 | 'plus', 351 | 'possible', 352 | 'presumably', 353 | 'probably', 354 | 'provides', 355 | 'q', 356 | 'que', 357 | 'quite', 358 | 'qv', 359 | 'r', 360 | 'rather', 361 | 'rd', 362 | 're', 363 | 'really', 364 | 'reasonably', 365 | 'regarding', 366 | 'regardless', 367 | 'regards', 368 | 'relatively', 369 | 'respectively', 370 | 'right', 371 | 's', 372 | 'said', 373 | 'same', 374 | 'saw', 375 | 'say', 376 | 'saying', 377 | 'says', 378 | 'second', 379 | 'secondly', 380 | 'see', 381 | 'seeing', 382 | 'seem', 383 | 'seemed', 384 | 'seeming', 385 | 'seems', 386 | 'seen', 387 | 'self', 388 | 'selves', 389 | 'sensible', 390 | 'sent', 391 | 'serious', 392 | 'seriously', 393 | 'seven', 394 | 'several', 395 | 'shall', 396 | 'she', 397 | 'should', 398 | "shouldn't", 399 | 'since', 400 | 'six', 401 | 'so', 402 | 'some', 403 | 'somebody', 404 | 'somehow', 405 | 'someone', 406 | 'something', 407 | 'sometime', 408 | 'sometimes', 409 | 'somewhat', 410 | 'somewhere', 411 | 'soon', 412 | 'sorry', 413 | 'specified', 414 | 'specify', 415 | 'specifying', 416 | 'still', 417 | 'sub', 418 | 'such', 419 | 'sup', 420 | 'sure', 421 | 't', 422 | "t's", 423 | 'take', 424 | 'taken', 425 | 'tell', 426 | 'tends', 427 | 'th', 428 | 'than', 429 | 'thank', 430 | 'thanks', 431 | 'thanx', 432 | 'that', 433 | "that's", 434 | 'thats', 435 | 'the', 436 | 'their', 437 | 'theirs', 438 | 'them', 439 | 'themselves', 440 | 'then', 441 | 'thence', 442 | 'there', 443 | "there's", 444 | 'thereafter', 445 | 'thereby', 446 | 'therefore', 447 | 'therein', 448 | 'theres', 449 | 'thereupon', 450 | 'these', 451 | 'they', 452 | "they'd", 453 | "they'll", 454 | "they're", 455 | "they've", 456 | 'think', 457 | 'third', 458 | 'this', 459 | 'thorough', 460 | 'thoroughly', 461 | 'those', 462 | 'though', 463 | 'three', 464 | 'through', 465 | 'throughout', 466 | 'thru', 467 | 'thus', 468 | 'to', 469 | 'together', 470 | 'too', 471 | 'took', 472 | 'toward', 473 | 'towards', 474 | 'tried', 475 | 'tries', 476 | 'truly', 477 | 'try', 478 | 'trying', 479 | 'twice', 480 | 'two', 481 | 'u', 482 | 'un', 483 | 'under', 484 | 'unfortunately', 485 | 'unless', 486 | 'unlikely', 487 | 'until', 488 | 'unto', 489 | 'up', 490 | 'upon', 491 | 'us', 492 | 'use', 493 | 'used', 494 | 'useful', 495 | 'uses', 496 | 'using', 497 | 'usually', 498 | 'uucp', 499 | 'v', 500 | 'value', 501 | 'various', 502 | 'very', 503 | 'via', 504 | 'viz', 505 | 'vs', 506 | 'w', 507 | 'want', 508 | 'wants', 509 | 'was', 510 | "wasn't", 511 | 'way', 512 | 'we', 513 | "we'd", 514 | "we'll", 515 | "we're", 516 | "we've", 517 | 'welcome', 518 | 'well', 519 | 'went', 520 | 'were', 521 | "weren't", 522 | 'what', 523 | "what's", 524 | 'whatever', 525 | 'when', 526 | 'whence', 527 | 'whenever', 528 | 'where', 529 | "where's", 530 | 'whereafter', 531 | 'whereas', 532 | 'whereby', 533 | 'wherein', 534 | 'whereupon', 535 | 'wherever', 536 | 'whether', 537 | 'which', 538 | 'while', 539 | 'whither', 540 | 'who', 541 | "who's", 542 | 'whoever', 543 | 'whole', 544 | 'whom', 545 | 'whose', 546 | 'why', 547 | 'will', 548 | 'willing', 549 | 'wish', 550 | 'with', 551 | 'within', 552 | 'without', 553 | "won't", 554 | 'wonder', 555 | 'would', 556 | 'would', 557 | "wouldn't", 558 | 'x', 559 | 'y', 560 | 'yes', 561 | 'yet', 562 | 'you', 563 | "you'd", 564 | "you'll", 565 | "you're", 566 | "you've", 567 | 'your', 568 | 'yours', 569 | 'yourself', 570 | 'yourselves', 571 | 'z', 572 | 'zero' 573 | ] 574 | 575 | 576 | def words(): 577 | return wordlist 578 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | ## Author: David Cedar(2017) https://www.cedar.net.au 2 | 3 | # Super big thanks to Justin for the inspiration https://www.bellingcat.com/resources/2015/08/13/using-python-to-mine-common-crawl/ 4 | 5 | ## PRODUCT FINDER ## 6 | # Scans common crawl infomation for products and 7 | # saves them to Amazon DynomoDB database. 8 | 9 | #Install 10 | # boto3: https://github.com/boto/boto3. Make sure to add AWS Credentials on our machine, see docs. 11 | # 12 | 13 | #Test Make sure code is working with local example - Run 14 | # python productfinder_helper.py 15 | #Run 16 | # python main --domain amazon.com 17 | 18 | # Version 1.2 19 | 20 | import requests 21 | import argparse 22 | import time 23 | import json 24 | import StringIO 25 | import gzip 26 | import boto3 27 | from bs4 import BeautifulSoup 28 | 29 | #Own 30 | from product import Product 31 | from saveproducts import SaveProducts 32 | from productfinder import ProductFinder 33 | 34 | import sys 35 | reload(sys) 36 | sys.setdefaultencoding('utf8') 37 | 38 | # parse the command line arguments 39 | ap = argparse.ArgumentParser() 40 | ap.add_argument("-d","--domain", required=True, help="The domain to target ie. youtube.com") 41 | args = vars(ap.parse_args()) 42 | 43 | domain = args['domain'] 44 | 45 | # list of available indices, "2014-52" 46 | # index_list = ["2017-39", "2017-34", "2017-30", "2017-26", "2017-22", "2017-17"] 47 | index_list = ["2017-39"] 48 | 49 | 50 | ### ----------------------- 51 | ### Searches the Common Crawl Index for a domain. 52 | ### ----------------------- 53 | def search_domain(domain): 54 | record_list = [] 55 | print "[*] Trying target domain: %s" % domain 56 | 57 | for index in index_list: 58 | print "[*] Trying index %s" % index 59 | cc_url = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index 60 | cc_url += "url=%s&matchType=domain&output=json" % domain 61 | 62 | response = requests.get(cc_url) 63 | 64 | if response.status_code == 200: 65 | records = response.content.splitlines() 66 | for record in records: 67 | record_list.append(json.loads(record)) 68 | print "[*] Added %d results." % len(records) 69 | print "[*] Found a total of %d hits." % len(record_list) 70 | return record_list 71 | 72 | ### ----------------------- 73 | ### Main Function 74 | ### ----------------------- 75 | def main(): 76 | print("Starting CommonCrawl Search") 77 | #Finds all relevant domins 78 | record_list = search_domain(domain) 79 | 80 | #Creating save object - Products are saved to Amazon DynamoDB 81 | savethread = SaveProducts().start() 82 | 83 | #Downloads page from CommconCrawl and Inspects, then Extracts infomation 84 | product_finder_1 = ProductFinder(record_list[0: int(len(record_list)/2)]).start(savethread) 85 | product_finder_2 = ProductFinder(record_list[int(len(record_list)/2): int(len(record_list))]).start(savethread) 86 | 87 | #Idle Main Thread 88 | while product_finder_1.check_status() != True and product_finder_2.check_status() != True: 89 | time.sleep(1) 90 | 91 | while savethread.alive(): 92 | time.sleep(1) 93 | 94 | #Stop Threads 95 | product_finder_1.stop() 96 | product_finder_2.stop() 97 | savethread.stop() 98 | 99 | if __name__ == '__main__': 100 | main() 101 | #Fin 102 | -------------------------------------------------------------------------------- /product.py: -------------------------------------------------------------------------------- 1 | ## Author: David Cedar(2017) 2 | import json 3 | import hashlib 4 | from time import gmtime, strftime 5 | 6 | # Version 1.2 7 | 8 | ######## Product Class ######## 9 | class Product: 10 | title = "e" 11 | brand = "e" 12 | url = "e" 13 | image_url = "e" 14 | blob_small = "Unknown" 15 | blob_large = "Unknown" 16 | source_id = "asin" 17 | source_domain = "amazon" 18 | 19 | ## Inti 20 | def __init__(self, product=None ): 21 | #Initialise Object with a Json array instead of using Setters. 22 | if product != None: 23 | self.title = product.title 24 | self.brand = product.brand 25 | self.url = product.url 26 | self.images = product.images 27 | self.blob_small = product.blob_small 28 | self.blob_large = product.blob_large 29 | self.source_id = product.source_id 30 | self.source_domain = product.source_domain 31 | print("New Product object Initialised in memory") 32 | 33 | ## Setters and Getters 34 | def SetTitle(self, title): 35 | self.title = title.strip() 36 | 37 | def SetBrand(self, brand): 38 | self.brand = brand 39 | 40 | def SetUrl(self, url): 41 | self.url = url 42 | 43 | def SetImage(self, url): 44 | if len(url) > 1: 45 | self.image_url = url 46 | 47 | def SetSmallBlog(self, blob): 48 | self.blob_small = blob 49 | 50 | def SetLargeBlob(self, blob): 51 | self.blob_large = blob 52 | 53 | def SetSourceID(self, id): 54 | #Strip removes white spaces and any other none standard chars 55 | self.source_id = id.strip() 56 | 57 | def SetSourceDomain(self, domain): 58 | self.source_domain = domain 59 | 60 | 61 | ## Support 62 | def FormCompleted(self): 63 | #TODO: Returns True if the fields have been filled in. 64 | if len(self.title) > 1 and len(self.brand) > 1 and len(self.url) > 1 and len(self.source_id) > 1 and len(self.source_domain) > 1 : 65 | return True 66 | else: 67 | return True 68 | 69 | def ReturnJson(self): 70 | #Reutnrs Object infomation in form of a Json array 71 | m = hashlib.md5() 72 | m.update(self.source_id) 73 | product = { 74 | 'uid': m.hexdigest(), #Set as main index in DynamoDB 75 | 'title': self.title, 76 | 'brand': self.brand, 77 | 'url': self.url, 78 | 'image_url': self.image_url, 79 | 'small_keywords': self.blob_small, 80 | 'large_keywords': self.blob_large, 81 | 'sid': self.source_id, 82 | 'domain': self.source_domain, 83 | 'date': strftime("%Y-%m-%d %H:%M:%S", gmtime()) 84 | } 85 | return (product) 86 | 87 | def Print(self): 88 | print("### Printing Product ###") 89 | print(self.ReturnJson()) 90 | print("### end ###") -------------------------------------------------------------------------------- /productfinder.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from product import Product 3 | from rake import Rake 4 | import SmartStopList 5 | import requests 6 | import json 7 | import re 8 | import productfinder_helper 9 | from threading import Thread 10 | 11 | # Version 1.2 12 | ## Author: David Cedar(2017) 13 | 14 | class ProductFinder: 15 | 16 | record_list = list() 17 | save_thread = None 18 | 19 | def __init__(self, record_list): 20 | self.record_list = record_list 21 | #Helper 22 | self.stopped = False 23 | self.finished = False 24 | 25 | ###--------------------------------------------------- 26 | ### Main handler function for the multi threading 27 | ###--------------------------------------------------- 28 | def start(self, savethread): 29 | print("[*] Starting Product Finder Thread") 30 | self.save_thread = savethread 31 | Thread(target=self.update, args=()).start() 32 | return self 33 | 34 | ### Runs on Multi Thread 35 | def update(self): 36 | i = 0 37 | for record in self.record_list: 38 | i = i + 1 39 | #URL Checkers. Bad: artist-redirect, %%%, 40 | if len(record['url']) > 23 and record['url'].count('%') < 5 and record['url'].count('artist-redirect') < 1: 41 | print("[{} of {}]".format(i, len(self.record_list))) 42 | #Ok to download and inspect 43 | html_content = productfinder_helper.download_page(record) 44 | print "[*] Retrieved {} bytes for {}".format(len(html_content), record['url']) 45 | #Collects all the pages to a list 46 | product, errs = productfinder_helper.extract_product(html_content, record['url']) 47 | if product: 48 | self.save_thread.append(product) 49 | print("[Success Append]") 50 | if errs: 51 | print("[Errors:]") 52 | for err in errs: 53 | print(" * {}".format(err)) 54 | else: 55 | print("Failed to EXTRACT Product") 56 | 57 | # if the thread indicator variable is set, stop the thread 58 | # and resource camera resources 59 | if self.stopped: 60 | return 61 | self.finished = True 62 | print "[*] Total external links discovered: %d" % len(products) 63 | 64 | def check_status(self): 65 | return self.finished 66 | 67 | -------------------------------------------------------------------------------- /productfinder_helper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import argparse 3 | import time 4 | import json 5 | import StringIO 6 | import gzip 7 | import boto3 8 | from bs4 import BeautifulSoup 9 | from product import Product 10 | from rake import Rake 11 | import SmartStopList 12 | import json 13 | import re 14 | 15 | # Version 1.2 16 | ## Author: David Cedar(2017) 17 | # 18 | # Downloads full page 19 | # 20 | def download_page(record): 21 | 22 | offset, length = int(record['offset']), int(record['length']) 23 | offset_end = offset + length - 1 24 | 25 | # We'll get the file via HTTPS so we don't need to worry about S3 credentials 26 | # Getting the file on S3 is equivalent however - you can request a Range 27 | prefix = 'https://commoncrawl.s3.amazonaws.com/' 28 | 29 | # We can then use the Range header to ask for just this set of bytes 30 | resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)}) 31 | 32 | # The page is stored compressed (gzip) to save space 33 | # We can extract it using the GZIP library 34 | raw_data = StringIO.StringIO(resp.content) 35 | f = gzip.GzipFile(fileobj=raw_data) 36 | 37 | # What we have now is just the WARC response, formatted: 38 | data = f.read() 39 | 40 | response = "" 41 | 42 | if len(data): 43 | try: 44 | warc, header, response = data.strip().split('\r\n\r\n', 2) 45 | except: 46 | pass 47 | 48 | return response 49 | 50 | # 51 | # Helper function for Check_Page. Searchs a page for a table and loops through to find target. 52 | # 53 | def search_table(parsed, att, target): 54 | table_1 = parsed.find("table", attrs=att) 55 | if table_1 == None: 56 | #print("Failed to search table") 57 | return (False, None) 58 | table_1_rows = table_1.find_all('tr') 59 | found = False 60 | value = "" 61 | #Loop rows 62 | for row in table_1_rows: 63 | ths = row.find_all("th") 64 | tds = row.find_all("td") 65 | rn = ths[0].get_text() 66 | #Check th of table 67 | if target in rn: 68 | value = tds[0].get_text().strip() 69 | if len(value) > 2: 70 | found = True 71 | if found: 72 | return (True, value) 73 | else: 74 | return (False, None) 75 | 76 | # 77 | # Perform Precheck to see if page is a product 78 | # 79 | def check_page(parsed): 80 | parser = parsed 81 | 82 | #First Check of ASIN 83 | found, asin = search_table(parser, {"id": "productDetails_detailBullets_sections1"}, "ASIN") 84 | if found: 85 | return (True, asin) 86 | 87 | #Second Check of ASIN 88 | check_asin_2 = parser.find("b", text="ASIN:") 89 | check_asin_3 = parser.find("b", text="ASIN: ") 90 | 91 | if check_asin_2 == None and check_asin_3 == None: 92 | print("Page is Not a Product") 93 | return (False, None) 94 | else: 95 | if check_asin_2 != None: 96 | asin = check_asin_2.findParent().text[5:] 97 | if check_asin_3 != None: 98 | asin = check_asin_3.findParent().text[5:] 99 | #TODO: Add additional checks to confirm the page is definatly a product! 100 | print("Page is a Product") 101 | return (True, asin) 102 | 103 | # 104 | # Extract Product from the single HTML page. 105 | # 106 | def extract_product(html_content, url): 107 | #String Buffer 108 | string_buffer = "" 109 | errs = list() 110 | 111 | #Read page and read to extract product infomation 112 | parser = BeautifulSoup(html_content, "html.parser") 113 | 114 | #Check if the page is a product, if not skip page. 115 | truth, asin = check_page(parser) 116 | if not truth: 117 | errs.append("Not product") 118 | return (False, errs) 119 | 120 | #New Product as a object 121 | product = Product() 122 | #New Keyword rank 123 | keyword = Rake(SmartStopList.words()) 124 | 125 | 126 | #Find URL 127 | product.SetUrl(url) 128 | 129 | #Find Brand: Note: Some products have an image for the brand 130 | truth, string_buffer = search_table(parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name") 131 | if truth: 132 | product.SetBrand(string_buffer) 133 | else: 134 | string_buffer = parser.find("a", attrs={"id": "brand"}) 135 | if string_buffer != None: 136 | product.SetBrand(string_buffer.get_text().strip()) 137 | else: 138 | errs.append("Could not find Brand") 139 | 140 | #Find Title 141 | string_buffer = parser.find("span", attrs={"id": "productTitle"}) 142 | if string_buffer != None: 143 | product.SetTitle(string_buffer.get_text().strip()) 144 | else: 145 | errs.append("Could not find Title") 146 | return (False, errs) 147 | 148 | #Find Image 149 | string_buffer = parser.find("img", attrs={"id": "landingImage"}) 150 | if string_buffer != None: 151 | string_buffer = string_buffer.get("data-old-hires") 152 | if len(string_buffer) < 2: 153 | string_buffer = parser.find("img", attrs={"id": "landingImage"}).get("data-a-dynamic-image") 154 | m = re.search('https://(.+?).jpg', string_buffer) 155 | if m: 156 | string_buffer = m.group(1) 157 | string_buffer = "https://{}.jpg".format(string_buffer) 158 | #print ("Img Url: "+string_buffer) 159 | product.SetImage(string_buffer) 160 | else: 161 | errs.append("Could not find Image") 162 | 163 | #Find Small Blob 164 | #TODO: Need to perform keyword analysis 165 | string_buffer = parser.find("div", attrs={"id": "feature-bullets"}) 166 | if string_buffer != None: 167 | string_buffer = string_buffer.find("ul") 168 | try: 169 | string_buffer = string_buffer.find_all("li") 170 | if string_buffer != None: 171 | string_buffer_2 = "" 172 | for span in string_buffer: 173 | string_buffer_3 = span.find("span") 174 | if string_buffer_3 != None: 175 | string_buffer_3 = string_buffer_3.get_text() 176 | try: 177 | string_buffer_2 = "{} {}".format(string_buffer_2, string_buffer_3.strip()) 178 | except: 179 | pass 180 | saved_buffer = string_buffer_2.strip() 181 | #Calculating Key Words 182 | keywords_1 = keyword.run(saved_buffer) 183 | product.SetSmallBlog(keywords_1) 184 | except: 185 | errs.append("Error finding li") 186 | 187 | else: 188 | errs.append("Could not find small section keywords") 189 | 190 | 191 | #Find Large Blob 192 | #TODO: Need to perform keyword analysis 193 | string_buffer = parser.find("div", attrs={"id": "productDescription"}) 194 | if string_buffer != None: 195 | string_buffer = string_buffer.find("p") 196 | if string_buffer != None: 197 | string_buffer = string_buffer.get_text() 198 | saved_buffer = string_buffer.strip() 199 | #Calculating Key Words 200 | keywords_2 = keyword.run(saved_buffer) 201 | product.SetLargeBlob(keywords_2) 202 | else: 203 | errs.append("Could not find large section keywords") 204 | 205 | #Find ASIN 206 | product.SetSourceID(asin) 207 | 208 | #TODO: Perform price save! 209 | 210 | #Append the product to large list of products 211 | if product.FormCompleted(): 212 | return (product, errs) 213 | else: 214 | return (False, errs) 215 | 216 | ### Example code running from html file 217 | if __name__ == '__main__': 218 | print("Script Starting") 219 | html = open("test_html/amazon2.html") 220 | url = "https://www.amazon.com/gp/product/B018YHS8BS/ref=s9u_cartx_gw_i3?ie=UTF8&fpl=fresh&pd_rd_i=B018YHS8BS&pd_rd_r=1ZPRY1Q53VY71P1MH3R1&pd_rd_w=E8D0B&pd_rd_wg=l88CZ&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=&pf_rd_r=EQZ2X5XE1BBK1J41FKVB&pf_rd_t=36701&pf_rd_p=eb9f3a57-8cdf-4fa3-a48e-183b5d4b6520&pf_rd_i=desktop" 221 | products = list() 222 | product, errs = extract_product(html, url) 223 | if product: 224 | products.append( product ) 225 | product.Print() 226 | print("[Success Append]") 227 | else: 228 | print("Returned False") 229 | if errs: 230 | print("[Errors:]") 231 | for err in errs: 232 | print(" * {}".format(err)) 233 | print("Script Finished") 234 | -------------------------------------------------------------------------------- /rake.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | # Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm 3 | # https://github.com/fabianvf/python-rake 4 | # as described in: 5 | # Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 6 | # Automatic keyword extraction from indi-vidual documents. 7 | # In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd. 8 | 9 | import re 10 | import operator 11 | import decimal 12 | 13 | 14 | def is_number(s): 15 | try: 16 | float(s) if '.' in s else int(s) 17 | return True 18 | except ValueError: 19 | return False 20 | 21 | 22 | def SmartStopList(): 23 | from .stoplists import SmartStopList 24 | return SmartStopList.words() 25 | 26 | 27 | def FoxStopList(): 28 | from .stoplists import FoxStopList 29 | return FoxStopList.words() 30 | 31 | 32 | def MySQLStopList(): 33 | from .stoplists import MySQLStopList 34 | return MySQLStopList.words() 35 | 36 | 37 | def NLTKStopList(): 38 | from .stoplists import NLTKStopList 39 | return NLTKStopList.words() 40 | 41 | 42 | def GoogleSearchStopList(): 43 | from .stoplists import GoogleSearchStopList 44 | return GoogleSearchStopList.words() 45 | 46 | 47 | def RanksNLLongStopList(): 48 | from .stoplists import RanksNLLongStopList 49 | return RanksNLLongStopList.words() 50 | 51 | 52 | def RanksNLStoplist(): 53 | from .stoplists import RanksNLStoplist 54 | return RanksNLStoplist.words() 55 | 56 | 57 | def load_stop_words(stop_word_file, regex): 58 | with open(stop_word_file) as stop_word_file: 59 | stop_words = re.split(regex, stop_word_file.read()) 60 | return [word for word in stop_words if word not in ('', ' ')] # filters empty string matches 61 | 62 | 63 | def separate_words(text, min_word_return_size): 64 | """ 65 | Utility function to return a list of all words that are have a length greater than a specified number of characters. 66 | @param text The text that must be split in to words. 67 | @param min_word_return_size The minimum no of characters a word must have to be included. 68 | """ 69 | splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]') 70 | words = [] 71 | for single_word in splitter.split(text): 72 | current_word = single_word.strip().lower() 73 | # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases 74 | if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word): 75 | words.append(current_word) 76 | return words 77 | 78 | 79 | def split_sentences(text): 80 | """ 81 | Utility function to return a list of sentences. 82 | @param text The text that must be split in to sentences. 83 | """ 84 | sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s') 85 | sentences = sentence_delimiters.split(text) 86 | return sentences 87 | 88 | 89 | def build_stop_word_regex(stop_word_list): 90 | stop_word_regex_list = [] 91 | for word in stop_word_list: 92 | word_regex = r'\b' + word + r'(?![\w-])' 93 | stop_word_regex_list.append(word_regex) 94 | return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE) 95 | 96 | 97 | def generate_candidate_keywords(sentence_list, stopword_pattern): 98 | phrase_list = [] 99 | for s in sentence_list: 100 | tmp = re.sub(stopword_pattern, '|', s.strip()) 101 | phrases = tmp.split("|") 102 | for phrase in phrases: 103 | phrase = phrase.strip().lower() 104 | if phrase != "": 105 | phrase_list.append(phrase) 106 | return phrase_list 107 | 108 | 109 | def calculate_word_scores(phraseList): 110 | word_frequency = {} 111 | word_degree = {} 112 | for phrase in phraseList: 113 | word_list = separate_words(phrase, 0) 114 | word_list_length = len(word_list) 115 | word_list_degree = word_list_length - 1 116 | for word in word_list: 117 | word_frequency.setdefault(word, 0) 118 | word_frequency[word] += 1 119 | word_degree.setdefault(word, 0) 120 | word_degree[word] += word_list_degree 121 | for item in word_frequency: 122 | word_degree[item] = word_degree[item] + word_frequency[item] 123 | 124 | # Calculate Word scores = deg(w)/frew(w) 125 | word_score = {} 126 | for item in word_frequency: 127 | word_score.setdefault(item, 0) 128 | word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) 129 | return word_score 130 | 131 | 132 | def generate_candidate_keyword_scores(phrase_list, word_score): 133 | keyword_candidates = {} 134 | for phrase in phrase_list: 135 | keyword_candidates.setdefault(phrase, 0) 136 | word_list = separate_words(phrase, 0) 137 | candidate_score = 0 138 | for word in word_list: 139 | candidate_score += word_score[word] 140 | keyword_candidates[phrase] = "%.5g" % candidate_score 141 | return keyword_candidates 142 | 143 | 144 | class Rake(object): 145 | def __init__(self, stop_words, regex='[\W\n]+'): 146 | #lets users call predefined stopwords easily in a platform agnostic manner or use their own list 147 | if isinstance(stop_words, list): 148 | self.__stop_words_pattern = build_stop_word_regex(stop_words) 149 | else: 150 | self.__stop_words_pattern = build_stop_word_regex(load_stop_words(stop_words, regex)) 151 | 152 | def run(self, text): 153 | sentence_list = split_sentences(text) 154 | 155 | phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern) 156 | #print("\nPhrase_list: {}".format(phrase_list)) 157 | 158 | word_scores = calculate_word_scores(phrase_list) 159 | #print("\nWord_scores: {}".format(word_scores)) 160 | 161 | keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores) 162 | #print("\nKeyword Candidates: {}".format(keyword_candidates)) 163 | 164 | #sorted_keywords = sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True) 165 | return keyword_candidates 166 | -------------------------------------------------------------------------------- /saveproducts.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | import boto3 3 | 4 | # Version 1.2 5 | ## Author: David Cedar(2017) 6 | 7 | ### ------------------------------------ 8 | ### Save Products to DynamoDB Class 9 | ### ------------------------------------ 10 | class SaveProducts: 11 | 12 | products_buffer = list() 13 | 14 | #Constructor function 15 | def __init__ (self): 16 | ### Save prodct into database 17 | self.dynamodb = boto3.resource('dynamodb') 18 | self.table = self.dynamodb.Table('productfinder_product_2') 19 | #Helper 20 | self.stopped = False 21 | 22 | ###--------------------------------------------------- 23 | ### Main handler function for the multi threading 24 | ###--------------------------------------------------- 25 | def start(self): 26 | Thread(target=self.update, args=()).start() 27 | return self 28 | 29 | ### Runs on Multi Thread 30 | def update(self): 31 | with self.table.batch_writer() as batch: 32 | #Keep Running for Thread Life 33 | while True: 34 | # keep looping infinitely until the thread is stopped 35 | if len(self.products_buffer) > 0: 36 | try: 37 | self.table.put_item(Item = self.products_buffer[0].ReturnJson()) #Save oldest product 38 | self.products_buffer.pop(0) #Remove oldest product 39 | print("[**] Successfully Uploaded Product") 40 | print("[*] Buffer Size: {}".format(len(self.products_buffer))) 41 | except: 42 | #Failed to save product into db. 43 | #TODO: Add err message 44 | print("[-] Upload Error") 45 | self.stopped = True 46 | 47 | 48 | # if the thread indicator variable is set, stop the thread 49 | # and resource camera resources 50 | if self.stopped: 51 | return 52 | 53 | def append(self, product): 54 | # Append product into buffer 55 | if product != None: 56 | self.products_buffer.append(product) 57 | print("yes") 58 | 59 | def alive(self): 60 | if len(self.products_buffer) < 1: 61 | return False 62 | else: 63 | return True 64 | 65 | def stop(self): 66 | # indicate that the thread should be stopped 67 | self.stopped = True --------------------------------------------------------------------------------