├── .DS_Store ├── AmazonScraper.py ├── LICENSE ├── Main.py ├── README.md ├── chromedriver └── chromedriver.exe /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/.DS_Store -------------------------------------------------------------------------------- /AmazonScraper.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from selenium import webdriver 3 | from datetime import datetime 4 | from datetime import timedelta 5 | from random import randint 6 | from time import sleep 7 | from tqdm import tqdm 8 | import sys 9 | import csv 10 | import os 11 | 12 | def descWordCount(str1): 13 | total = 1 14 | for i in range(len(str1)): 15 | if(str1[i] == ' ' or str1 == '\n' or str1 == '\t'): 16 | total = total + 1 17 | return total 18 | 19 | def containsWord(description, words): 20 | containsWord = False 21 | for word in words: 22 | if word in description.lower(): 23 | containsWord = True 24 | return containsWord 25 | 26 | def scrape(AMAZONURL, PRODUCTHANDLE, RATINGLIMIT, COMMENTLIMIT, EXCLUDE): 27 | print(EXCLUDE) 28 | try: 29 | os.remove("comments.csv") 30 | except OSError: 31 | pass 32 | if getattr(sys, 'frozen', False): 33 | if (sys.platform == "darwin"): 34 | chromedriver_path = os.path.join(sys._MEIPASS, 'chromedriver') 35 | elif (sys.platform == "win32"): 36 | chromedriver_path = os.path.join(sys._MEIPASS, 'chromedriver.exe') 37 | else: 38 | print("Chrome Driver not found...") 39 | else: 40 | if (sys.platform == "darwin"): 41 | chromedriver_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"chromedriver") 42 | elif (sys.platform == "win32"): 43 | chromedriver_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"chromedriver.exe") 44 | else: 45 | print("Chrome Driver not found...") 46 | browser = webdriver.Chrome(chromedriver_path) 47 | 
url = AMAZONURL 48 | browser.get(url) 49 | print("Please be patient while the comments are gathered. \nDO NOT close the browser window.") 50 | try: 51 | currentButton = browser.find_element_by_xpath("//*[@id='reviews-medley-footer']/div[2]/a") 52 | currentButton.click() 53 | except: 54 | print("There are no comments to extract, terminating process...") 55 | sys.exit() 56 | html = browser.page_source 57 | soup = BeautifulSoup(html, 'html.parser') 58 | REVIEWCOUNT = int(soup.find("span", {"class" : "totalReviewCount"}).get_text().replace(",","")) 59 | loop = tqdm(total=REVIEWCOUNT, position=0, leave=False) 60 | TOTALCOMMENTCOUNT = 0 61 | COMMENTCOUNT = 10 62 | sleep(1) 63 | with open('DONOTUSE.csv', 'w', encoding='utf-8') as csvfile: 64 | filewriter = csv.writer(csvfile, delimiter=',', 65 | quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') 66 | filewriter.writerow(['product_handle', 'state', 'rating', 'title','author', 'email', 'body', 'created_at']) 67 | productHandle = PRODUCTHANDLE 68 | while COMMENTCOUNT == 10 and (TOTALCOMMENTCOUNT < COMMENTLIMIT or COMMENTLIMIT == 0): 69 | loop.set_description("Status: ") 70 | loop.update(COMMENTCOUNT) 71 | COMMENTCOUNT = 0 72 | html = browser.page_source 73 | soup = BeautifulSoup(html, 'html.parser') 74 | for i in soup.find_all("div", {"class" : "a-section celwidget"}): 75 | rating = int(i.find("span", {"class" : "a-icon-alt"}).get_text()[0]) 76 | profileName = i.find("span", {"class" : "a-profile-name"}).get_text() 77 | description = i.find("span", {"class" : "review-text-content"}).get_text()[:-1] 78 | title = i.find("a", {"class" : "review-title-content"}).contents[0].get_text() 79 | fakeTime = str(datetime.now().replace(microsecond=0) - timedelta(days=randint(0,100), minutes=randint(0,60), seconds=randint(0,60))) + " -0700" 80 | 81 | if (rating >= RATINGLIMIT and profileName != "Amazon Customer" and "\"" not in description and not containsWord(description, EXCLUDE) and not containsWord(title, EXCLUDE)): 82 | 
filewriter.writerow([productHandle, 'published', "\"" + str(rating) + "\"", "\"" + title + "\"", "\"" + profileName + "\"", 'imported@review.com', "\"" + description + "\"", fakeTime]) 83 | 84 | COMMENTCOUNT += 1 85 | TOTALCOMMENTCOUNT += 1 86 | try: 87 | currentButton = browser.find_element_by_xpath("//*[@id='cm_cr-pagination_bar']/ul/li[2]") 88 | currentButton.click() 89 | except: 90 | print("There are no comments to extract, terminating process...") 91 | sys.exit() 92 | sleep(1) 93 | 94 | loop.close() 95 | 96 | text = open("DONOTUSE.csv", "r", encoding='utf-8') 97 | text = ''.join([i for i in text]).replace("|", "") 98 | out = open("comments.csv","w", encoding='utf-8') 99 | out.writelines(text) 100 | out.close() 101 | os.remove("DONOTUSE.csv") 102 | s = open("comments.csv", mode='r', encoding='utf-8-sig').read() #CONVERT UTF-8 TO NO BOM - Necessary for shopify import 103 | open("comments.csv", mode='w', encoding='utf-8').write(s) 104 | 105 | with open("comments.csv", 'rb') as open_file: # CONVERT LINE ENDINGS TO UNIX - Necessary for shopify import 106 | content = open_file.read() 107 | WINDOWS_LINE_ENDING = b'\r\n' 108 | UNIX_LINE_ENDING = b'\n' 109 | content = content.replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING) 110 | with open("comments.csv", 'wb') as open_file: 111 | open_file.write(content) 112 | 113 | print("Comments have been gathered successfully, your file is located at: " + os.path.abspath("comments.csv")) 114 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 maxmonciardini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Main.py: -------------------------------------------------------------------------------- 1 | import AmazonScraper 2 | 3 | AmazonScraper.scrape(input("Please input your Amazon Product URL: "), 4 | input("Please input the EXACT Shopify product name: "), 5 | int(input("Please input the minimum rating you would like to gather 1-5: ")), 6 | int(input("Please input the amount of comments you would like to search (input 0 to search all): ")), 7 | input("Please input the words you would like to exclude separated by commas (If there are none then leave this blank): \n").split(',')) 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AmazonCommentScraper - Mac OS & Windows! 2 | A python based app that uses the selenium automation framework to convert comments from any Amazon product page into a CSV file (compatible with Shopify!). 3 | 4 | ## Instructions: 5 | 1. To install necessary dependencies, run this command: 6 | ```pip3 install beautifulsoup4, selenium, tqdm``` 7 | 2. 
Open up your Command Prompt or Terminal inside the "AmazonCommentScraper" folder and run this command:
8 | ```python3 Main.py```

9 | 3. Go to Amazon and locate your desired product and copy the product link. Example:
10 | ```https://www.amazon.com/Scotch-Brand-Applications-Engineered-810K6/dp/B00006IF67/ref=cm_cr_arp_d_product_top?ie=UTF8```

11 | 4. (Optional) If you are using Shopify, enter the product name EXACTLY as it appears at the end of your Shopify product link. Example: ```https://modernbeyond.com/products/magic-scrubber-electric-brush```
12 | Use only the ```magic-scrubber-electric-brush``` part of your product link!

13 | 5. Choose the minimum rating that you would like to gather. Example: choosing 3 stars collects reviews that are 3 stars and above.

14 | 6. Choose the number of comments that you would like the program to search through before terminating. Input 0 to search through all of them.

15 | 7. Input any words that you would like to exclude from the comments, separated by commas (leave this blank if there are none). All reviews containing any of these words will be ignored. 16 | 17 | ### That's it, Enjoy :) 18 | -------------------------------------------------------------------------------- /chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/chromedriver -------------------------------------------------------------------------------- /chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/chromedriver.exe --------------------------------------------------------------------------------