├── .DS_Store
├── AmazonScraper.py
├── LICENSE
├── Main.py
├── README.md
├── chromedriver
└── chromedriver.exe
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/.DS_Store
--------------------------------------------------------------------------------
/AmazonScraper.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from selenium import webdriver
3 | from datetime import datetime
4 | from datetime import timedelta
5 | from random import randint
6 | from time import sleep
7 | from tqdm import tqdm
8 | import sys
9 | import csv
10 | import os
11 |
def descWordCount(str1):
    """Estimate the number of words in ``str1``.

    The estimate is the number of separator characters (space, newline or
    tab) plus one. Consecutive separators are each counted, and a string
    without separators (including the empty string) yields 1.

    Args:
        str1: The text to measure.

    Returns:
        int: One plus the number of space, newline and tab characters.
    """
    # Each *character* is tested against the separators; comparing the whole
    # string to '\n' or '\t' would miss newlines and tabs inside longer text.
    return 1 + sum(1 for ch in str1 if ch in (' ', '\n', '\t'))
18 |
def containsWord(description, words):
    """Return whether ``description`` contains any of ``words``.

    Matching is a case-insensitive substring test. Each word is stripped of
    surrounding whitespace first, and blank entries are ignored, so a list
    such as ``['']`` (what ``"".split(',')`` produces) matches nothing.

    Args:
        description: The text to search.
        words: Iterable of words or phrases to look for.

    Returns:
        bool: True if at least one non-blank word occurs in ``description``.
    """
    haystack = description.lower()
    for word in words:
        needle = word.strip().lower()
        # An empty needle is a substring of every string; skip it so a blank
        # exclusion list does not reject every review.
        if needle and needle in haystack:
            return True
    return False
25 |
def scrape(AMAZONURL, PRODUCTHANDLE, RATINGLIMIT, COMMENTLIMIT, EXCLUDE):
    """Scrape the reviews of an Amazon product page into ``comments.csv``.

    A Chrome window is driven through the product's review pages. Every
    review is parsed and those passing the filters are written as CSV rows
    with the columns ``product_handle, state, rating, title, author, email,
    body, created_at``. The output is written to ``comments.csv`` in the
    current working directory as UTF-8 without a BOM and with Unix line
    endings.

    A review is kept only when its rating is at least ``RATINGLIMIT``, its
    author is not "Amazon Customer", its body contains no double quote, and
    neither its body nor its title contains an excluded word.

    Args:
        AMAZONURL: URL of the Amazon product page.
        PRODUCTHANDLE: Value written to the ``product_handle`` column.
        RATINGLIMIT: Minimum star rating (int) a review needs to be kept.
        COMMENTLIMIT: Stop paging once at least this many reviews have been
            examined; 0 means no limit. The limit is checked once per page,
            so the final count may exceed it.
        EXCLUDE: Words passed to ``containsWord``; reviews whose title or
            body contain one of them are skipped.

    Raises:
        SystemExit: If the platform has no bundled chromedriver, or the
            product page has no link to the review list.
    """
    print(EXCLUDE)
    try:
        os.remove("comments.csv")
    except OSError:
        pass

    # Pick the bundled chromedriver binary for this platform. Exit cleanly on
    # unsupported platforms instead of failing later on an unbound variable.
    if sys.platform == "darwin":
        driver_name = "chromedriver"
    elif sys.platform == "win32":
        driver_name = "chromedriver.exe"
    else:
        print("Chrome Driver not found...")
        sys.exit(1)
    if getattr(sys, 'frozen', False):
        # Frozen (bundled) executable: resources live in sys._MEIPASS.
        driver_dir = sys._MEIPASS
    else:
        driver_dir = os.path.dirname(os.path.abspath(__file__))
    chromedriver_path = os.path.join(driver_dir, driver_name)

    browser = webdriver.Chrome(chromedriver_path)
    try:
        browser.get(AMAZONURL)
        print("Please be patient while the comments are gathered. \nDO NOT close the browser window.")
        try:
            # Follow the link from the product page to the full review list.
            browser.find_element_by_xpath("//*[@id='reviews-medley-footer']/div[2]/a").click()
        except Exception:
            print("There are no comments to extract, terminating process...")
            sys.exit()

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        REVIEWCOUNT = int(soup.find("span", {"class" : "totalReviewCount"}).get_text().replace(",",""))
        loop = tqdm(total=REVIEWCOUNT, position=0, leave=False)
        TOTALCOMMENTCOUNT = 0
        COMMENTCOUNT = 10
        sleep(1)

        # newline='' lets the csv writer control line endings itself.
        with open('DONOTUSE.csv', 'w', encoding='utf-8', newline='') as csvfile:
            # '|' is used as the quote character and stripped afterwards, so
            # the double quotes added by hand below reach the output verbatim.
            filewriter = csv.writer(csvfile, delimiter=',',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            filewriter.writerow(['product_handle', 'state', 'rating', 'title','author', 'email', 'body', 'created_at'])
            productHandle = PRODUCTHANDLE

            # A page holding exactly 10 reviews is treated as "more pages may follow".
            while COMMENTCOUNT == 10 and (TOTALCOMMENTCOUNT < COMMENTLIMIT or COMMENTLIMIT == 0):
                loop.set_description("Status: ")
                loop.update(COMMENTCOUNT)
                COMMENTCOUNT = 0
                soup = BeautifulSoup(browser.page_source, 'html.parser')
                for review in soup.find_all("div", {"class" : "a-section celwidget"}):
                    rating = int(review.find("span", {"class" : "a-icon-alt"}).get_text()[0])
                    profileName = review.find("span", {"class" : "a-profile-name"}).get_text()
                    description = review.find("span", {"class" : "review-text-content"}).get_text()[:-1]
                    title = review.find("a", {"class" : "review-title-content"}).contents[0].get_text()
                    # The creation time is fabricated: now minus a random offset
                    # of up to 100 days, with a fixed " -0700" suffix.
                    fakeTime = str(datetime.now().replace(microsecond=0) - timedelta(days=randint(0,100), minutes=randint(0,60), seconds=randint(0,60))) + " -0700"

                    if (rating >= RATINGLIMIT and profileName != "Amazon Customer" and "\"" not in description and not containsWord(description, EXCLUDE) and not containsWord(title, EXCLUDE)):
                        filewriter.writerow([productHandle, 'published', "\"" + str(rating) + "\"", "\"" + title + "\"", "\"" + profileName + "\"", 'imported@review.com', "\"" + description + "\"", fakeTime])

                    # Count every review seen (kept or not) so paging continues
                    # for as long as pages are full.
                    COMMENTCOUNT += 1
                    TOTALCOMMENTCOUNT += 1
                try:
                    # Advance to the next page of reviews.
                    browser.find_element_by_xpath("//*[@id='cm_cr-pagination_bar']/ul/li[2]").click()
                except Exception:
                    # No next-page control: stop paging but keep what has been
                    # gathered instead of exiting and discarding it.
                    print("No further review pages found, finishing up...")
                    break
                sleep(1)

        loop.close()
    finally:
        # Always release the browser and its chromedriver process.
        browser.quit()

    # Strip the placeholder quote character and write the final file in one
    # pass: UTF-8 without BOM and '\n' line endings (needed for Shopify import).
    with open("DONOTUSE.csv", "r", encoding='utf-8') as raw_file:
        text = raw_file.read().replace("|", "")
    with open("comments.csv", "w", encoding='utf-8', newline='\n') as out:
        out.write(text)
    os.remove("DONOTUSE.csv")

    print("Comments have been gathered successfully, your file is located at: " + os.path.abspath("comments.csv"))
114 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 maxmonciardini
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Main.py:
--------------------------------------------------------------------------------
1 | import AmazonScraper
2 |
def _prompt_int(prompt):
    """Ask for a whole number with ``prompt``, repeating until one is given."""
    while True:
        answer = input(prompt)
        try:
            return int(answer)
        except ValueError:
            print("Please enter a whole number.")


def main():
    """Collect the scraper settings interactively and run the scrape."""
    amazon_url = input("Please input your Amazon Product URL: ")
    product_handle = input("Please input the EXACT Shopify product name: ")
    rating_limit = _prompt_int("Please input the minimum rating you would like to gather 1-5: ")
    comment_limit = _prompt_int("Please input the amount of comments you would like to search (input 0 to search all): ")
    raw_exclude = input("Please input the words you would like to exclude separated by commas (If there are none then leave this blank): \n")
    # Drop blank entries: "".split(',') yields [''], and an empty word would
    # match (and therefore exclude) every review.
    exclude = [word.strip() for word in raw_exclude.split(',') if word.strip()]
    AmazonScraper.scrape(amazon_url, product_handle, rating_limit, comment_limit, exclude)


if __name__ == "__main__":
    main()
8 |
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AmazonCommentScraper - Mac OS & Windows!
2 | A Python-based app that uses the Selenium automation framework to convert comments from any Amazon product page into a CSV file (compatible with Shopify!).
3 |
4 | ## Instructions:
5 | 1. To install necessary dependencies, run this command:
6 | ```pip3 install beautifulsoup4 selenium tqdm```
7 | 2. Open up your Command Prompt or Terminal inside the "AmazonCommentScraper" folder and run this command:
8 | ```python3 Main.py```
9 | 3. Go to Amazon and locate your desired product and copy the product link. Example:
10 | ```https://www.amazon.com/Scotch-Brand-Applications-Engineered-810K6/dp/B00006IF67/ref=cm_cr_arp_d_product_top?ie=UTF8```
11 | 4. (Optional) If you are using Shopify, make sure the product name you enter EXACTLY matches the name at the end of your Shopify product link. Example: ```https://modernbeyond.com/products/magic-scrubber-electric-brush```
12 | Use only the ```magic-scrubber-electric-brush``` part of your product link!
13 | 5. Choose the minimum rating that you would like to gather. Example: choosing 3 stars collects reviews that are 3 stars and above.
14 | 6. Choose the number of comments that you would like the program to search through before terminating. Input 0 to search through all of them.
15 | 7. Input any words that you would like to exclude from the comments. All reviews containing these words will be ignored.
16 |
17 | ### That's it, Enjoy :)
18 |
--------------------------------------------------------------------------------
/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/chromedriver
--------------------------------------------------------------------------------
/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mxmnci/AmazonCommentScraper/f97fcd51cad4c0143e7936c54804a1ced79c8185/chromedriver.exe
--------------------------------------------------------------------------------