├── .gitignore ├── README.md ├── setup.sh └── src ├── __init__.py ├── __pycache__ ├── user_agents.cpython-37.pyc └── webdriver_wrapper.cpython-37.pyc ├── scraper.py ├── user_agents.py └── webdriver_wrapper.py /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraping Amazon Reviews 2 | 3 | In order to run the scraper, you need to first clone the repository. 4 | 5 | ``` 6 | git clone https://github.com/NikolaiT/scraping-amazon-reviews 7 | ``` 8 | 9 | Then you need to download the headless chrome browser and the chrome driver. You can do so with this command. 10 | 11 | ``` 12 | ./setup.sh 13 | ``` 14 | 15 | Now you can scrape amazon reviews by editing the file `scraper.py` and add some amazon product urls you want to have the reviews from: 16 | 17 | ``` 18 | if __name__ == '__main__': 19 | config = { 20 | "urls": [ 21 | "https://www.amazon.de/Crocs-Crocband-Unisex-Erwachsene-Charcoal-Ocean/dp/B007B9MI8K/ref=sr_1_1?s=shoes&ie=UTF8&qid=1537363983&sr=1-1", 22 | "https://www.amazon.de/Samsung-UE55MU6179U-Fernseher-Triple-Schwarz/dp/B06XGS3Q4Y/ref=sr_1_4?s=home-theater&ie=UTF8&qid=1538584798&sr=1-4&keywords=tv", 23 | "https://www.amazon.de/gp/product/B07BKN76JS/ref=s9_acsd_zwish_hd_bw_bDtHh_cr_x__w?pf_rd_m=A3JWKAKR8XB7XF&pf_rd_s=merchandised-search-8&pf_rd_r=TM716ESMTY46877D33XM&pf_rd_r=TM716ESMTY46877D33XM&pf_rd_t=101&pf_rd_p=5f7031a3-d321-54f0-8d79-d0961244d5fa&pf_rd_p=5f7031a3-d321-54f0-8d79-d0961244d5fa&pf_rd_i=3310781" 24 | ]} 25 | main(config) 26 | ``` 27 | 28 | Then just run the scraper: 29 | 30 | ``` 31 | python src/scraper.py 32 | ``` 33 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 
| echo "Downloading chromedriver and headless chromium..."; 4 | 5 | mkdir -p bin/ 6 | 7 | # Get chromedriver 8 | curl -SL https://chromedriver.storage.googleapis.com/2.37/chromedriver_linux64.zip > chromedriver.zip 9 | unzip chromedriver.zip -d bin/ 10 | 11 | # Get Headless-chrome 12 | # https://github.com/adieuadieu/serverless-chrome/releases 13 | curl -SL https://github.com/adieuadieu/serverless-chrome/releases/download/v1.0.0-37/stable-headless-chromium-amazonlinux-2017-03.zip > headless-chromium.zip 14 | unzip headless-chromium.zip -d bin/ 15 | 16 | # Clean 17 | rm headless-chromium.zip chromedriver.zip 18 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/scraping-amazon-reviews/302baf8889a078295072fce6d88ad66b50b0e41d/src/__init__.py -------------------------------------------------------------------------------- /src/__pycache__/user_agents.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/scraping-amazon-reviews/302baf8889a078295072fce6d88ad66b50b0e41d/src/__pycache__/user_agents.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/webdriver_wrapper.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/scraping-amazon-reviews/302baf8889a078295072fce6d88ad66b50b0e41d/src/__pycache__/webdriver_wrapper.cpython-37.pyc -------------------------------------------------------------------------------- /src/scraper.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import json 4 | import pprint 5 | import time 6 | import random 7 | import logging 8 | from 
import json
import logging
import pprint
import random
import time

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def random_sleep(r):
    """Sleep for a random whole number of seconds.

    :param r: (lo, hi) tuple; the delay is drawn from [lo, hi) like
        random.randrange, so hi itself is never chosen.
    """
    lo, hi = r  # renamed: the original shadowed the builtins min/max
    assert hi > lo
    sleepy_time = random.randrange(lo, hi)
    # log BEFORE sleeping so the delay is visible while it happens
    # (the original logged only after the sleep had already finished)
    logger.info('sleeping for %s s', sleepy_time)
    time.sleep(sleepy_time)


def scrape_amazon_reviews(config):
    """
    Scrapes amazon reviews for every product url in config['urls'].

    :param config: dict with a 'urls' key holding amazon product urls.
    :returns: (results, status) tuple taken from the webdriver wrapper.
    """
    # deferred import: lets this module be imported (e.g. to reuse main())
    # even when selenium / headless chromium are not installed
    from webdriver_wrapper import WebDriverWrapper, AmazonDetectionException

    urls = config['urls']
    sleep_range = (1, 3)

    driver = WebDriverWrapper()

    try:
        for product_url in urls:
            driver.open_amazon_product(product_url)
            driver.scrape_reviews()

            # sleep a bit between products to look less bot-like
            if sleep_range:
                random_sleep(sleep_range)

    except AmazonDetectionException:
        logger.fatal('Amazon detected the scraping. Aborting.')

    pprint.pprint(driver.results)

    driver.close()

    # count only the per-page result entries instead of the fragile
    # `len(results) - 3` bookkeeping that depended on exactly three
    # meta keys being present in the results dict
    scraped_pages = sum(1 for key in driver.results if key.startswith('review-page-'))
    logger.info('Got %s results out from %s urls', scraped_pages, len(urls))

    return driver.results, driver.status


# this is the lambda function
def main(config):
    """AWS-Lambda-style entry point returning a JSON-serializable response."""
    data, status = scrape_amazon_reviews(config)

    return {
        "statusCode": status,
        "body": json.dumps(data)
    }


if __name__ == '__main__':
    config = {
        "urls": [
            "https://www.amazon.de/Crocs-Crocband-Unisex-Erwachsene-Charcoal-Ocean/dp/B007B9MI8K/ref=sr_1_1?s=shoes&ie=UTF8&qid=1537363983&sr=1-1",
            "https://www.amazon.de/Samsung-UE55MU6179U-Fernseher-Triple-Schwarz/dp/B06XGS3Q4Y/ref=sr_1_4?s=home-theater&ie=UTF8&qid=1538584798&sr=1-4&keywords=tv",
            "https://www.amazon.de/gp/product/B07BKN76JS/ref=s9_acsd_zwish_hd_bw_bDtHh_cr_x__w?pf_rd_m=A3JWKAKR8XB7XF&pf_rd_s=merchandised-search-8&pf_rd_r=TM716ESMTY46877D33XM&pf_rd_r=TM716ESMTY46877D33XM&pf_rd_t=101&pf_rd_p=5f7031a3-d321-54f0-8d79-d0961244d5fa&pf_rd_p=5f7031a3-d321-54f0-8d79-d0961244d5fa&pf_rd_i=3310781"
        ]}
    main(config)
-------------------------------------------------------------------------------- /src/user_agents.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # taken here 4 | # https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ 5 | 6 | user_agents = ''' 7 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 8 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 9 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 10 | Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0 11 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 12 | Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0 13 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134 14 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 15 | Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0 16 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0 17 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 18 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 19 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 20 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 21 | Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0 22 | Mozilla/5.0 (X11; 
Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0 23 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 24 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 25 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 26 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 27 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 28 | Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko 29 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 30 | Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko 31 | Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0 32 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 33 | Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0 34 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 35 | Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0 36 | Mozilla/5.0 (iPad; CPU OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.0 Mobile/15E148 Safari/604.1 37 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/61.0 38 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15 39 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 40 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/68.0.3440.106 
Chrome/68.0.3440.106 Safari/537.36 41 | Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0 42 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299 43 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15 44 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 45 | Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0 46 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 47 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0 48 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 49 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 50 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 51 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15 52 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 53 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 54 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0 55 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:61.0) Gecko/20100101 Firefox/61.0 56 | Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 57 | Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0 58 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.103 YaBrowser/18.7.0.2695 Yowser/2.5 
Safari/537.36 59 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 60 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 61 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 62 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.44 63 | Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0 64 | Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0 65 | Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0 66 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.71 67 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0 68 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6 69 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 70 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 71 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 72 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/68.0.3440.75 Chrome/68.0.3440.75 Safari/537.36 73 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 74 | Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0 75 | Mozilla/5.0 (Windows NT 6.1; rv:61.0) Gecko/20100101 Firefox/61.0 76 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3525.5 Safari/537.36 77 | Mozilla/5.0 (Windows NT 
10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063 78 | Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0 79 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.44 80 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 81 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64 82 | Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0 83 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 84 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 85 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0 86 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 87 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36 OPR/53.0.2907.99 (Edition Yx 02) 88 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 89 | ''' 90 | 91 | def random_ua(): 92 | all_user_agents = [ua.strip() for ua in user_agents.split('\n') if ua.strip()] 93 | return random.choice(all_user_agents) 94 | 95 | 96 | if __name__ == '__main__': 97 | print(random_ua()) 98 | -------------------------------------------------------------------------------- /src/webdriver_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import uuid 4 | import json 5 | import logging 6 | import io 7 | import csv 8 | import time 9 | import datetime 10 | from user_agents import random_ua 11 | 12 | from selenium import webdriver 13 | from selenium.common.exceptions 
from selenium.common.exceptions import TimeoutException, WebDriverException, ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

__author__ = 'Nikolai Tschacher'
__url__ = 'https://incolumitas.com/'
__version__ = '0.2'


class AmazonDetectionException(Exception):
    """Raised when Amazon appears to have detected and blocked the scraper."""
    pass


class WebDriverWrapper:
    """Wrapper around a headless-chromium selenium driver that opens an
    Amazon product page and scrapes its review pages into ``self.results``.

    ``self.status`` starts at 200 and is set to 400 by any failing step.
    """

    def __init__(self):

        self.status = 200
        self.results = {
            'initialized': str(datetime.datetime.now()),
            'data': [],
        }
        self.ipinfo = {}
        self.save_debug_screenshot = False
        # maximum number of review pages walked per product
        self.max_review_pages = 3

        chrome_options = webdriver.ChromeOptions()
        # unique scratch dir per run so parallel invocations don't clash
        self._tmp_folder = '/tmp/{}'.format(uuid.uuid4())

        if not os.path.exists(self._tmp_folder):
            os.makedirs(self._tmp_folder)

        self.user_data_path = os.path.join(self._tmp_folder, 'user-data/')

        if not os.path.exists(self.user_data_path):
            os.makedirs(self.user_data_path)

        self.data_path = os.path.join(self._tmp_folder, 'data-path/')

        if not os.path.exists(self.data_path):
            os.makedirs(self.data_path)

        self.cache_dir = os.path.join(self._tmp_folder, 'cache-dir/')

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        # NOTE(review): basicConfig configures the ROOT logger globally and is
        # re-run for every instance; in a warm lambda container this re-adds
        # handlers -- confirm whether this should move to module level
        logging.basicConfig(
            format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("{}.log".format(os.path.join(self.data_path, 'google-scraper'))),
                logging.StreamHandler()
            ])

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1280x1696')
        chrome_options.add_argument('--user-data-dir={}'.format(self.user_data_path))
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument('--enable-logging')
        chrome_options.add_argument('--log-level=0')
        chrome_options.add_argument('--v=99')
        chrome_options.add_argument('--single-process')
        chrome_options.add_argument('--data-path={}'.format(self.data_path))
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--homedir={}'.format(self._tmp_folder))
        chrome_options.add_argument('--disk-cache-dir={}'.format(self.cache_dir))
        # BUGFIX: call random_ua() -- the original passed the function object
        # itself, so the repr of the function was sent as the User-Agent header
        chrome_options.add_argument('user-agent={}'.format(random_ua()))

        chrome_options.binary_location = os.path.join(os.getcwd(), 'bin/headless-chromium')

        self._driver = webdriver.Chrome(chrome_options=chrome_options)

    def get_url(self, url):
        """Navigate the browser to *url*."""
        self._driver.get(url)

    def set_input_value(self, xpath, value):
        """Type *value* into the element located by *xpath*."""
        elem_send = self._driver.find_element_by_xpath(xpath)
        elem_send.send_keys(value)

    def click(self, xpath):
        """Click the element located by *xpath*."""
        elem_click = self._driver.find_element_by_xpath(xpath)
        elem_click.click()

    def save_screen(self, fname):
        """Save a screenshot named *fname* into the user-data scratch dir."""
        self._driver.save_screenshot(os.path.join(self.user_data_path, fname))

    def save_html(self, path=''):
        """Dump the current page source to *path*."""
        with open(path, 'w') as f:
            f.write(self._driver.page_source)

    def get_inner_html(self, xpath):
        """Return the innerHTML of the element located by *xpath*."""
        elem_value = self._driver.find_element_by_xpath(xpath)
        return elem_value.get_attribute('innerHTML')

    def get_html(self):
        """Return the full page source of the current page."""
        return self._driver.page_source

    def check_ip(self):
        """Fetch ipinfo.io and record the JSON answer in the results dict."""
        self.get_url('https://ipinfo.io/json')
        try:
            pre = WebDriverWait(self._driver, 2).until(
                EC.visibility_of_element_located((By.TAG_NAME, 'pre')))
            self.ipinfo = json.loads(pre.text)
            self.results['ipinfo'] = self.ipinfo
        except TimeoutException:
            self.status = 400
            self.logger.warning('Cannot get ipinfo json.')

    def open_amazon_product(self, url):
        """Open an Amazon product page and wait for the review link."""
        self.product_url = url

        self.get_url(url)

        try:
            WebDriverWait(self._driver, 5).until(
                EC.visibility_of_element_located((By.ID, 'acrCustomerReviewLink')))
            self.logger.info('Got a customer review link.')
        except TimeoutException:
            self.status = 400
            self.logger.error('No customer review link located after 5 seconds.')

        # check whether amazon blocked us
        self.handle_detection()

    def scrape_reviews(self):
        """Follow the all-reviews link and walk up to max_review_pages pages."""
        try:
            review_link = self._driver.find_element_by_css_selector('a[data-hook="see-all-reviews-link-foot"]')
            link = review_link.get_attribute('href')
            self.get_url(link)
        except WebDriverException as e:
            self.status = 400
            self.logger.error('Cannot locate to amazon all reviews: {}'.format(e))

        # here the page source of the reviews should become available
        try:
            WebDriverWait(self._driver, 5).until(
                EC.visibility_of_element_located((By.ID, 'cm_cr-product_info')))
            self.logger.info('review page loaded')
        except TimeoutException:
            self.status = 400
            self.logger.error('Cannot load review page')

        # check whether amazon blocked us
        self.handle_detection()

        # debug screenshot
        if self.save_debug_screenshot:
            # BUGFIX: the original referenced an undefined name `keyword` here,
            # which raised NameError whenever debug screenshots were enabled
            self.save_screen('debug-{}.png'.format(int(time.time())))

        self.num_review_page = 0

        while self.num_review_page < self.max_review_pages:
            # now we can scrape the results out of the html
            self.num_review_page += 1
            self.parse_review_results()

            try:
                next_reviews_page = self._driver.find_element_by_css_selector('#cm_cr-pagination_bar .a-last a')
                self.get_url(next_reviews_page.get_attribute('href'))
                WebDriverWait(self._driver, 5).until(
                    EC.visibility_of_element_located((By.ID, 'cm_cr-product_info')))
            except WebDriverException as e:
                self.status = 400
                self.logger.error('Cannot go next page: {}'.format(e))
                break

    def parse_review_results(self):
        """Scrape the currently loaded review page into self.results."""
        data = {
            'product': self.product_url,
            'time': str(datetime.datetime.now()),
            'average_stars': '',
            'total_reviews': '',
            'num_reviews_scraped': 0,
            'reviews': [],
        }
        # BUGFIX: initialize so the loop below cannot hit an unbound name when
        # the container lookup fails
        all_reviews = []

        try:
            data['average_stars'] = self._driver.find_element_by_css_selector('[data-hook="rating-out-of-text"]').text
            data['total_reviews'] = self._driver.find_element_by_css_selector('[data-hook="total-review-count"]').text
        except NoSuchElementException as e:
            self.logger.warning('Cannot scrape {}'.format(e))

        try:
            # find_elements_* returns [] when nothing matches, so guard the
            # broader WebDriverException (NoSuchElementException is never
            # raised by find_elements)
            all_reviews = self._driver.find_elements_by_css_selector('#cm_cr-review_list div[data-hook="review"]')
        except WebDriverException as e:
            self.logger.warning('Cannot find reviews container: {}'.format(e))

        for result in all_reviews:
            data['reviews'].append(self.scrape_single_review(result))
            data['num_reviews_scraped'] += 1

        self.results['review-page-{}'.format(self.num_review_page)] = data
        self.results['last_scrape'] = str(datetime.datetime.now())

    def scrape_single_review(self, result):
        """Extract the fields of one review element into a plain dict."""
        selectors = {
            'title': '[data-hook="review-title"]',
            'author': '[data-hook="review-author"]',
            'date': '[data-hook="review-date"]',
            'rating': '[data-hook="review-star-rating"]',
            'helpful_vote': '[data-hook="helpful-vote-statement"]',
            'verified_buy': '[data-hook="avp-badge"]',
            'body': '[data-hook="review-body"]',
        }
        results = dict()

        for key, selector in selectors.items():
            try:
                element = result.find_element_by_css_selector(selector)
                results[key] = element.text
            except NoSuchElementException:
                self.logger.debug('Cannot scrape review results for selector {}'.format(selector))

        try:
            results['author_url'] = result.find_element_by_css_selector('a[data-hook="review-author"]').get_attribute('href')
            results['rating'] = result.find_element_by_css_selector('i[data-hook="review-star-rating"] span').text
        except NoSuchElementException as e:
            self.logger.warning('Cannot scrape addditional data: {}'.format(e))

        return results

    def detected_by_amazon(self):
        """Heuristic check for an Amazon block page.

        NOTE(review): detection was never actually observed, so these needle
        strings are placeholders and this effectively always returns False.
        """
        needles = {
            'inurl': 'amazondetectionstring17734',
            'inhtml': 'amazondetectionstring674',
        }
        return needles['inurl'] in self._driver.current_url and needles['inhtml'] in self._driver.page_source

    def handle_detection(self):
        """Raise AmazonDetectionException when a block page is detected."""
        if self.detected_by_amazon():
            self.logger.error('Amazon detected us. Stop scraping.')
            # BUGFIX: status is an int everywhere else; the original assigned
            # the string '400' here
            self.status = 400
            # BUGFIX: the message wrongly said 'Google detected the scraping.'
            raise AmazonDetectionException('Amazon detected the scraping.')

    def store_json(self, data, fname):
        """Store a dict as <fname>.json in the data directory; return its path."""
        path = os.path.join(self.data_path, '{}.json'.format(fname))
        with open(path, 'w') as f:
            json.dump(data, f)
        return path

    def close(self):
        """Quit the browser and remove all scratch files of this run."""
        # Close webdriver connection
        self._driver.quit()

        # Remove specific tmp dir of this "run"
        shutil.rmtree(self._tmp_folder)

        # Remove possible core dumps left behind by headless chromium
        folder = '/tmp'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if 'core.headless-chromi' in file_path and os.path.exists(file_path) and os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)