├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── amazon2csv ├── amazon2csv.py ├── amazonscraper ├── __init__.py └── client.py ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── snapshot_amazon2csv.png └── test ├── not_satisfied.html └── test_amazonscraper.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pytest 104 | .pytest_cache/ 105 | 106 | # code 107 | .vscode/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | before_script: 6 | - pip install -r requirements.txt 7 | - pip install python-coveralls 8 | - pip install pytest-cov 9 | install: 10 | - pip install . 
11 | # command to run tests 12 | script: 13 | - pytest # or py.test for Python versions 3.5 and below 14 | after_success: 15 | coveralls 16 | deploy: 17 | provider: pypi 18 | user: "thibdct" 19 | password: 20 | # To be generated in the repo path, with travis encrypt 21 | secure: "fAElle3s1UbGZ3vrYoOKp51TwMAsb2mcVnqoLUjRmPDENTjnOAGws9g8fDI1NlwuOhFBe7UCJucSOUf0muY97sdZAPFqH6VZ2xWmWVTDzGpsSWuQ4CdW0LnmIHXM3Sundh4gHYXtSLXvbXnq2uthF6/34fQoIMpoO0I6CPLkR1t3xdD6QNajFrdLeBPDnl2AfoNG3F814r48+bRpV8nHSbaDk9bSL7Io7HwRyodUv5jJ7ubNP8K25B7E5y3e9dCzGQM4+Nx/6m9+3HBJEXf68VwyTVtZgvGDy2OI7z4KdoTt9nfSJQr5R9sFvReT7W7H1cG4g/on3EJwf+14Hp1+Od0O55pJQwNr4BQdxwk07ZKbdy3K4xZ9wu3Wfp+nicV7nsP3KFWh1DnyBxXw95Yh1uBeWvO5SxgJcHF/SZmPqGVNrcg9X7hJ4SMucFAHeezkA6xnl5bzvvHmtprt0C/kvHKvDFl3m2t2NPjIz/mND/xnEEgidiGZ+egCqHNkq8YjLKilEva2JMSk8DFacThqQzxcffCeTpKErAzYhPWsP5WD+as4zR9IoOiL4YlI2pg4bsi/y45lMXluq02KxyFpWMExCWnjCSg2I+yvNqO2scn9525q4+b7zXhBc2qq/a3wcc/1NFzwU1BprspjkHoC3GmvOdTNtPsmNmmJKRuTyLY=" 22 | on: 23 | tags: true -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5-alpine AS build-env 2 | 3 | # This dockerfile allows you to use the amzon2csv.py command very easily 4 | 5 | # You can build the docker image with the command : 6 | # docker build --no-cache -t amazon2csv . 7 | 8 | # You can create a container and use the command with : 9 | # docker run -it --rm amazon2csv --keywords="Python programming" --maxproductnb=2 10 | 11 | RUN pip install -U --no-cache-dir --target /app amazonscraper \ 12 | && find /app | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf 13 | 14 | FROM gcr.io/distroless/python3 15 | 16 | COPY --from=build-env /app /app 17 | 18 | ENV PYTHONPATH=/app 19 | ENV LC_ALL=C.UTF-8 20 | ENV LANG=C.UTF-8 21 | 22 | ENTRYPOINT ["python", "/app/bin/amazon2csv.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Thibault Ducret 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.py 2 | include *.txt 3 | include MANIFEST.in 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # amazon-scraper-python 2 | 3 | [![Travis](https://img.shields.io/travis/tducret/amazon-scraper-python.svg)](https://travis-ci.org/tducret/amazon-scraper-python) 4 | [![Coveralls github](https://img.shields.io/coveralls/github/tducret/amazon-scraper-python.svg)](https://coveralls.io/github/tducret/amazon-scraper-python) 5 | [![PyPI](https://img.shields.io/pypi/v/amazonscraper.svg)](https://pypi.org/project/amazonscraper/) 6 | [![Docker Build Status](https://img.shields.io/docker/build/thibdct/amazon2csv.svg)](https://hub.docker.com/r/thibdct/amazon2csv/) 7 | ![License](https://img.shields.io/github/license/tducret/amazon-scraper-python.svg) 8 | 9 | 10 | # Description 11 | 12 | This package allows you to search for products on [Amazon](https://www.amazon.com/) and extract some useful information (ratings, number of comments). 13 | 14 | I wrote a French blog post about it [here](https://www.tducret.com/scraping/2018/06/05/amazon2csv-ou-comment-filtrer-les-produits-d-amazon-dans-excel.html) 15 | 16 | # Requirements 17 | 18 | - Python 3 19 | - pip3 20 | 21 | # Installation 22 | 23 | ```bash 24 | pip3 install -U amazonscraper 25 | ``` 26 | 27 | # Command line tool `amazon2csv.py` 28 | 29 | After the package installation, you can use the `amazon2csv.py` command in the terminal. 30 | 31 | After passing a search request to the command (and an optional maximum number of products), it will return the results as csv : 32 | 33 | ```bash 34 | amazon2csv.py --keywords="Python programming" --maxproductnb=2 35 | ``` 36 | 37 | ```csv 38 | Product title,Rating,Number of customer reviews,Product URL,Image URL,ASIN 39 | "Python Crash Course: A Hands-On, Project-Based Introduction to Programming",4.5,370,https://www.amazon.com/Python-Crash-Course-Hands-Project-Based/dp/1593276036,https://images-na.ssl-images-amazon.com/images/I/51F48HFHq6L.jpg,1593276036 40 | "A Smarter Way to Learn Python: Learn it faster. 
Remember it longer.",4.7,384,https://www.amazon.com/Smarter-Way-Learn-Python-Remember-ebook/dp/B077Z55G3B,https://images-na.ssl-images-amazon.com/images/I/51fNZfTUPXL.jpg,B077Z55G3 41 | ``` 42 | 43 | You can also pass a search url (if you added complex filters for example), and save it to a file : 44 | 45 | ```bash 46 | amazon2csv.py --url="https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=python+scraping" > output.csv 47 | ``` 48 | 49 | You can then open it with your favorite spreadsheet editor (and play with the filters) : 50 | 51 | ![snapshot amazon2csv](snapshot_amazon2csv.png) 52 | 53 | More info about the command in the help : 54 | 55 | ```bash 56 | amazon2csv.py --help 57 | ``` 58 | 59 | # Using the `amazonscraper` Python package 60 | 61 | ```python 62 | # -*- coding: utf-8 -*- 63 | import amazonscraper 64 | 65 | results = amazonscraper.search("Python programming", max_product_nb=2) 66 | 67 | for result in results: 68 | print("{}".format(result.title)) 69 | print(" - ASIN : {}".format(result.asin)) 70 | print(" - {} out of 5 stars, {} customer reviews".format(result.rating, result.review_nb)) 71 | print(" - {}".format(result.url)) 72 | print(" - Image : {}".format(result.img)) 73 | print() 74 | 75 | print("Number of results : %d" % (len(results))) 76 | 77 | ``` 78 | 79 | Which will output : 80 | 81 | ``` 82 | Python Crash Course: A Hands-On, Project-Based Introduction to Programming 83 | - ASIN : 1593276036 84 | - 4.5 out of 5 stars, 370 customer reviews 85 | - https://www.amazon.com/Python-Crash-Course-Hands-Project-Based/dp/1593276036 86 | - Image : https://images-na.ssl-images-amazon.com/images/I/51F48HFHq6L.jpg 87 | 88 | A Smarter Way to Learn Python: Learn it faster. Remember it longer. 89 | - ASIN : B077Z55G3B 90 | - 4.7 out of 5 stars, 384 customer reviews 91 | - https://www.amazon.com/Smarter-Way-Learn-Python-Remember-ebook/dp/B077Z55G3B 92 | - Image : https://images-na.ssl-images-amazon.com/images/I/51fNZfTUPXL.jpg 93 | 94 | Number of results : 2 95 | ``` 96 | 97 | ### Attributes of the `Product` object 98 | 99 | Attribute name | Description 100 | ------------------- | --------------------------------------- 101 | title | Product title 102 | rating | Rating of the products (number between 0 and 5, False if missing) 103 | review_nb | Number of customer reviews (False if missing) 104 | url | Product URL 105 | img | Image URL 106 | asin | Product ASIN ([Amazon Standard Identification Number](https://fr.wikipedia.org/wiki/Amazon_Standard_Identification_Number)) 107 | 108 | -------------- 109 | 110 | # Docker 111 | 112 | You can use the amazon2csv tool with the [Docker image](https://hub.docker.com/r/thibdct/amazon2csv/) 113 | 114 | You may execute : 115 | 116 | `docker run -it --rm thibdct/amazon2csv --keywords="Python programming" --maxproductnb=2` 117 | 118 | ## 🤘 The easy way 🤘 119 | 120 | I also built a bash wrapper to execute the Docker container easily. 
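It checks that `docker` is available in your `$PATH`, forwards every argument to `docker run -it --rm thibdct/amazon2csv`, and adds `--upgrade` / `--uninstall` helpers (see the `amazon2csv` shell script further down in this repository).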
121 | 122 | Install it with : 123 | 124 | ```bash 125 | curl -s https://raw.githubusercontent.com/tducret/amazon-scraper-python/master/amazon2csv \ 126 | > /usr/local/bin/amazon2csv && chmod +x /usr/local/bin/amazon2csv 127 | ``` 128 | *You may replace `/usr/local/bin` with another folder that is in your $PATH* 129 | 130 | Check that it works : 131 | 132 | *On the first execution, the script will download the Docker image, so please be patient* 133 | 134 | ```bash 135 | amazon2csv --help 136 | amazon2csv --keywords="Python programming" --maxproductnb=2 137 | ``` 138 | 139 | You can upgrade the app with : 140 | 141 | ```bash 142 | amazon2csv --upgrade 143 | ``` 144 | 145 | and even uninstall with : 146 | 147 | ```bash 148 | amazon2csv --uninstall 149 | ``` 150 | 151 | ## TODO 152 | 153 | - [ ] If no product was found with the CSS selectors, it may be a new Amazon page style => change user agent and get the new page. Loop on all the user agents and check all the CSS selectors again 154 | - [ ] Find a way to get the products without css selectors -------------------------------------------------------------------------------- /amazon2csv: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # A wrapper script for invoking a docker container 4 | # Based on https://spin.atomicobject.com/2015/11/30/command-line-tools-docker/ 5 | 6 | DOCKER_IMAGE="thibdct/amazon2csv" 7 | 8 | error(){ 9 | error_code=$1 10 | echo "ERROR: $2" >&2 11 | exit $1 12 | } 13 | check_cmd_in_path(){ 14 | cmd=$1 15 | which $cmd > /dev/null 2>&1 || error 1 "$cmd not found!" 16 | } 17 | upgrade(){ 18 | docker pull $DOCKER_IMAGE 19 | exit 1 20 | } 21 | uninstall(){ 22 | read -p "Are you sure to uninstall (y/n)? " -n 1 -r 23 | echo 24 | if [[ $REPLY =~ ^[Yy]$ ]] 25 | then 26 | docker rmi $DOCKER_IMAGE 27 | rm $0 28 | fi 29 | exit 1 30 | } 31 | 32 | # Checks for dependencies 33 | check_cmd_in_path docker 34 | 35 | case $1 in 36 | --uninstall) 37 | uninstall 38 | ;; 39 | --upgrade) 40 | upgrade 41 | ;; 42 | esac 43 | 44 | # Run our containerized command 45 | exec docker run -it --rm $DOCKER_IMAGE "$@" -------------------------------------------------------------------------------- /amazon2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import click 4 | import amazonscraper 5 | 6 | 7 | @click.command() 8 | @click.option( 9 | '--keywords', '-k', 10 | type=str, 11 | help='your keywords to find some products (ex : "python+scraping")', 12 | default="", 13 | ) 14 | @click.option( 15 | '--url', '-u', 16 | type=str, 17 | help='an Amazon result page URL (ex : \ 18 | https://www.amazon.com/s/field-keywords=python%2Bscraping', 19 | default="", 20 | ) 21 | @click.option( 22 | '--csvseparator', '-s', 23 | type=str, 24 | help='CSV separator (ex : ;)', 25 | default=",", 26 | ) 27 | @click.option( 28 | '--maxproductnb', '-m', 29 | type=int, 30 | help='Maximum number of products (ex : 100)', 31 | default="100", 32 | ) 33 | @click.version_option( 34 | version=amazonscraper.__version__, 35 | message='%(prog)s, based on amazonscraper module version %(version)s' 36 | ) 37 | @click.option( 38 | '--outputhtml', '-o', 39 | type=str, 40 | help='Save the html page to the current folder with the specified name', 41 | default="", 42 | ) 43 | def main(keywords, url, csvseparator, maxproductnb, outputhtml): 44 | """ Search for products on Amazon, and extract it as CSV """ 45 | products = amazonscraper.search( 46 
| keywords=keywords, 47 | search_url=url, 48 | max_product_nb=maxproductnb) 49 | 50 | print(products.csv(separator=csvseparator)) 51 | 52 | if (outputhtml != ""): 53 | with open(outputhtml, "w") as f: 54 | f.write(products.last_html_page) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /amazonscraper/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ This package allows you to search for products on Amazon and extract some 3 | useful information (title, ratings, number of reviews). 4 | """ 5 | from builtins import object 6 | import csv 7 | from amazonscraper.client import Client 8 | 9 | 10 | __version__ = '0.1.2' # Should be the same in setup.py 11 | 12 | 13 | class Products(object): 14 | """Class of the products""" 15 | def __init__(self, product_dict_list=[]): 16 | self.products = [] 17 | self.last_html_page = "" # HTML content of the last scraped page 18 | self.html_pages = [] 19 | for product_dict in product_dict_list: 20 | self._add_product(product_dict) 21 | 22 | def _add_product(self, product_dict): 23 | """ Append a product to the object product list 24 | >>> p = Products([{'title':'Book title', 'rating': '4.2',\ 25 | 'review_nb': '15', 'url':'http://www.amazon.com/book'}]) 26 | >>> p.products[1] 27 | Traceback (most recent call last): 28 | ... 29 | IndexError: list index out of range 30 | >>> p._add_product({'title':'Book title 2', 'rating': '4.3',\ 31 | 'review_nb': '12', 'url':'http://www.amazon.com/book2'}) 32 | >>> len(p.products) 33 | 2 34 | >>> print(p[1].title) 35 | Book title 2 36 | """ 37 | product = Product(product_dict) 38 | self.products.append(product) 39 | 40 | def __len__(self): 41 | return len(self.products) 42 | 43 | def __getitem__(self, key): 44 | """ Method to access the object as a list 45 | (ex : products[1]) """ 46 | return self.products[key] 47 | 48 | def csv(self, file_name, separator=","): 49 | """ Returns a CSV string with the product info 50 | >>> p = Products([{'title':'Book title', 'rating': '4.2',\ 51 | 'review_nb': '15', 'url':'http://www.amazon.com/book', 'asin':'A12345'}]) 52 | >>> p.csv() 53 | 'Product title,Rating,Number of customer reviews,\ 54 | Product URL,Image URL,ASIN\\n"Book title",4.2,15,http://www.amazon.com/book,,A12345' 55 | 56 | >>> print(p.csv(separator=";")) 57 | Product title;Rating;Number of customer reviews;Product URL;Image URL;ASIN 58 | "Book title";4,2;15;http://www.amazon.com/book;;A12345 59 | 60 | >>> p2 = Products() 61 | >>> p2.csv() 62 | 'Product title,Rating,Number of customer reviews,Product URL,Image URL,ASIN' 63 | """ 64 | 65 | if not self.products: 66 | return 67 | 68 | with open(file_name, 'w') as csvfile: 69 | writer = csv.writer(csvfile, delimiter=separator) 70 | 71 | header = list(self.products[0].product.keys()) 72 | writer.writerow(header) 73 | 74 | for product in self.products: 75 | writer.writerow(list(product.product.values())) 76 | 77 | class Product(object): 78 | """Class of a product""" 79 | def __init__(self, product_dict={}): 80 | self.product = product_dict 81 | 82 | def __getattr__(self, attr): 83 | """ Method to access a dictionnary key as an attribute 84 | (ex : product.title) """ 85 | return self.product.get(attr, "") 86 | 87 | 88 | def search(keywords="", search_url="", max_product_nb=100): 89 | """Function to get the list of products from amazon""" 90 | amz = Client() 91 | product_dict_list = amz._get_products( 92 | 
keywords=keywords, 93 | search_url=search_url, 94 | max_product_nb=max_product_nb) 95 | products = Products(product_dict_list) 96 | products.html_pages = amz.html_pages 97 | products.last_html_page = amz.html_pages[-1] 98 | 99 | return products 100 | -------------------------------------------------------------------------------- /amazonscraper/client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Module to get and parse the product info on Amazon 4 | """ 5 | 6 | import requests 7 | import re 8 | from urllib.parse import urljoin 9 | from bs4 import BeautifulSoup 10 | import time 11 | 12 | _BASE_URL = "https://www.amazon.com/" 13 | _DEFAULT_BEAUTIFULSOUP_PARSER = "html.parser" 14 | _DEFAULT_USER_AGENT = 'Mozilla/5.0 (Linux; Android 7.0; \ 15 | SM-A520F Build/NRD90M; wv) AppleWebKit/537.36 \ 16 | (KHTML, like Gecko) Version/4.0 \ 17 | Chrome/65.0.3325.109 Mobile Safari/537.36' 18 | _CHROME_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; \ 19 | Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) \ 20 | Chrome/67.0.3396.79 Safari/537.36' 21 | 22 | _USER_AGENT_LIST = [ 23 | _DEFAULT_USER_AGENT, 24 | _CHROME_DESKTOP_USER_AGENT, 25 | ] 26 | 27 | _CSS_SELECTORS_MOBILE = { 28 | "product": "#resultItems > li", 29 | "title": "a > div > div.sx-table-detail > h5 > span", 30 | "rating": "a > div > div.sx-table-detail > \ 31 | div.a-icon-row.a-size-small > i > span", 32 | "review_nb": "a > div > div.sx-table-detail > \ 33 | div.a-icon-row.a-size-small > span", 34 | "url": "a[href]", 35 | "img": "img[src]", 36 | "next_page_url": "ul.a-pagination > li.a-last > a[href]", 37 | } 38 | # Sometimes, the result page is displayed with another layout 39 | _CSS_SELECTORS_MOBILE_GRID = { 40 | "product": "#grid-atf-content > li > div.s-item-container", 41 | "title": "a > div > h5.sx-title > span", 42 | "rating": "a > div > div.a-icon-row.a-size-mini > i > span", 43 | "review_nb": "a > div > div.a-icon-row.a-size-mini > span", 44 | "url": "a[href]", 45 | "img": "img[src]", 46 | "next_page_url": "ul.a-pagination > li.a-last > a[href]", 47 | } 48 | _CSS_SELECTORS_DESKTOP = { 49 | "product": "ul > li.s-result-item > div.s-item-container", 50 | "title": "a.s-access-detail-page > h2", 51 | "rating": "i.a-icon-star > span", 52 | "review_nb": "div.a-column.a-span5.a-span-last > \ 53 | div.a-row.a-spacing-mini > \ 54 | a.a-size-small.a-link-normal.a-text-normal", 55 | "url": "div.a-row.a-spacing-small > div.a-row.a-spacing-none > a[href]", 56 | "img": "div.a-column.a-span12.a-text-center > a.a-link-normal.a-text-normal > img[src]", 57 | "next_page_url": "a#pagnNextLink", 58 | } 59 | _CSS_SELECTORS_DESKTOP_2 = { 60 | "product": "div.s-result-list.sg-row > div.s-result-item", 61 | "title": "div div.sg-row h5 > span", 62 | "rating": "div div.sg-row .a-spacing-top-mini i span", 63 | "review_nb": "div div.sg-row .a-spacing-top-mini span.a-size-small", 64 | "url": "div div a.a-link-normal", 65 | "img": "img[src]", 66 | "next_page_url": "li.a-last > a[href]", 67 | } 68 | 69 | _CSS_SELECTOR_LIST = [ 70 | _CSS_SELECTORS_MOBILE, 71 | _CSS_SELECTORS_MOBILE_GRID, 72 | _CSS_SELECTORS_DESKTOP, 73 | _CSS_SELECTORS_DESKTOP_2, 74 | ] 75 | 76 | # Maximum number of requests to do if Amazon returns a bad page (anti-scraping) 77 | _MAX_TRIAL_REQUESTS = 5 78 | _WAIT_TIME_BETWEEN_REQUESTS = 1 79 | 80 | 81 | class Client(object): 82 | """Do the requests with the Amazon servers""" 83 | 84 | def __init__(self): 85 | """ Init of the client """ 86 | 87 | self.session 
= requests.session() 88 | self.current_user_agent_index = 0 89 | self.headers = { 90 | 'Host': 'www.amazon.com', 91 | 'User-Agent': _USER_AGENT_LIST[0], 92 | 'Accept': 'text/html,application/xhtml+xml,\ 93 | application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 94 | } 95 | self.product_dict_list = [] 96 | self.html_pages = [] 97 | 98 | def _change_user_agent(self): 99 | """ Change the User agent of the requests 100 | (useful if anti-scraping) 101 | >>> c = Client() 102 | >>> c.current_user_agent_index 103 | 0 104 | >>> c.headers['User-Agent'] == _USER_AGENT_LIST[0] 105 | True 106 | >>> c._change_user_agent() 107 | >>> c.current_user_agent_index 108 | 1 109 | >>> c.headers['User-Agent'] == _USER_AGENT_LIST[1] 110 | True 111 | >>> c2 = Client() 112 | >>> for i in range(0,9): c2._change_user_agent() 113 | >>> c2.current_user_agent_index == 9 % len(_USER_AGENT_LIST) 114 | True 115 | """ 116 | index = (self.current_user_agent_index + 1) % len(_USER_AGENT_LIST) 117 | self.headers['User-Agent'] = _USER_AGENT_LIST[index] 118 | self.current_user_agent_index = index 119 | 120 | def _get(self, url): 121 | """ GET request with the proper headers """ 122 | ret = self.session.get(url, headers=self.headers) 123 | if ret.status_code != 200: 124 | raise ConnectionError( 125 | 'Status code {status} for url {url}\n{content}'.format( 126 | status=ret.status_code, url=url, content=ret.text)) 127 | return ret 128 | 129 | def _update_headers(self, search_url): 130 | """ Update the 'Host' field in the header with the proper Amazon domain 131 | >>> c = Client() 132 | >>> print(c.headers['Host']) 133 | www.amazon.com 134 | >>> c._update_headers("https://www.amazon.fr/s/lkdjsdlkjlk") 135 | >>> print(c.headers['Host']) 136 | www.amazon.fr 137 | """ 138 | self.base_url = "https://" + \ 139 | search_url.split("://")[1].split("/")[0] + "/" 140 | self.headers['Host'] = self.base_url.split("://")[1].split("/")[0] 141 | 142 | def _get_search_url(self, keywords): 143 | """ Get the Amazon search URL, based on the keywords passed 144 | >>> c = Client() 145 | >>> print(c._get_search_url(keywords="python")) 146 | https://www.amazon.com/s?k=python 147 | """ 148 | search_url = urljoin(_BASE_URL, ("s?k=%s" % (keywords))) 149 | return search_url 150 | 151 | def _check_page(self, html_content): 152 | """Check if the page is a valid result page 153 | (even if there is no result) """ 154 | if "Sign in for the best experience" in html_content: 155 | valid_page = False 156 | elif "The request could not be satisfied." in html_content: 157 | valid_page = False 158 | elif "Robot Check" in html_content: 159 | valid_page = False 160 | else: 161 | valid_page = True 162 | return valid_page 163 | 164 | 165 | def _get_page_html(self, search_url): 166 | """Retrieve the page at `search_url`""" 167 | trials = 0 168 | res = None 169 | 170 | while trials < _MAX_TRIAL_REQUESTS: 171 | 172 | print('Trying user agent: {}'.format(self.headers['User-Agent'])) 173 | trials += 1 174 | try: 175 | res = self._get(search_url) 176 | 177 | valid_page = self._check_page(res.text) 178 | 179 | # To counter the "SSLError bad handshake" exception 180 | except requests.exceptions.SSLError: 181 | valid_page = False 182 | 183 | except ConnectionError: 184 | valid_page = False 185 | 186 | if valid_page: 187 | break 188 | 189 | self._change_user_agent() 190 | time.sleep(_WAIT_TIME_BETWEEN_REQUESTS) 191 | 192 | if not valid_page: 193 | raise ValueError('No valid pages found! Perhaps the page returned is a CAPTCHA? 
Check products.last_html_page') 194 | return res.text 195 | 196 | def _get_n_ratings(self, product): 197 | """Given the HTML of a `product`, extract the number of ratings""" 198 | 199 | n_ratings_css_selectors = [ 200 | "div.a-row.a-size-small span.a-size-base", 201 | "div div.sg-row .a-spacing-top-mini span.a-size-small", 202 | "div.a-column.a-span5.a-span-last > div.a-row.a-spacing-mini > a.a-size-small.a-link-normal.a-text-normal", 203 | ] 204 | 205 | for selector in n_ratings_css_selectors: 206 | 207 | n_ratings = _css_select(product, selector) 208 | 209 | try: 210 | n_ratings = int(n_ratings.replace(',', '')) 211 | break 212 | except ValueError: 213 | pass 214 | 215 | if not n_ratings: 216 | print(f' Failed to extract number of ratings!') 217 | return float('nan') 218 | 219 | return n_ratings 220 | 221 | 222 | def _get_title(self, product): 223 | """Given the HTML of a `product`, extract the title""" 224 | 225 | title_css_selectors = [ 226 | 'h5 span', 227 | "a.s-access-detail-page > h2", 228 | "div div.sg-row h5 > span" 229 | ] 230 | 231 | for selector in title_css_selectors: 232 | 233 | title = _css_select(product, selector) 234 | 235 | if title: 236 | break 237 | 238 | if not title: 239 | print(' Failed to extract title!') 240 | 241 | return title 242 | 243 | 244 | def _get_rating(self, product): 245 | """Given the HTML of a `product`, extract the average rating""" 246 | 247 | rating = re.search(r'(\d.\d) out of 5', str(product)) 248 | 249 | if rating: 250 | rating = rating.groups()[0] 251 | # convert string to float and replace European decimal seperator ',' with '.'s 252 | rating = float(rating.replace(",", ".")) 253 | else: 254 | rating = float('nan') 255 | print(f' Failed to extract rating!') 256 | 257 | return rating 258 | 259 | 260 | def _get_prices(self, product): 261 | """ 262 | Given the HTML of a `product`, extract all prices. 263 | """ 264 | # XXX currently does not handle shipping prices or prices for the 265 | # various formats of books. 266 | 267 | # match all prices of the form $X,XXX.XX: 268 | raw_prices = product.find_all(text=re.compile('\$[\d,]+.\d\d')) 269 | 270 | prices = { 271 | 'prices_per_unit': set(), 272 | 'units': set(), 273 | 'prices_main': set(), 274 | } 275 | 276 | # attempt to identify the prices 277 | for raw_price in raw_prices: 278 | 279 | # get the price as a float rather than a string or BeautifulSoup object 280 | price = float(re.search('\$([\d,]+.\d\d)', raw_price).groups()[0]) 281 | 282 | # ignore promotional strikethrough prices 283 | if raw_price.parent.parent.attrs.get('data-a-strike') == 'true': 284 | continue 285 | 286 | # ignore promotional freebies 287 | elif raw_price == '$0.00': 288 | continue 289 | 290 | # extract price per unit price and unit 291 | elif raw_price.startswith('(') and '/' in raw_price: 292 | price_per_unit = re.findall(r'/(.*)\)', raw_price)[0] 293 | prices['prices_per_unit'].add(price) 294 | prices['units'].add(price_per_unit) 295 | 296 | # any other price is hopefully the main price 297 | else: 298 | prices['prices_main'].add(price) 299 | 300 | # clean up the discoverd prices 301 | for price_type, price_value in prices.copy().items(): 302 | 303 | if len(price_value) == 0: 304 | prices[price_type] = float('nan') 305 | 306 | elif len(price_value) == 1: 307 | prices[price_type] = price_value.pop() 308 | 309 | else: 310 | print(' Multiple prices found. 
Consider selecting a format on Amazon and using that URL!') 311 | prices[price_type] = ', '.join(map(str, price_value)) 312 | 313 | return prices 314 | 315 | def _extract_page(self, page, max_product_nb): 316 | """ 317 | Extract the products on a given HTML page of Amazon results and return 318 | the URL of the next page of results 319 | """ 320 | 321 | soup = BeautifulSoup(page, _DEFAULT_BEAUTIFULSOUP_PARSER) 322 | 323 | # shuffle through CSS selectors until we get a list of products 324 | selector = 0 325 | for css_selector_dict in _CSS_SELECTOR_LIST: 326 | selector += 1 327 | css_selector = css_selector_dict.get("product", "") 328 | products = soup.select(css_selector) 329 | 330 | if len(products) >= 1: 331 | break 332 | 333 | # For each product of the result page 334 | for product in products: 335 | 336 | # Check if the maximum number to search has been reached 337 | if len(self.product_dict_list) >= max_product_nb: 338 | break 339 | 340 | product_dict = {} 341 | 342 | # extract title 343 | product_dict['title'] = self._get_title(product) 344 | 345 | print('Extracting {}'.format(product_dict['title'][:80])) 346 | 347 | # extract rating 348 | product_dict['rating'] = self._get_rating(product) 349 | 350 | # extract number of ratings 351 | product_dict['review_nb'] = self._get_n_ratings(product) 352 | 353 | # Get image before url and asin 354 | css_selector = css_selector_dict.get("img", "") 355 | img_product_soup = product.select(css_selector) 356 | if img_product_soup: 357 | img_url = img_product_soup[0].get('src') 358 | # Check if it is not a base64 formatted image 359 | if "data:image/webp" in img_url: 360 | img_url = img_product_soup[0].get( 361 | 'data-search-image-source-set', 362 | '').split(' ')[0] 363 | 364 | if img_url != '': 365 | img_url = _get_high_res_img_url(img_url=img_url) 366 | 367 | product_dict['img'] = img_url 368 | 369 | 370 | # Extract ASIN and product URL 371 | css_selector = css_selector_dict.get("url", "") 372 | 373 | url_product_soup = product.select(css_selector) 374 | 375 | product_dict['url'] = '' 376 | product_dict['asin'] = '' 377 | 378 | if url_product_soup: 379 | url = urljoin( 380 | self.base_url, 381 | url_product_soup[0].get('href')) 382 | 383 | if 'slredirect' not in url: 384 | product_dict['url'] = url.split("/ref=")[0] 385 | 386 | product_dict['asin'] = product_dict['url'].split("/")[-1] 387 | 388 | if not product_dict['url']: 389 | print(' Failed to extract URL!') 390 | 391 | if not product_dict['asin']: 392 | print(' Failed to extract ASIN!') 393 | 394 | 395 | # Amazon has many prices associated with a given product 396 | prices = self._get_prices(product) 397 | product_dict.update(prices) 398 | 399 | self.product_dict_list.append(product_dict) 400 | 401 | 402 | css_selector = css_selector_dict.get("next_page_url") 403 | url_next_page_soup = soup.select(css_selector) 404 | if url_next_page_soup: 405 | url_next_page = urljoin( 406 | self.base_url, 407 | url_next_page_soup[0].get('href')) 408 | else: 409 | raise(ValueError('Could not find the URL of the next page of results!')) 410 | return url_next_page 411 | 412 | 413 | def _get_products(self, keywords="", search_url="", max_product_nb=100): 414 | 415 | if search_url == "": 416 | search_url = self._get_search_url(keywords) 417 | self._update_headers(search_url) 418 | 419 | while len(self.product_dict_list) < max_product_nb: 420 | 421 | # get the html of the specified page 422 | page = self._get_page_html(search_url) 423 | self.html_pages.append(page) 424 | 425 | # extract the needed products from 
the page and return the url of 426 | # the next page 427 | search_url = self._extract_page(page, max_product_nb=max_product_nb) 428 | 429 | return self.product_dict_list 430 | 431 | 432 | def _css_select(soup, css_selector): 433 | """ 434 | Returns the content of the element pointed by the CSS selector, or an empty 435 | string if not found 436 | """ 437 | selection = soup.select(css_selector) 438 | retour = "" 439 | if len(selection) > 0: 440 | if hasattr(selection[0], 'text'): 441 | retour = selection[0].text.strip() 442 | return retour 443 | 444 | def _get_high_res_img_url(img_url): 445 | """ Returns a modified url pointing to the high resolution version of 446 | the image 447 | >>> print(_get_high_res_img_url("https://images-na.ssl-images-amazon.com/\ 448 | images/I/513gErH1dML._AC_SX236_SY340_FMwebp_QL65_.jpg")) 449 | https://images-na.ssl-images-amazon.com/\ 450 | images/I/513gErH1dML.jpg 451 | >>> print(_get_high_res_img_url("https://images-na.ssl-images-amazon.com/\ 452 | images/I/51F48HFHq6L._AC_SX118_SY170_QL70_.jpg")) 453 | https://images-na.ssl-images-amazon.com/\ 454 | images/I/51F48HFHq6L.jpg 455 | """ 456 | high_res_url = img_url.split("._")[0] + ".jpg" 457 | return high_res_url 458 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules --cov amazonscraper -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.6.0 2 | click>=6.7 3 | beautifulsoup4>=4.6.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | [aliases] 4 | test=pytest 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import setup 3 | try: # Pour pip >= 10 4 | from pip._internal.req import parse_requirements 5 | except ImportError: # For pip <= 9 6 | from pip.req import parse_requirements 7 | 8 | # Based on http://peterdowns.com/posts/first-time-with-pypi.html 9 | 10 | __version__ = '0.1.2' # Should match with __init.py__ 11 | _NOM_PACKAGE = 'amazonscraper' 12 | _URL_GITHUB = 'https://github.com/tducret/amazon-scraper-python' 13 | _DESCRIPTION = 'Package to search for products on Amazon and extract \ 14 | some useful information (title, ratings, number of reviews)' 15 | _MOTS_CLES = ['api', 'amazon', 'python', 'amazonscraper', 'parsing', 16 | 'python-wrapper', 'scraping', 'scraper', 'parser'] 17 | _SCRIPTS = ['amazon2csv.py'] 18 | # To delete here + 'scripts' dans setup() 19 | # if no command is used in the package 20 | 21 | install_reqs = parse_requirements('requirements.txt', session='hack') 22 | try: 23 | requirements = [str(ir.req) for ir in install_reqs] 24 | except: 25 | requirements = [str(ir.requirement) for ir in install_reqs] 26 | 27 | setup( 28 | name=_NOM_PACKAGE, 29 | packages=[_NOM_PACKAGE], 30 | package_data={}, 31 | scripts=_SCRIPTS, 32 | version=__version__, 33 | license='MIT', 34 | platforms='Posix; MacOS X', 35 | description=_DESCRIPTION, 36 | long_description=_DESCRIPTION, 37 | author='Thibault Ducret', 38 | 
author_email='thibault.ducret@gmail.com', 39 | url=_URL_GITHUB, 40 | download_url='%s/tarball/%s' % (_URL_GITHUB, __version__), 41 | keywords=_MOTS_CLES, 42 | setup_requires=requirements, 43 | install_requires=requirements, 44 | classifiers=['Programming Language :: Python :: 3'], 45 | python_requires='>=3', 46 | tests_require=['pytest'], 47 | ) 48 | 49 | # ------------------------------------------ 50 | # To upload a new version on pypi 51 | # ------------------------------------------ 52 | # Make sure everything was pushed (with a git status) 53 | # (or git commit --am "Comment" and git push) 54 | # git tag 0.1.2 -m "Added image urls for each product"; git push --tags 55 | 56 | # Do a generation test on the pypi test repository 57 | # python3 setup.py sdist register -r pypitest 58 | 59 | # Upload test of the package on the pypi test repository 60 | # python3 setup.py sdist upload -r pypitest 61 | 62 | # Upload of the package on the official pypi repository 63 | # python3 setup.py sdist upload -r pypi 64 | 65 | # If you need to delete a tag 66 | # git push --delete origin VERSION 67 | # git tag -d VERSION 68 | -------------------------------------------------------------------------------- /snapshot_amazon2csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tducret/amazon-scraper-python/30f812f0d2f2e7dd2f2af7ec1ad23626a0a0cabd/snapshot_amazon2csv.png -------------------------------------------------------------------------------- /test/not_satisfied.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | The request could not be satisfied. 7 | 8 | 9 | 10 | The request could not be satisfied. 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /test/test_amazonscraper.py: -------------------------------------------------------------------------------- 1 | import amazonscraper 2 | import pytest 3 | 4 | _MAX_PRODUCT_NB = 10 5 | 6 | 7 | def test_amazonscraper_get_products_with_keywords(): 8 | products = amazonscraper.search( 9 | keywords="Python", 10 | max_product_nb=_MAX_PRODUCT_NB) 11 | 12 | assert len(products) == _MAX_PRODUCT_NB 13 | 14 | 15 | def test_amazonscraper_get_products_with_url(): 16 | url = "https://www.amazon.com/s/\ 17 | ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=python" 18 | products = amazonscraper.search( 19 | search_url=url, 20 | max_product_nb=_MAX_PRODUCT_NB) 21 | 22 | assert isinstance(products, amazonscraper.Products) 23 | assert len(products) == _MAX_PRODUCT_NB 24 | product = products[0] 25 | assert isinstance(product, amazonscraper.Product) 26 | assert product.title != "" 27 | assert product.review_nb != "" 28 | assert product.rating != "" 29 | assert product.url != "" 30 | assert product.asin != "" 31 | 32 | 33 | def test_amazonscraper_invalid_url(): 34 | url = "https://0.0.0.0" 35 | with pytest.raises(Exception): 36 | amazonscraper.search( 37 | search_url=url, 38 | max_product_nb=_MAX_PRODUCT_NB) 39 | 40 | 41 | def test_amazonscraper_sign_in_suggestion_url(): 42 | # or https://www.amazon.com/ref=assoc_res_sw_logo 43 | url = "https://www.amazon.com/gp/aw/ref=mw_access" 44 | products = amazonscraper.search( 45 | search_url=url, 46 | max_product_nb=_MAX_PRODUCT_NB) 47 | assert len(products) == 0 48 | 49 | 50 | def test_amazonscraper_not_satisfied_url(): 51 | url = "https://raw.githack.com/tducret/\ 52 | amazon-scraper-python/master/test/not_satisfied.html" 53 | products = 
amazonscraper.search( 54 | search_url=url, 55 | max_product_nb=_MAX_PRODUCT_NB) 56 | assert len(products) == 0 57 | 58 | 59 | def test_amazonscraper_404_url(): 60 | url = "https://raw.githack.com/tducret/\ 61 | amazon-scraper-python/master/test/404.html" 62 | products = amazonscraper.search( 63 | search_url=url, 64 | max_product_nb=_MAX_PRODUCT_NB) 65 | assert len(products) == 0 66 | 67 | 68 | def test_amazonscraper_get_100_products(): 69 | products = amazonscraper.search( 70 | keywords="Python", 71 | max_product_nb=100) 72 | 73 | assert len(products) == 100 74 | 75 | 76 | def test_amazonscraper_csv_header(): 77 | products = amazonscraper.search( 78 | keywords="Python", 79 | max_product_nb=1) 80 | products.csv('test.csv') 81 | with open('test.csv') as f: 82 | csv_str = f.read() 83 | assert "title,rating,review_nb,img,url,asin,prices_per_unit,units,prices_main" in csv_str 84 | --------------------------------------------------------------------------------
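Below is a minimal end-to-end usage sketch stitched together from the code shown above: `amazonscraper.search()` from `amazonscraper/__init__.py`, the `Product` attributes listed in the README, and the file-writing `Products.csv(file_name, separator)` call exercised in `test/test_amazonscraper.py`. The output file names are illustrative only, and the run depends on live access to amazon.com (it may fail if Amazon answers with a CAPTCHA page, as `client.py` warns).

```python
# -*- coding: utf-8 -*-
"""Minimal end-to-end sketch for the amazonscraper package (assumed API, see lead-in above)."""
import amazonscraper

# Search by keywords, capped at a handful of products (same call as in the README example)
products = amazonscraper.search(keywords="Python programming", max_product_nb=5)

# Products supports len() and integer indexing, so plain iteration works
for product in products:
    print(product.title)
    print("  ASIN   : {}".format(product.asin))
    print("  Rating : {} out of 5 stars, {} customer reviews".format(
        product.rating, product.review_nb))
    print("  URL    : {}".format(product.url))

# Write every scraped field (including the price columns built by Client._get_prices)
# to a CSV file with ';' as the separator -- the file name is illustrative
products.csv("results.csv", separator=";")

# The raw HTML of the last fetched result page is kept for debugging scraping issues
with open("last_page.html", "w") as f:
    f.write(products.last_html_page)
```

Iteration over `products` relies on `Products.__getitem__`/`__len__`, and any attribute missing from a scraped product comes back as an empty string through `Product.__getattr__`, so the loop above does not raise on partially extracted results.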