├── .gitignore ├── README.md ├── amazon ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ ├── amazon_reviews.py │ ├── amazon_search.py │ └── amazon_search_product.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | venv/ 30 | 31 | 32 | ## Custom 33 | data/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # amazon-python-scrapy-scraper 2 | Python Scrapy spiders that scrape product data and reviews from [Amazon.com](https://www.amazon.com/). 3 | 4 | This Scrapy project contains 3 separate spiders: 5 | 6 | | Spider | Description | 7 | |----------|-------------| 8 | | `amazon_search` | Scrapes all product data from the Amazon product search page for a given list of keywords. | 9 | | `amazon_search_product` | Crawls Amazon product search pages for a given list of keywords, then scrapes each individual product page. | 10 | | `amazon_reviews` | Scrapes all Amazon product reviews from a list of product ASINs. | 11 | 12 | 13 | The following articles go through in detail how these Amazon spiders were developed, and you can use them to understand the spiders and edit them for your own use case. 14 | 15 | - [Python Scrapy: Build A Amazon.com Product Scraper](https://scrapeops.io/python-scrapy-playbook/python-scrapy-amazon-product-scraper/) 16 | - [Python Scrapy: Build A Amazon.com Product Reviews Scraper](https://scrapeops.io/python-scrapy-playbook/python-scrapy-amazon-reviews-scraper/) 17 | 18 | ## ScrapeOps Proxy 19 | These Amazon spiders use [ScrapeOps Proxy](https://scrapeops.io/proxy-aggregator/) as the proxy solution. ScrapeOps has a free plan that allows you to make up to 1,000 requests per month, which makes it ideal for the development phase, but it can be easily scaled up to millions of pages per month if need be. 20 | 21 | You can [sign up for a free API key here](https://scrapeops.io/app/register/main). 22 | 23 | To use the ScrapeOps Proxy you need to first install the proxy middleware: 24 | 25 | ``` 26 | 27 | pip install scrapeops-scrapy-proxy-sdk 28 | 29 | ``` 30 | 31 | Then activate the ScrapeOps Proxy by adding your API key to the `SCRAPEOPS_API_KEY` setting in the ``settings.py`` file. 32 | 33 | ```python 34 | 35 | SCRAPEOPS_API_KEY = 'YOUR_API_KEY' 36 | 37 | SCRAPEOPS_PROXY_ENABLED = True 38 | 39 | DOWNLOADER_MIDDLEWARES = { 40 | 'scrapeops_scrapy_proxy_sdk.scrapeops_scrapy_proxy_sdk.ScrapeOpsScrapyProxySdk': 725, 41 | } 42 | 43 | ``` 44 | 45 | 46 | ## ScrapeOps Monitoring 47 | To monitor the scrapers, this project uses the [ScrapeOps Monitor](https://scrapeops.io/monitoring-scheduling/), a free monitoring tool specifically designed for web scraping.
48 | 49 | **Live demo here:** [ScrapeOps Demo](https://scrapeops.io/app/login/demo) 50 | 51 | ![ScrapeOps Dashboard](https://scrapeops.io/assets/images/scrapeops-promo-286a59166d9f41db1c195f619aa36a06.png) 52 | 53 | To use the ScrapeOps Monitor you need to first install the monitoring SDK: 54 | 55 | ``` 56 | 57 | pip install scrapeops-scrapy 58 | 59 | ``` 60 | 61 | 62 | Then activate the ScrapeOps Monitor by adding your API key to the `SCRAPEOPS_API_KEY` setting in the ``settings.py`` file. 63 | 64 | ```python 65 | 66 | SCRAPEOPS_API_KEY = 'YOUR_API_KEY' 67 | 68 | # Add In The ScrapeOps Monitoring Extension 69 | EXTENSIONS = { 70 | 'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500, 71 | } 72 | 73 | 74 | DOWNLOADER_MIDDLEWARES = { 75 | 76 | ## ScrapeOps Monitor 77 | 'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550, 78 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, 79 | 80 | ## Proxy Middleware 81 | 'scrapeops_scrapy_proxy_sdk.scrapeops_scrapy_proxy_sdk.ScrapeOpsScrapyProxySdk': 725, 82 | } 83 | 84 | ``` 85 | 86 | If you are using both the ScrapeOps Proxy & Monitoring then you just need to enter the API key once. 87 | 88 | 89 | ## Running The Scrapers 90 | Make sure Scrapy and the ScrapeOps Monitor are installed: 91 | 92 | ``` 93 | 94 | pip install scrapy scrapeops-scrapy 95 | 96 | ``` 97 | 98 | To run the Amazon spiders, you should first set the search queries you want to scrape by updating the `keyword_list` list in the spiders: 99 | 100 | ```python 101 | 102 | def start_requests(self): 103 | keyword_list = ['ipad'] 104 | for keyword in keyword_list: 105 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page=1' 106 | yield scrapy.Request(url=amazon_search_url, callback=self.parse_search_results, meta={'keyword': keyword, 'page': 1}) 107 | 108 | ``` 109 | 110 | Then to run a spider, enter the following command: 111 | 112 | ``` 113 | 114 | scrapy crawl amazon_search_product 115 | 116 | ``` 117 | 118 | 119 | ## Customizing The Amazon Product Scraper 120 | The following are instructions on how to modify the Amazon Product scraper for your particular use case. 121 | 122 | Check out this [guide to building an Amazon.com Scrapy product spider](https://scrapeops.io/python-scrapy-playbook/python-scrapy-amazon-product-scraper/) if you need any more information. 123 | 124 | ### Configuring Amazon Product Search 125 | To change the query parameters for the product search, just change the keywords in the `keyword_list` list in the spider. 126 | 127 | For example: 128 | 129 | ```python 130 | 131 | def start_requests(self): 132 | keyword_list = ['ipad', 'laptops'] 133 | for keyword in keyword_list: 134 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page=1' 135 | yield scrapy.Request(url=amazon_search_url, callback=self.parse_search_results, meta={'keyword': keyword, 'page': 1}) 136 | 137 | ``` 138 | 139 | ### Extract More/Different Data 140 | Amazon product pages contain a lot of useful data; however, this spider is configured to only parse some of it.
141 | 142 | You can expand or change the data that gets extracted by changing the yield statements: 143 | 144 | ```python 145 | 146 | def parse_product_data(self, response): 147 | image_data = json.loads(re.findall(r"colorImages':.*'initial':\s*(\[.+?\])},\n", response.text)[0]) 148 | variant_data = re.findall(r'dimensionValuesDisplayData"\s*:\s* ({.+?}),\n', response.text) 149 | feature_bullets = [bullet.strip() for bullet in response.css("#feature-bullets li ::text").getall()] 150 | price = response.css('.a-price span[aria-hidden="true"] ::text').get("") 151 | if not price: 152 | price = response.css('.a-price .a-offscreen ::text').get("") 153 | yield { 154 | "name": response.css("#productTitle::text").get("").strip(), 155 | "price": price, 156 | "stars": response.css("i[data-hook=average-star-rating] ::text").get("").strip(), 157 | "rating_count": response.css("div[data-hook=total-review-count] ::text").get("").strip(), 158 | "feature_bullets": feature_bullets, 159 | "images": image_data, 160 | "variant_data": variant_data, 161 | } 162 | 163 | ``` 164 | 165 | ### Speeding Up The Crawl 166 | The spiders are set to only use 1 concurrent thread in the `settings.py` file, as the ScrapeOps Free Proxy Plan only gives you 1 concurrent thread. 167 | 168 | However, if you upgrade to a paid ScrapeOps Proxy plan you will have more concurrent threads. Then you can increase the concurrency limit in your scraper by updating the `CONCURRENT_REQUESTS` value in your ``settings.py`` file. 169 | 170 | ```python 171 | # settings.py 172 | 173 | CONCURRENT_REQUESTS = 10 174 | 175 | ``` 176 | 177 | ### Storing Data 178 | The spiders are set to save the scraped data into a CSV file and store it in a `data` folder using [Scrapy's Feed Export functionality](https://docs.scrapy.org/en/latest/topics/feed-exports.html).
179 | 180 | ```python 181 | 182 | custom_settings = { 183 | 'FEEDS': { 'data/%(name)s_%(time)s.csv': { 'format': 'csv',}} 184 | } 185 | 186 | ``` 187 | 188 | If you would like to save your CSV files to an AWS S3 bucket then check out our [Saving CSV/JSON Files to Amazon AWS S3 Bucket guide here](https://scrapeops.io//python-scrapy-playbook/scrapy-save-aws-s3). 189 | 190 | Or if you would like to save your data to another type of database then be sure to check out these guides: 191 | 192 | - [Saving Data to JSON](https://scrapeops.io/python-scrapy-playbook/scrapy-save-json-files) 193 | - [Saving Data to SQLite Database](https://scrapeops.io/python-scrapy-playbook/scrapy-save-data-sqlite) 194 | - [Saving Data to MySQL Database](https://scrapeops.io/python-scrapy-playbook/scrapy-save-data-mysql) 195 | - [Saving Data to Postgres Database](https://scrapeops.io/python-scrapy-playbook/scrapy-save-data-postgres) 196 | 197 | ### Deactivating ScrapeOps Proxy & Monitor 198 | To deactivate the ScrapeOps Proxy & Monitor simply comment out the following code in your `settings.py` file: 199 | 200 | ```python 201 | # settings.py 202 | 203 | # SCRAPEOPS_API_KEY = 'YOUR_API_KEY' 204 | 205 | # SCRAPEOPS_PROXY_ENABLED = True 206 | 207 | # EXTENSIONS = { 208 | # 'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500, 209 | # } 210 | 211 | # DOWNLOADER_MIDDLEWARES = { 212 | 213 | # ## ScrapeOps Monitor 214 | # 'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550, 215 | # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, 216 | 217 | # ## Proxy Middleware 218 | # 'scrapeops_scrapy_proxy_sdk.scrapeops_scrapy_proxy_sdk.ScrapeOpsScrapyProxySdk': 725, 219 | # } 220 | 221 | 222 | 223 | ``` 224 | 225 | -------------------------------------------------------------------------------- /amazon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/python-scrapy-playbook/amazon-python-scrapy-scraper/fdbd825faf979513019071cb5b5e021c8af6fdec/amazon/__init__.py -------------------------------------------------------------------------------- /amazon/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class AmazonItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /amazon/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class AmazonSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders.
20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AmazonDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class AmazonPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /amazon/settings.py: -------------------------------------------------------------------------------- 1 | 2 | BOT_NAME = 'amazon' 3 | 4 | SPIDER_MODULES = ['amazon.spiders'] 5 | NEWSPIDER_MODULE = 'amazon.spiders' 6 | 7 | # Obey robots.txt rules 8 | ROBOTSTXT_OBEY = False 9 | 10 | SCRAPEOPS_API_KEY = 'YOUR_API_KEY' 11 | 12 | SCRAPEOPS_PROXY_ENABLED = True 13 | # SCRAPEOPS_PROXY_SETTINGS = {'country': 'us'} 14 | 15 | # Add In The ScrapeOps Monitoring Extension 16 | EXTENSIONS = { 17 | 'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500, 18 | } 19 | 20 | LOG_LEVEL = 'INFO' 21 | 22 | DOWNLOADER_MIDDLEWARES = { 23 | 24 | ## ScrapeOps Monitor 25 | 'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550, 26 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, 27 | 28 | ## Proxy Middleware 29 | 'scrapeops_scrapy_proxy_sdk.scrapeops_scrapy_proxy_sdk.ScrapeOpsScrapyProxySdk': 725, 30 | } 31 | 32 | # Max Concurrency On ScrapeOps Proxy Free Plan is 1 thread 33 | CONCURRENT_REQUESTS = 1 34 | -------------------------------------------------------------------------------- /amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /amazon/spiders/amazon_reviews.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from urllib.parse import urljoin 3 | 4 | class AmazonReviewsSpider(scrapy.Spider): 5 | name = "amazon_reviews" 6 | 7 | custom_settings = { 8 | 'FEEDS': { 'data/%(name)s_%(time)s.csv': { 'format': 'csv',}} 9 | } 10 | 11 | def start_requests(self): 12 | asin_list = ['B09G9FPHY6'] 13 | for asin in asin_list: 14 | amazon_reviews_url = f'https://www.amazon.com/product-reviews/{asin}/' 15 | yield scrapy.Request(url=amazon_reviews_url, callback=self.parse_reviews, meta={'asin': asin, 'retry_count': 0}) 16 | 17 | 18 | def parse_reviews(self, response): 19 | asin = response.meta['asin'] 20 | retry_count = response.meta['retry_count'] 21 | 22 | next_page_relative_url = response.css(".a-pagination .a-last>a::attr(href)").get() 23 | if next_page_relative_url is not None: 24 | retry_count = 0 25 | next_page = urljoin('https://www.amazon.com/', next_page_relative_url) 26 | yield scrapy.Request(url=next_page, callback=self.parse_reviews, meta={'asin': asin, 'retry_count': retry_count}) 27 | 28 | ## Adding this retry_count here so we retry any amazon js rendered review pages 29 | elif retry_count < 3: 30 | retry_count = retry_count+1 31 | yield scrapy.Request(url=response.url, callback=self.parse_reviews, dont_filter=True, meta={'asin': asin, 'retry_count': retry_count}) 32 | 33 | 34 | ## Parse Product Reviews 35 | review_elements = response.css("#cm_cr-review_list div.review") 36 | for review_element in review_elements: 37 | yield { 38 | "asin": asin, 39 | "text": "".join(review_element.css("span[data-hook=review-body] ::text").getall()).strip(), 40 | "title": review_element.css("*[data-hook=review-title]>span::text").get(), 41 | "location_and_date": review_element.css("span[data-hook=review-date] ::text").get(), 42 | "verified": bool(review_element.css("span[data-hook=avp-badge] ::text").get()), 43 | "rating": review_element.css("*[data-hook*=review-star-rating] ::text").re(r"(\d+\.*\d*) out")[0], 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /amazon/spiders/amazon_search.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from urllib.parse import urljoin 3 | 4 | class AmazonSearchSpider(scrapy.Spider): 5 | name = "amazon_search" 6 | 7 | custom_settings = { 8 | 'FEEDS': { 'data/%(name)s_%(time)s.csv': { 'format': 'csv',}} 9 | } 10 | 11 | def start_requests(self): 12 | keyword_list = ['ipad'] 13 | for keyword in keyword_list: 14 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page=1' 15 | yield scrapy.Request(url=amazon_search_url, callback=self.parse_search_results, meta={'keyword': keyword, 'page': 1}) 16 | 17 | def parse_search_results(self, response): 18 | page = response.meta['page'] 19 | keyword = response.meta['keyword'] 20 | 21 | ## Extract Overview Product Data 22 | search_products = response.css("div.s-result-item[data-component-type=s-search-result]") 23 | for product in search_products: 24 | relative_url = product.css("h2>a::attr(href)").get() 25 | asin = relative_url.split('/')[3] if len(relative_url.split('/')) >= 4 else None 26 | product_url = urljoin('https://www.amazon.com/', relative_url).split("?")[0] 27 | yield { 28 | "keyword": keyword, 29 | "asin": asin, 30 | "url": product_url, 31 | "ad": True if "/slredirect/" in product_url else False, 32 
| "title": product.css("h2>a>span::text").get(), 33 | "price": product.css(".a-price[data-a-size=xl] .a-offscreen::text").get(), 34 | "real_price": product.css(".a-price[data-a-size=b] .a-offscreen::text").get(), 35 | "rating": (product.css("span[aria-label~=stars]::attr(aria-label)").re(r"(\d+\.*\d*) out") or [None])[0], 36 | "rating_count": product.css("span[aria-label~=stars] + span::attr(aria-label)").get(), 37 | "thumbnail_url": product.xpath("//img[has-class('s-image')]/@src").get(), 38 | } 39 | 40 | 41 | ## Get All Pages 42 | if page == 1: 43 | available_pages = response.xpath( 44 | '//*[contains(@class, "s-pagination-item")][not(has-class("s-pagination-separator"))]/text()' 45 | ).getall() 46 | 47 | last_page = available_pages[-1] 48 | for page_num in range(2, int(last_page)): 49 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page={page_num}' 50 | yield scrapy.Request(url=amazon_search_url, callback=self.parse_search_results, meta={'keyword': keyword, 'page': page_num}) 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /amazon/spiders/amazon_search_product.py: -------------------------------------------------------------------------------- 1 | import json 2 | import scrapy 3 | from urllib.parse import urljoin 4 | import re 5 | 6 | class AmazonSearchProductSpider(scrapy.Spider): 7 | name = "amazon_search_product" 8 | 9 | custom_settings = { 10 | 'FEEDS': { 'data/%(name)s_%(time)s.csv': { 'format': 'csv',}} 11 | } 12 | 13 | def start_requests(self): 14 | keyword_list = ['ipad'] 15 | for keyword in keyword_list: 16 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page=1' 17 | yield scrapy.Request(url=amazon_search_url, callback=self.discover_product_urls, meta={'keyword': keyword, 'page': 1}) 18 | 19 | def discover_product_urls(self, response): 20 | page = response.meta['page'] 21 | keyword = response.meta['keyword'] 22 | 23 | ## Discover Product URLs 24 | search_products = response.css("div.s-result-item[data-component-type=s-search-result]") 25 | for product in search_products: 26 | relative_url = product.css("h2>a::attr(href)").get() 27 | product_url = urljoin('https://www.amazon.com/', relative_url).split("?")[0] 28 | yield scrapy.Request(url=product_url, callback=self.parse_product_data, meta={'keyword': keyword, 'page': page}) 29 | 30 | ## Get All Pages 31 | if page == 1: 32 | available_pages = response.xpath( 33 | '//*[contains(@class, "s-pagination-item")][not(has-class("s-pagination-separator"))]/text()' 34 | ).getall() 35 | 36 | last_page = available_pages[-1] 37 | for page_num in range(2, int(last_page)): 38 | amazon_search_url = f'https://www.amazon.com/s?k={keyword}&page={page_num}' 39 | yield scrapy.Request(url=amazon_search_url, callback=self.discover_product_urls, meta={'keyword': keyword, 'page': page_num}) 40 | 41 | 42 | def parse_product_data(self, response): 43 | image_data = json.loads(re.findall(r"colorImages':.*'initial':\s*(\[.+?\])},\n", response.text)[0]) 44 | variant_data = re.findall(r'dimensionValuesDisplayData"\s*:\s* ({.+?}),\n', response.text) 45 | feature_bullets = [bullet.strip() for bullet in response.css("#feature-bullets li ::text").getall()] 46 | price = response.css('.a-price span[aria-hidden="true"] ::text').get("") 47 | if not price: 48 | price = response.css('.a-price .a-offscreen ::text').get("") 49 | yield { 50 | "name": response.css("#productTitle::text").get("").strip(), 51 | "price": price, 52 | "stars": response.css("i[data-hook=average-star-rating] 
::text").get("").strip(), 53 | "rating_count": response.css("div[data-hook=total-review-count] ::text").get("").strip(), 54 | "feature_bullets": feature_bullets, 55 | "images": image_data, 56 | "variant_data": variant_data, 57 | } -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | --------------------------------------------------------------------------------