├── scripts
    ├── __init__.py
    └── populate_redis_queue.py
├── scrapy_project
    ├── __init__.py
    ├── utils
    │   ├── __init__.py
    │   ├── db_utils.py
    │   ├── db_migration.py
    │   └── utils.py
    ├── spiders
    │   ├── __init__.py
    │   ├── base_spider
    │   │   ├── __init__.py
    │   │   └── product_page_spider.py
    │   └── google_search_scraper.py
    ├── pipelines
    │   ├── base_pipeline
    │   │   ├── __init__.py
    │   │   └── base_pipelines.py
    │   ├── __init__.py
    │   └── ecommerce_pipelines.py
    ├── page_objects
    │   ├── base_page_objects
    │   │   ├── __init__.py
    │   │   └── product_page.py
    │   ├── website_specific_page_objects
    │   │   ├── __init__.py
    │   │   └── amazon.py
    │   └── __init__.py
    ├── scrapy_poet_overrides.py
    ├── items.py
    ├── models.py
    └── settings.py
├── ruff.toml
├── scrapinghub.yml
├── requirements.txt
├── .gitignore
├── scrapy.cfg
├── setup.py
├── README.md
└── LICENSE

/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/spiders/base_spider/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/pipelines/base_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | # Set the maximum line length to 79.
2 | line-length = 79 3 | -------------------------------------------------------------------------------- /scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 000000 2 | stack: scrapy:2.11 3 | requirements: 4 | file: requirements.txt -------------------------------------------------------------------------------- /scrapy_project/page_objects/base_page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .product_page import ProductPage 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WebScrapingSolutions/scrapy-project-template/HEAD/requirements.txt -------------------------------------------------------------------------------- /scrapy_project/page_objects/website_specific_page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .amazon import AmazonProductPage 2 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_page_objects import * 2 | from .website_specific_page_objects import * 3 | -------------------------------------------------------------------------------- /scrapy_project/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from scrapy_project.pipelines.base_pipeline.base_pipelines import ( 2 | BaseDBPipeline, 3 | ) 4 | from scrapy_project.pipelines.ecommerce_pipelines import * 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /venv 2 | /.idea 3 | /.gitignore 4 | /.scrapy/httpcache/beauty_category/ 5 | /build/lib/ecommerce/ 6 | /project.egg-info/ 7 | *.json 8 | /.scrapy/ 9 | **/__pycache__/ 10 | /build/ 11 | /.ruff_cache/ 12 | -------------------------------------------------------------------------------- /scrapy_project/scrapy_poet_overrides.py: -------------------------------------------------------------------------------- 1 | from web_poet import ApplyRule 2 | from .page_objects import * 3 | 4 | 5 | _SCRAPY_POET_OVERRIDES = [ 6 | ApplyRule("amazon.com", use=AmazonProductPage, instead_of=ProductPage), 7 | ] 8 | -------------------------------------------------------------------------------- /scrapy_project/utils/db_utils.py: -------------------------------------------------------------------------------- 1 | from playhouse.shortcuts import dict_to_model 2 | 3 | 4 | def list_of_dicts_to_model(model, dicts): 5 | models = [] 6 | for d in dicts: 7 | models.append(dict_to_model(model, d)) 8 | return models 9 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_project.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_project 12 | -------------------------------------------------------------------------------- /setup.py: 
--------------------------------------------------------------------------------
1 | # Automatically created by: shub deploy
2 | 
3 | from setuptools import find_packages, setup
4 | 
5 | setup(
6 |     name="project",
7 |     packages=find_packages(),
8 |     entry_points={"scrapy": ["settings = scrapy_project.settings"]},
9 |     scripts=[
10 |         "scripts/populate_redis_queue.py",
11 |     ],
12 | )
13 | 
--------------------------------------------------------------------------------
/scrapy_project/pipelines/ecommerce_pipelines.py:
--------------------------------------------------------------------------------
1 | from scrapy_project.models import (
2 |     ProductItemModel,
3 | )
4 | from scrapy_project.pipelines import BaseDBPipeline
5 | 
6 | 
7 | class EcommercePricesDBPipeline(BaseDBPipeline):
8 | 
9 |     max_items = 1000
10 | 
11 |     def insert_to_db(self, items):
12 |         ProductItemModel.insert_many(items).execute()
13 | 
--------------------------------------------------------------------------------
/scripts/populate_redis_queue.py:
--------------------------------------------------------------------------------
1 | import json
2 | import redis
3 | 
4 | from scrapy_project.models import CustomDatabaseProxy, ProductItemModel
5 | from scrapy_project.settings import DATABASE_URI, REDIS_URL
6 | 
7 | 
8 | if __name__ == "__main__":
9 |     # connect to the database
10 |     CustomDatabaseProxy(db_uri=DATABASE_URI)
11 |     # select the products whose URLs should be queued for crawling
12 |     rows_to_insert = ProductItemModel.select().where((ProductItemModel.status == "NEW"))
13 |     # serialize each URL as a JSON string for the redis queue
14 |     json_urls = [json.dumps({"url": model.url}) for model in rows_to_insert]
15 |     # reset and repopulate the redis start_urls queue
16 |     with redis.from_url(url=REDIS_URL) as redis_connect:
17 |         redis_connect.delete("redis-product-spider:start_urls")
18 |         redis_connect.lpush("redis-product-spider:start_urls", *json_urls)
19 | 
--------------------------------------------------------------------------------
/scrapy_project/utils/db_migration.py:
--------------------------------------------------------------------------------
1 | from playhouse.migrate import *
2 | 
3 | from scrapy_project.utils.utils import CustomDatabaseProxy
4 | 
5 | 
6 | def get_migrator():
7 |     # see https://docs.peewee-orm.com/en/latest/peewee/playhouse.html#schema-migrations for reference
8 |     db_uri = ""
9 |     db_handle = CustomDatabaseProxy(db_uri=db_uri)
10 |     migrator = PostgresqlMigrator(db_handle)
11 |     return migrator
12 | 
13 | 
14 | def add_fields_to_db_schema():
15 |     migrator = get_migrator()
16 | 
17 |     # new example fields
18 |     title_field = CharField(default="")
19 |     status_field = IntegerField(null=True)
20 | 
21 |     # actual migration
22 |     migrate(
23 |         migrator.add_column("some_table", "title", title_field),
24 |         migrator.add_column("some_table", "status", status_field),
25 |         migrator.drop_column("some_table", "old_column"),
26 |     )
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     add_fields_to_db_schema()
31 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Project Name: Scrapy Project Template
2 | 
3 | ### Description:
4 | This is an example Scrapy project template that can be used to quickly spin up a new web scraping project with the most commonly needed features already wired in.
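As a quick sketch of the core workflow (the `ExampleStorePage` class and `example.com` domain below are hypothetical placeholders, not files shipped with the template), adding support for a new site typically means subclassing the base `ProductPage` page object and registering an override rule next to the existing Amazon one:

```python
# Hypothetical: scrapy_project/page_objects/website_specific_page_objects/example_store.py
from scrapy_project.page_objects.base_page_objects import ProductPage


class ExampleStorePage(ProductPage):
    """Sketch of a site-specific page object for a made-up store."""

    def get_vendor(self):
        return "example.com"

    def get_name(self):
        # web_poet.WebPage exposes .xpath()/.css() shortcuts on the response
        return self.xpath("//h1[@id='product-title']/text()").get()


# Hypothetical addition to scrapy_project/scrapy_poet_overrides.py:
# ApplyRule("example.com", use=ExampleStorePage, instead_of=ProductPage)
```

With such a rule registered, the shared `base_product_spider` picks up the new page object through scrapy-poet, and `EcommercePricesDBPipeline` stores the resulting items without any spider-level changes.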
5 | 6 | ### Features: 7 | - [scrapy-poet](https://github.com/scrapinghub/scrapy-poet) integration - a better way to organize Scrapy projects 8 | - [proxy middleware](https://github.com/aivarsk/scrapy-proxies) - a simple way to start using proxies with your project 9 | - [database integration](https://github.com/coleifer/peewee) with peewee orm - a simple way to store web scraping output in a database 10 | - [ruff linter and code formatter](https://github.com/astral-sh/ruff) 11 | - [scrapy-redis](https://github.com/rmax/scrapy-redis) integration - a simple way to allow distributed spider runs 12 | - example spiders: google search and amazon.com 13 | 14 | ### Usage 15 | To use this example Scrapy project template, simply fork the repository and start writing your own code. 16 | Customize the spiders, pipelines, and items to suit your specific web scraping project needs. 17 | 18 | ### License: MIT 19 | -------------------------------------------------------------------------------- /scrapy_project/utils/utils.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse, urlunparse 2 | 3 | from peewee import DatabaseProxy 4 | from playhouse.db_url import connect 5 | from playhouse.postgres_ext import PostgresqlExtDatabase 6 | 7 | 8 | class SingletonMeta(type): 9 | _instance = None 10 | 11 | def __call__(cls, *args, **kwargs): 12 | if not cls._instance: 13 | cls._instance = super().__call__(*args, **kwargs) 14 | if ( 15 | not isinstance(cls._instance.obj, PostgresqlExtDatabase) 16 | and "db_uri" in kwargs 17 | ): 18 | db_uri = kwargs["db_uri"] 19 | parsed = urlparse(db_uri) 20 | if parsed.scheme == "postgres": 21 | parsed = parsed._replace(scheme="postgresext") 22 | db_uri = urlunparse(parsed) 23 | conn = connect(db_uri, unquote_password=True) 24 | cls._instance.initialize(conn) 25 | return cls._instance 26 | 27 | 28 | class CustomDatabaseProxy(DatabaseProxy, metaclass=SingletonMeta): 29 | def __init__(self, db_uri=None): 30 | super().__init__() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024, Web Scraping Solutions 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /scrapy_project/spiders/base_spider/product_page_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy_poet import callback_for 2 | 3 | from scrapy_project.page_objects.base_page_objects.product_page import ( 4 | ProductPage, 5 | ) 6 | from scrapy_redis.spiders import RedisSpider 7 | 8 | 9 | class ProductSpider(RedisSpider): 10 | """ 11 | input: redis queue 12 | output: product items 13 | """ 14 | 15 | name = "base_product_spider" 16 | 17 | custom_settings = { 18 | "SCHEDULER": "scrapy_redis.scheduler.Scheduler", 19 | "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter", 20 | "CONCURRENT_REQUESTS": 6, 21 | "CONCURRENT_REQUESTS_PER_DOMAIN": 6, 22 | "DOWNLOAD_DELAY": 0, 23 | "DOWNLOAD_TIMEOUT": 120, 24 | "CLOSESPIDER_ITEMCOUNT": 15000, 25 | "CLOSESPIDER_TIMEOUT": 3600 * 3, 26 | "RETRY_HTTP_CODES": [], 27 | "HTTPERROR_ALLOWED_CODES": [404], 28 | "DOWNLOADER_MIDDLEWARES": { 29 | "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90, 30 | 'scrapy_proxies.RandomProxy': 100, 31 | "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110, 32 | "scrapy_poet.InjectionMiddleware": 543, 33 | }, 34 | "ITEM_PIPELINES": { 35 | "scrapy_project.pipelines.EcommercePricesDBPipeline": 100, 36 | }, 37 | } 38 | 39 | redis_batch_size = 50 40 | redis_key = "redis-product-spider:start_urls" 41 | 42 | parse = callback_for(ProductPage) 43 | -------------------------------------------------------------------------------- /scrapy_project/items.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ProductItem(scrapy.Item): 5 | vendor = scrapy.Field() 6 | status = scrapy.Field() 7 | created = scrapy.Field() 8 | updated = scrapy.Field() 9 | 10 | # General Product Information 11 | availability = scrapy.Field() 12 | color = scrapy.Field() 13 | currency = scrapy.Field() 14 | currencyRaw = scrapy.Field() 15 | productId = scrapy.Field() 16 | 17 | # Product Identification Numbers (GTIN) 18 | gtin = scrapy.Field() 19 | 20 | # Product Images 21 | images = scrapy.Field() 22 | mainImage = scrapy.Field() 23 | 24 | # Product Information 25 | mpn = scrapy.Field() 26 | name = scrapy.Field() 27 | available_quantity = scrapy.Field() 28 | price = scrapy.Field() 29 | regularPrice = scrapy.Field() 30 | size = scrapy.Field() 31 | sku = scrapy.Field() 32 | style = scrapy.Field() 33 | 34 | # Additional product properties 35 | additionalProperties = scrapy.Field() 36 | 37 | # Product URLs 38 | url = scrapy.Field() 39 | canonicalUrl = scrapy.Field() 40 | 41 | # Product Rating 42 | aggregateRating = scrapy.Field() 43 | 44 | # Product Brand Information 45 | brand = scrapy.Field() 46 | 47 | # Breadcrumbs (navigation path) 48 | breadcrumbs = scrapy.Field() 49 | 50 | # Product Features 51 | features = scrapy.Field() 52 | 53 | # Product Description 54 | description = scrapy.Field() 55 | descriptionHtml = scrapy.Field() 56 | 57 | # Product Options 58 | variants = scrapy.Field() 59 | 60 | # Additional metadata 61 | metadata = scrapy.Field() 62 | -------------------------------------------------------------------------------- /scrapy_project/pipelines/base_pipeline/base_pipelines.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | from itemadapter import ItemAdapter 5 | 6 | from scrapy_project.models import connect_to_db 7 | 8 | 9 | class BaseDBPipeline: 10 | 
max_items = 10000 11 | items = [] 12 | total = 0 13 | 14 | def __init__(self, db_uri): 15 | self.db_handle = connect_to_db(db_uri) 16 | 17 | @classmethod 18 | def from_settings(cls, settings): 19 | db_uri = settings.get("DATABASE_URI") 20 | params = { 21 | "db_uri": db_uri, 22 | } 23 | return cls(**params) 24 | 25 | @classmethod 26 | def from_crawler(cls, crawler): 27 | return cls.from_settings(crawler.settings) 28 | 29 | def process_item(self, item, spider): 30 | item_for_db = self.prepare_item( 31 | copy.deepcopy(item) 32 | ) 33 | item_for_db.pop("parent_url", None) 34 | 35 | self.total += 1 36 | self.items.append(ItemAdapter(item_for_db).asdict()) 37 | 38 | if self.total > self.max_items: 39 | self.flush_data() 40 | return item 41 | 42 | def close_spider(self, spider): 43 | self.flush_data() 44 | 45 | def flush_data(self): 46 | with self.db_handle.atomic() as transaction: 47 | try: 48 | self.insert_to_db(self.items) 49 | except Exception as e: 50 | logging.error(f"Error in PostgreSQLItemPipeline: {e}") 51 | transaction.commit() 52 | 53 | self.items = [] 54 | self.total = 0 55 | return None 56 | 57 | def insert_to_db(self, items): 58 | pass 59 | 60 | def prepare_item(self, item): 61 | return item 62 | -------------------------------------------------------------------------------- /scrapy_project/models.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from peewee import * 4 | from playhouse.postgres_ext import JSONField 5 | 6 | from scrapy_project.utils.utils import CustomDatabaseProxy 7 | 8 | 9 | class BaseModel(Model): 10 | class Meta: 11 | database = CustomDatabaseProxy() 12 | 13 | 14 | class ProductItemModel(BaseModel): 15 | vendor = TextField(null=False) # str 16 | status = TextField(null=False) # str 17 | created = DateTimeField(null=False, default=datetime.datetime.now()) 18 | updated = DateTimeField(null=True) 19 | 20 | # General Product Information 21 | availability = TextField(default="InStock", null=True) # str 22 | color = TextField(null=True) # str 23 | currency = TextField(null=True) # str 24 | currencyRaw = TextField(null=True) # str 25 | productId = TextField(null=True) # str 26 | 27 | # Product Identification Numbers (GTIN) 28 | gtin = JSONField(null=True) # List[Dict[str, str]] 29 | 30 | # Product Images 31 | images = JSONField(null=True) # List[Dict[str, str]] 32 | mainImage = JSONField(null=True) # Dict[str, Any] 33 | 34 | # Product Information 35 | mpn = TextField(null=True) # str Manufacture Product Number 36 | name = TextField(null=True) # str Product Name 37 | available_quantity = TextField( 38 | null=True 39 | ) # str How many products are available to order 40 | price = TextField(null=True) # str Product Price 41 | regularPrice = TextField(null=True) # str Regular product price 42 | size = TextField(null=True) # str Product Size 43 | sku = TextField(null=True) # str Product Article 44 | style = TextField(null=True) # str Product Style 45 | 46 | # Additional product properties 47 | additionalProperties = JSONField(null=True) # List[Dict[str, Any]] 48 | 49 | # Product URLs 50 | url = TextField(null=True) # str 51 | canonicalUrl = TextField(null=True) # str 52 | 53 | # Product Rating 54 | aggregateRating = JSONField(null=True) # Dict[str, Any] 55 | 56 | # Product Brand Information 57 | brand = JSONField(null=True) # Dict[str, Any] 58 | 59 | # Breadcrumbs (navigation path) 60 | breadcrumbs = JSONField(null=True) # List[Dict[str, Any]] 61 | 62 | # Product Features 63 | features = JSONField(null=True) # 
List[Dict[str, Any]] 64 | 65 | # Product Description 66 | description = TextField(null=True) # str 67 | descriptionHtml = TextField(null=True) # str 68 | 69 | # Product Options 70 | variants = JSONField(null=True) # List[Dict[str, Any]] 71 | 72 | # Additional metadata 73 | metadata = JSONField(null=True) # Dict[str, Any] 74 | 75 | class Meta: 76 | db_table = "product_item" 77 | 78 | 79 | def connect_to_db(db_uri): 80 | db_handle = CustomDatabaseProxy(db_uri=db_uri) 81 | db_handle.create_tables([ProductItemModel]) # table creation example 82 | return db_handle 83 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/base_page_objects/product_page.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import web_poet 4 | import attr 5 | 6 | from scrapy_project.items import ProductItem 7 | 8 | 9 | @attr.define 10 | class ProductPage(web_poet.WebPage): 11 | page_params: web_poet.PageParams 12 | http: web_poet.HttpClient 13 | 14 | def to_item(self): 15 | item = ProductItem() 16 | item["vendor"] = self.get_vendor() 17 | item["status"] = self.get_status() 18 | 19 | item["updated"] = self.get_updated() 20 | 21 | # General Product Information 22 | item["availability"] = self.get_availability() 23 | item["color"] = self.get_color() 24 | item["currency"] = self.get_currency() 25 | item["currencyRaw"] = self.get_currencyRaw() 26 | item["productId"] = self.get_productId() 27 | 28 | # Product Identification Numbers (GTIN) 29 | item["gtin"] = self.get_gtin() 30 | 31 | # Product Images 32 | item["images"] = self.get_images() 33 | item["mainImage"] = self.get_mainImage() 34 | 35 | # Product Information 36 | item["mpn"] = self.get_mpn() 37 | item["name"] = self.get_name() 38 | item["available_quantity"] = self.get_available_quantity() 39 | item["price"] = self.get_price() 40 | item["regularPrice"] = self.get_regularPrice() 41 | item["size"] = self.get_size() 42 | item["sku"] = self.get_sku() 43 | item["style"] = self.get_style() 44 | 45 | # Additional product properties 46 | item["additionalProperties"] = self.get_additionalProperties() 47 | 48 | # Product URLs 49 | item["url"] = self.url 50 | item["canonicalUrl"] = self.get_canonicalUrl() 51 | 52 | # Product Rating 53 | item["aggregateRating"] = self.get_aggregateRating() 54 | 55 | # Product Brand Information 56 | item["brand"] = self.get_brand() 57 | 58 | # Breadcrumbs (navigation path) 59 | item["breadcrumbs"] = self.get_breadcrumbs() 60 | 61 | # Product Features 62 | item["features"] = self.get_features() 63 | 64 | # Product Description 65 | item["description"] = self.get_description() 66 | item["descriptionHtml"] = self.get_descriptionHtml() 67 | 68 | # Product Options 69 | item["variants"] = self.get_variants() 70 | 71 | # Additional metadata 72 | item["metadata"] = self.get_metadata() 73 | yield item 74 | 75 | def get_vendor(self): 76 | return None 77 | 78 | def get_status(self): 79 | return "NEW" 80 | 81 | def get_created(self): 82 | return None 83 | 84 | def get_updated(self): 85 | return None 86 | 87 | def get_availability(self): 88 | return None 89 | 90 | def get_color(self): 91 | return None 92 | 93 | def get_currency(self): 94 | return None 95 | 96 | def get_currencyRaw(self): 97 | return None 98 | 99 | def get_productId(self): 100 | return None 101 | 102 | def get_gtin(self): 103 | return None 104 | 105 | def get_images(self): 106 | return None 107 | 108 | def get_mainImage(self): 109 | return None 110 | 111 | def 
get_mpn(self): 112 | return None 113 | 114 | def get_name(self): 115 | return None 116 | 117 | def get_available_quantity(self): 118 | return None 119 | 120 | def get_price(self): 121 | return None 122 | 123 | def get_regularPrice(self): 124 | return None 125 | 126 | def get_size(self): 127 | return None 128 | 129 | def get_sku(self): 130 | return None 131 | 132 | def get_style(self): 133 | return None 134 | 135 | def get_additionalProperties(self): 136 | return None 137 | 138 | def get_canonicalUrl(self): 139 | return None 140 | 141 | def get_aggregateRating(self): 142 | return None 143 | 144 | def get_brand(self): 145 | return None 146 | 147 | def get_breadcrumbs(self): 148 | return None 149 | 150 | def get_features(self): 151 | return None 152 | 153 | def get_description(self): 154 | return None 155 | 156 | def get_descriptionHtml(self): 157 | return None 158 | 159 | def get_variants(self): 160 | return None 161 | 162 | def get_metadata(self): 163 | # Get current UTC time 164 | current_utc_time = datetime.utcnow() 165 | # Format the time in ISO 8601 format 166 | formatted_time = current_utc_time.strftime("%Y-%m-%dT%H:%M:%SZ") 167 | return {"dateDownloaded": formatted_time, "probability": 1} 168 | -------------------------------------------------------------------------------- /scrapy_project/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for scrapy_project project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | import os 10 | 11 | BOT_NAME = "scrapy_project" 12 | 13 | SPIDER_MODULES = ["scrapy_project.spiders"] 14 | NEWSPIDER_MODULE = "scrapy_project.spiders" 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | CONCURRENT_REQUESTS = 3 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 0 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 1 31 | # CONCURRENT_REQUESTS_PER_IP = 1 32 | 33 | # Disable cookies (enabled by default) 34 | COOKIES_ENABLED = True 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | DEFAULT_REQUEST_HEADERS = { 41 | "Upgrade-Insecure-Requests": "1", 42 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", 43 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 44 | } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | SPIDER_MIDDLEWARES = {} 49 | 50 | # Enable or disable downloader middlewares 51 | # 
See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 52 | DOWNLOADER_MIDDLEWARES = { 53 | "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90, 54 | "scrapy_proxies.RandomProxy": 100, 55 | "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110, 56 | "scrapy_poet.InjectionMiddleware": 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | EXTENSIONS = {} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | ITEM_PIPELINES = {} 66 | 67 | # Enable and configure the AutoThrottle extension (disabled by default) 68 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 69 | AUTOTHROTTLE_ENABLED = False 70 | # The initial download delay 71 | AUTOTHROTTLE_START_DELAY = 1 72 | # The maximum download delay to be set in case of high latencies 73 | AUTOTHROTTLE_MAX_DELAY = 0.8 74 | # The average number of requests Scrapy should be sending in parallel to 75 | # each remote server 76 | AUTOTHROTTLE_TARGET_CONCURRENCY = 16 77 | DOWNLOAD_TIMEOUT = 120 78 | # Enable showing throttling stats for every response received: 79 | # AUTOTHROTTLE_DEBUG = False 80 | 81 | # Enable and configure HTTP caching (disabled by default) 82 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 83 | HTTPCACHE_ENABLED = False 84 | HTTPCACHE_EXPIRATION_SECS = 0 85 | HTTPCACHE_DIR = "httpcache" 86 | HTTPCACHE_IGNORE_HTTP_CODES = [429, 500, 503, 504, 400, 403, 404, 408] 87 | HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 88 | 89 | from .scrapy_poet_overrides import _SCRAPY_POET_OVERRIDES 90 | 91 | SCRAPY_POET_OVERRIDES = _SCRAPY_POET_OVERRIDES 92 | 93 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 94 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 95 | # SCHEDULER_PERSIST = True 96 | 97 | REDIS_URL = os.environ.get("REDIS_URL") 98 | DATABASE_URI = os.environ.get("DATABASE_URI") 99 | 100 | # Retry many times since proxies often fail 101 | RETRY_TIMES = 1 102 | # Retry on most error codes since proxies fail for different reasons 103 | RETRY_HTTP_CODES = [429, 500, 503, 504, 400, 403, 404, 408] 104 | 105 | # Proxy mode 106 | # 0 = Every request has different proxy 107 | # 1 = Take only one proxy from the list and assign it to every requests 108 | # 2 = Put a custom proxy to use in the settings 109 | PROXY_MODE = 1 110 | # If proxy mode is 2 uncomment this sentence : 111 | CUSTOM_PROXY = "http://host1:port" 112 | 113 | SCRAPEOPS_API_KEY = os.environ.get("SCRAPEOPS_API_KEY") 114 | PROXY_LIST = [ 115 | f"http://scrapeops.country=us:{SCRAPEOPS_API_KEY}@proxy.scrapeops.io:5353", 116 | ] 117 | 118 | GOOGLE_SHEETS_CREDENTIALS = {} 119 | 120 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 121 | -------------------------------------------------------------------------------- /scrapy_project/spiders/google_search_scraper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC 3 | from typing import Any, Iterable 4 | from urllib import parse 5 | 6 | import scrapy 7 | from scrapy.http import Request, TextResponse 8 | 9 | from scrapy_project.settings import ( 10 | SCRAPEOPS_API_KEY, 11 | ) 12 | 13 | 14 | class GoogleSearchLinksSpider(scrapy.Spider, ABC): 15 | """ 16 | Collect links from google search, no older than 7 days. 17 | Spider gets initial domain links from google sheets by google api. 
18 | 19 | Use spider arguments to start gathering links of exact domain 20 | or provide the spider with args in case it runs on Zyte. 21 | Example: scrapy crawl google_search_links_parser -a domain_link="https://www.example.com" 22 | 23 | see https://scrapeops.io/docs/proxy-aggregator/advanced-functionality/auto-extract/ for reference 24 | """ 25 | 26 | name = "google_search_links_parser" 27 | 28 | custom_settings = { 29 | "DOWNLOADER_MIDDLEWARES": {}, 30 | "ITEM_PIPELINES": {}, 31 | "CONCURRENT_REQUESTS": 5, 32 | "DOWNLOAD_TIMEOUT": 120, 33 | "DOWNLOAD_DELAY": 0, 34 | "RETRY_TIMES": 1, 35 | } 36 | 37 | def start_requests(self) -> Iterable[Request]: 38 | domain_links = ["bbc.com", "cnn.com"] 39 | for domain_link in domain_links: 40 | yield from self.generate_request_to_google_search( 41 | domain=parse.urlparse(domain_link).netloc, 42 | initial_link=domain_link, 43 | ) 44 | 45 | def generate_request_to_google_search( 46 | self, domain: str, initial_link: str 47 | ) -> Iterable[Request]: 48 | url = self.get_google_url(page=0, domain=domain) 49 | yield scrapy.Request( 50 | url=url, 51 | cb_kwargs=dict( 52 | domain=domain, 53 | initial_link=initial_link, 54 | ), 55 | ) 56 | 57 | def parse(self, response: TextResponse, **cb_kwargs: Any) -> Any: 58 | data = response.json().get("data") 59 | if data: 60 | results = data.get("organic_results") or data.get("articles") 61 | logging.info(f"found {len(results)} results") 62 | for result in results: 63 | if not self.is_valid_link( 64 | initial_link=cb_kwargs.get("initial_link"), 65 | link=result.get("link"), 66 | ): 67 | continue 68 | yield { 69 | "url": result.get("link"), 70 | "domain": cb_kwargs.get("domain"), 71 | } 72 | page = self.get_page_number(data) 73 | if page > 0: 74 | url = self.get_google_url( 75 | page=page, domain=cb_kwargs.get("domain") 76 | ) 77 | yield scrapy.Request( 78 | url=url, 79 | cb_kwargs=cb_kwargs, 80 | ) 81 | else: 82 | logging.warning( 83 | "api.scraperapi.com did NOT return relevant results" 84 | ) 85 | 86 | @staticmethod 87 | def get_google_url(page: int, domain: str) -> str: 88 | """ 89 | Find results from specific dates: 90 | as_qdr=x 91 | Swap out x for the following to limit the search to only files first indexed in: 92 | d - the previous 24 hours 93 | w - the previous seven days 94 | m - the previous month 95 | y - past year 96 | mn - the previous n number of months. So m2 would be the previous two, m3 would be three, and so on. 
97 | Does work into double digits 98 | """ 99 | # feel free to edit google url to fit your use case 100 | google_url = ( 101 | f"https://www.google.com/search?q=site:{domain}&" 102 | f"as_qdr=w&tbm=nws&start={page}&num=100" 103 | ) 104 | url = ( 105 | f"https://proxy.scrapeops.io/v1/?api_key={SCRAPEOPS_API_KEY}" 106 | f"&url={parse.quote_plus(google_url)}" 107 | "&auto_extract=google" 108 | ) 109 | return url 110 | 111 | def get_page_number(self, data: dict) -> int: 112 | pagination = data.get("pagination") 113 | pagination_url = pagination.get("load_more_url") 114 | page = 0 115 | if pagination_url: 116 | page = pagination_url.split("start%3D")[1].split( 117 | "%26sa%3DN&autoparse=" 118 | )[0] 119 | else: 120 | if pagination.get("next_page_url"): 121 | page = (int(pagination.get("current_page")) + 1) * 10 122 | return int(page) 123 | 124 | @staticmethod 125 | def is_valid_link(initial_link, link): 126 | return ( 127 | link != initial_link 128 | and parse.urlparse(link).path 129 | and parse.urlparse(link).path != "/" 130 | ) 131 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/website_specific_page_objects/amazon.py: -------------------------------------------------------------------------------- 1 | from scrapy_project.page_objects.base_page_objects import ProductPage 2 | from price_parser import Price 3 | 4 | 5 | class AmazonProductPage(ProductPage): 6 | """ 7 | https://www.amazon.com/High-Protein-Bars-thinkThin-Non-GMO/dp/B00VXQGKRM 8 | """ 9 | 10 | def get_vendor(self): 11 | return "amazon.com" 12 | 13 | def get_availability(self): 14 | product_is_available = self.xpath( 15 | "//div[@id='availability']/span/text()" 16 | ).get() 17 | # Check if any product is available 18 | if product_is_available: 19 | if any( 20 | s in product_is_available.lower().strip() 21 | for s in ["in stock", "usually ships within"] 22 | ): 23 | return "InStock" 24 | else: 25 | return "OutOfStock" 26 | else: 27 | return "OutOfStock" 28 | 29 | def get_color(self): 30 | return None 31 | 32 | def get_currency(self): 33 | price_string = self.get_current_price() 34 | if Price.fromstring(price_string).currency == "$": 35 | return "USD" 36 | else: 37 | return None 38 | 39 | def get_currencyRaw(self): 40 | price_string = self.get_current_price() 41 | return Price.fromstring(price_string).currency 42 | 43 | def get_productId(self): 44 | return self.get_canonicalUrl() 45 | 46 | def get_gtin(self): 47 | return [] 48 | 49 | def get_images(self): 50 | image_selectors = [ 51 | "//div[@id='altImages']//li//img/@src", 52 | "//div[@id='imgTagWrapperId']/img/@src", 53 | ] 54 | 55 | image_urls = next( 56 | ( 57 | urls 58 | for selector in image_selectors 59 | if (urls := self.xpath(selector).getall()) 60 | ), 61 | [], 62 | ) 63 | 64 | image_urls = [ 65 | ".".join(image_url.rsplit(".", 2)[::2]) 66 | for image_url in image_urls 67 | if "gif" not in image_url 68 | ] 69 | 70 | image_urls = list(set(image_urls)) 71 | return image_urls 72 | 73 | def get_mainImage(self): 74 | mainImage_url = self.xpath( 75 | "//div[@id='imgTagWrapperId']/img/@src" 76 | ).get() 77 | return mainImage_url 78 | 79 | def get_mpn(self): 80 | attributes = self.get_additionalProperties() 81 | mpn_column_names = ["Item model number", "Part Number"] 82 | return next( 83 | ( 84 | attributes.get(name) 85 | for name in mpn_column_names 86 | if name in attributes.keys() 87 | ), 88 | None, 89 | ) 90 | 91 | def get_name(self): 92 | return self.xpath("//span[@id='productTitle']/text()").get() 93 | 94 | def 
get_available_quantity(self): 95 | # there is no simple way to collect available quantity on amazon.com 96 | # if you really need this value - create a dedicated spider 97 | return None 98 | 99 | def get_current_price(self): 100 | if self.get_availability() != "InStock": 101 | return None 102 | 103 | selectors = [ 104 | "//div[@class='a-section a-spacing-micro']" 105 | "//span[@class='a-offscreen']/text()" 106 | "//span[contains(text(), 'List Price')]" 107 | "//span[@class='a-offscreen']/text()", 108 | "//span[@id='price_inside_buybox']/text()", 109 | ] 110 | 111 | for xpath in selectors: 112 | price = self.xpath(xpath).get() 113 | if price: 114 | return price 115 | return None 116 | 117 | def get_price(self): 118 | price_string = self.get_current_price() 119 | return Price.fromstring(price_string).amount_text 120 | 121 | def get_regularPrice(self): 122 | regularPrice_string = self.xpath( 123 | "//span[contains(text(),'List Price:')]" 124 | "/span/span[@class='a-offscreen']/text()" 125 | ).get() 126 | return Price.fromstring(regularPrice_string).amount_text 127 | 128 | def get_sku(self): 129 | return self.url.split("/dp/", 1)[1] 130 | 131 | def get_additionalProperties(self): 132 | additionalProperties = {} 133 | 134 | def extract_attributes( 135 | xpath_query, name_query, info_query, name_clean=True 136 | ): 137 | for row in self.xpath(xpath_query): 138 | attribute_name = row.xpath(name_query).get() 139 | if attribute_name: 140 | attribute_name = attribute_name.strip() 141 | if name_clean: 142 | attribute_name = attribute_name.split(":")[0] 143 | attribute_name = self.clean_string(attribute_name) 144 | if ( 145 | attribute_name != "Customer Reviews" 146 | ): # this attribute should be stored in a separate field 147 | attributes_info = row.xpath(info_query).getall() 148 | attributes_info = self.clean_string(attributes_info) 149 | additionalProperties[attribute_name] = attributes_info 150 | 151 | tech_details = self.xpath("//div[@id='prodDetails']//table//tr") 152 | if len(tech_details) == 0: 153 | extract_attributes( 154 | "//div[contains(@id, 'productOverview')]//tr", 155 | "./td[1]/span//text()", 156 | "./td[2]/span[contains(@class,'a-size-base')]//text()", 157 | ) 158 | else: 159 | extract_attributes( 160 | "//div[@id='prodDetails']//table//tr", 161 | "./th/text()", 162 | "./td//text()", 163 | name_clean=False, 164 | ) 165 | 166 | extract_attributes( 167 | "//div[@id='detailBullets_feature_div']" 168 | "/ul/li/span[@class='a-list-item']", 169 | "./span[1]//text()", 170 | "./span[2]/text()", 171 | ) 172 | 173 | return additionalProperties 174 | 175 | def clean_string(self, string): 176 | if isinstance(string, list): 177 | string = " ".join(string) 178 | return ( 179 | string.replace("\n", "") 180 | .replace("\u200e", "") 181 | .replace("\u200f", "") 182 | .strip() 183 | ) 184 | 185 | def get_canonicalUrl(self): 186 | return self.url.split("?")[0] 187 | 188 | def get_aggregateRating(self): 189 | ratingValue = float( 190 | self.xpath( 191 | "//span[contains(@class,'reviewCountTextLinkedHistogram')]" 192 | "//a/span/text()" 193 | ).get() 194 | ) 195 | 196 | number_str = "".join( 197 | filter( 198 | str.isdigit, 199 | self.xpath( 200 | "//a[@id='acrCustomerReviewLink']/span/text()" 201 | ).get(), 202 | ) 203 | ) 204 | reviewCount = int(number_str) 205 | return { 206 | "bestRating": 5, 207 | "ratingValue": ratingValue, 208 | "reviewCount": reviewCount, 209 | } 210 | 211 | def get_brand(self): 212 | additionalProperties = self.get_additionalProperties() 213 | return 
additionalProperties.get("Brand") or additionalProperties.get( 214 | "Manufacturer" 215 | ) 216 | 217 | def get_breadcrumbs(self): 218 | breadcrumb_elements = self.xpath( 219 | "//div[contains(@*, 'breadcrumbs')]//ul/li/span/a" 220 | ) 221 | breadcrumbs = [ 222 | { 223 | "url": element.xpath("./@href").get(), 224 | "name": element.xpath("./text()") 225 | .get() 226 | .strip() 227 | .replace("\n", ""), 228 | } 229 | for element in breadcrumb_elements 230 | ] 231 | return breadcrumbs 232 | 233 | def get_features(self): 234 | return [] 235 | 236 | def get_description(self): 237 | product_description = self.xpath( 238 | "//h3[contains(span, 'Product Description')]" 239 | "//following::p[1]/span/text()" 240 | ).getall() 241 | if not product_description: 242 | product_description = self.xpath( 243 | "//div[@id='productDescription']/p/span/text()" 244 | ).getall() 245 | 246 | return product_description 247 | 248 | def get_descriptionHtml(self): 249 | product_description_html = self.xpath( 250 | "//h3[contains(span, 'Product Description')]" 251 | "//following::p[1]/span" 252 | ).getall() 253 | if not product_description_html: 254 | product_description_html = self.xpath( 255 | "//div[@id='productDescription']/p/span" 256 | ).getall() 257 | return product_description_html 258 | 259 | def get_variants(self): 260 | return [] 261 | --------------------------------------------------------------------------------
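End-to-end smoke test (a minimal sketch, not part of the repository): assuming the requirements are installed and the `REDIS_URL` and `DATABASE_URI` environment variables point at reachable Redis and PostgreSQL instances, one product URL can be pushed onto the queue that `base_product_spider` consumes. The Amazon URL is the one referenced in the `AmazonProductPage` docstring and is used purely as an illustration.

```python
# Sketch: seed the scrapy-redis queue with a single product URL.
import json

import redis

from scrapy_project.settings import REDIS_URL

QUEUE_KEY = "redis-product-spider:start_urls"  # must match ProductSpider.redis_key

if __name__ == "__main__":
    with redis.from_url(url=REDIS_URL) as redis_connect:
        # same {"url": ...} JSON payload that scripts/populate_redis_queue.py pushes
        payload = json.dumps(
            {
                "url": "https://www.amazon.com/High-Protein-Bars-thinkThin-Non-GMO/dp/B00VXQGKRM"
            }
        )
        redis_connect.lpush(QUEUE_KEY, payload)
    # Then, from the project root:
    #   scrapy crawl base_product_spider
```

Because the URL matches the `amazon.com` `ApplyRule` pattern in `scrapy_poet_overrides.py`, scrapy-poet injects `AmazonProductPage` in place of the base `ProductPage`, and the scraped item ends up in the `product_item` table via `EcommercePricesDBPipeline`.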