├── scripts
    ├── __init__.py
    └── populate_redis_queue.py
├── scrapy_project
    ├── __init__.py
    ├── utils
    │   ├── __init__.py
    │   ├── db_utils.py
    │   ├── db_migration.py
    │   └── utils.py
    ├── spiders
    │   ├── __init__.py
    │   ├── base_spider
    │   │   ├── __init__.py
    │   │   └── product_page_spider.py
    │   └── google_search_scraper.py
    ├── pipelines
    │   ├── base_pipeline
    │   │   ├── __init__.py
    │   │   └── base_pipelines.py
    │   ├── __init__.py
    │   └── ecommerce_pipelines.py
    ├── page_objects
    │   ├── base_page_objects
    │   │   ├── __init__.py
    │   │   └── product_page.py
    │   ├── website_specific_page_objects
    │   │   ├── __init__.py
    │   │   └── amazon.py
    │   └── __init__.py
    ├── scrapy_poet_overrides.py
    ├── items.py
    ├── models.py
    └── settings.py
├── ruff.toml
├── scrapinghub.yml
├── requirements.txt
├── .gitignore
├── scrapy.cfg
├── setup.py
├── README.md
└── LICENSE

/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/spiders/base_spider/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scrapy_project/pipelines/base_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | # Set the maximum line length to 79.
2 | line-length = 79 3 | -------------------------------------------------------------------------------- /scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 000000 2 | stack: scrapy:2.11 3 | requirements: 4 | file: requirements.txt -------------------------------------------------------------------------------- /scrapy_project/page_objects/base_page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .product_page import ProductPage 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WebScrapingSolutions/scrapy-project-template/HEAD/requirements.txt -------------------------------------------------------------------------------- /scrapy_project/page_objects/website_specific_page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .amazon import AmazonProductPage 2 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_page_objects import * 2 | from .website_specific_page_objects import * 3 | -------------------------------------------------------------------------------- /scrapy_project/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from scrapy_project.pipelines.base_pipeline.base_pipelines import ( 2 | BaseDBPipeline, 3 | ) 4 | from scrapy_project.pipelines.ecommerce_pipelines import * 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /venv 2 | /.idea 3 | /.gitignore 4 | /.scrapy/httpcache/beauty_category/ 5 | /build/lib/ecommerce/ 6 | /project.egg-info/ 7 | *.json 8 | /.scrapy/ 9 | **/__pycache__/ 10 | /build/ 11 | /.ruff_cache/ 12 | -------------------------------------------------------------------------------- /scrapy_project/scrapy_poet_overrides.py: -------------------------------------------------------------------------------- 1 | from web_poet import ApplyRule 2 | from .page_objects import * 3 | 4 | 5 | _SCRAPY_POET_OVERRIDES = [ 6 | ApplyRule("amazon.com", use=AmazonProductPage, instead_of=ProductPage), 7 | ] 8 | -------------------------------------------------------------------------------- /scrapy_project/utils/db_utils.py: -------------------------------------------------------------------------------- 1 | from playhouse.shortcuts import dict_to_model 2 | 3 | 4 | def list_of_dicts_to_model(model, dicts): 5 | models = [] 6 | for d in dicts: 7 | models.append(dict_to_model(model, d)) 8 | return models 9 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_project.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_project 12 | -------------------------------------------------------------------------------- /setup.py: 
--------------------------------------------------------------------------------
1 | # Automatically created by: shub deploy
2 | 
3 | from setuptools import find_packages, setup
4 | 
5 | setup(
6 |     name="project",
7 |     packages=find_packages(),
8 |     entry_points={"scrapy": ["settings = scrapy_project.settings"]},
9 |     scripts=[
10 |         "scripts/populate_redis_queue.py",
11 |     ],
12 | )
13 | 
--------------------------------------------------------------------------------
/scrapy_project/pipelines/ecommerce_pipelines.py:
--------------------------------------------------------------------------------
1 | from scrapy_project.models import (
2 |     ProductItemModel,
3 | )
4 | from scrapy_project.pipelines import BaseDBPipeline
5 | 
6 | 
7 | class EcommercePricesDBPipeline(BaseDBPipeline):
8 | 
9 |     max_items = 1000
10 | 
11 |     def insert_to_db(self, items):
12 |         ProductItemModel.insert_many(items).execute()
13 | 
--------------------------------------------------------------------------------
/scripts/populate_redis_queue.py:
--------------------------------------------------------------------------------
1 | import json
2 | import redis
3 | 
4 | from scrapy_project.models import CustomDatabaseProxy, ProductItemModel
5 | from scrapy_project.settings import DATABASE_URI, REDIS_URL
6 | 
7 | 
8 | if __name__ == "__main__":
9 |     # connect to the database
10 |     CustomDatabaseProxy(db_uri=DATABASE_URI)
11 |     # select the products whose URLs should be queued for crawling
12 |     rows_to_insert = ProductItemModel.select().where((ProductItemModel.status == "NEW"))
13 |     # serialize each URL as a JSON string for the redis queue
14 |     json_urls = [json.dumps({"url": model.url}) for model in rows_to_insert]
15 |     # reset and repopulate the redis start_urls queue
16 |     with redis.from_url(url=REDIS_URL) as redis_connect:
17 |         redis_connect.delete("redis-product-spider:start_urls")
18 |         redis_connect.lpush("redis-product-spider:start_urls", *json_urls)
19 | 
--------------------------------------------------------------------------------
/scrapy_project/utils/db_migration.py:
--------------------------------------------------------------------------------
1 | from playhouse.migrate import *
2 | 
3 | from scrapy_project.utils.utils import CustomDatabaseProxy
4 | 
5 | 
6 | def get_migrator():
7 |     # see https://docs.peewee-orm.com/en/latest/peewee/playhouse.html#schema-migrations for reference
8 |     db_uri = ""
9 |     db_handle = CustomDatabaseProxy(db_uri=db_uri)
10 |     migrator = PostgresqlMigrator(db_handle)
11 |     return migrator
12 | 
13 | 
14 | def add_fields_to_db_schema():
15 |     migrator = get_migrator()
16 | 
17 |     # new example fields
18 |     title_field = CharField(default="")
19 |     status_field = IntegerField(null=True)
20 | 
21 |     # actual migration
22 |     migrate(
23 |         migrator.add_column("some_table", "title", title_field),
24 |         migrator.add_column("some_table", "status", status_field),
25 |         migrator.drop_column("some_table", "old_column"),
26 |     )
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     add_fields_to_db_schema()
31 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Project Name: Scrapy Project Template
2 | 
3 | ### Description:
4 | This is an example Scrapy project template that can be used to quickly spin up a new web scraping project with the most commonly needed features already wired in.
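As a quick sketch of the core workflow (the `ExampleStorePage` class and `example.com` domain below are hypothetical placeholders, not files shipped with the template), adding support for a new site typically means subclassing the base `ProductPage` page object and registering an override rule next to the existing Amazon one:

```python
# Hypothetical: scrapy_project/page_objects/website_specific_page_objects/example_store.py
from scrapy_project.page_objects.base_page_objects import ProductPage


class ExampleStorePage(ProductPage):
    """Sketch of a site-specific page object for a made-up store."""

    def get_vendor(self):
        return "example.com"

    def get_name(self):
        # web_poet.WebPage exposes .xpath()/.css() shortcuts on the response
        return self.xpath("//h1[@id='product-title']/text()").get()


# Hypothetical addition to scrapy_project/scrapy_poet_overrides.py:
# ApplyRule("example.com", use=ExampleStorePage, instead_of=ProductPage)
```

With such a rule registered, the shared `base_product_spider` picks up the new page object through scrapy-poet, and `EcommercePricesDBPipeline` stores the resulting items without any spider-level changes.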
5 | 6 | ### Features: 7 | - [scrapy-poet](https://github.com/scrapinghub/scrapy-poet) integration - a better way to organize Scrapy projects 8 | - [proxy middleware](https://github.com/aivarsk/scrapy-proxies) - a simple way to start using proxies with your project 9 | - [database integration](https://github.com/coleifer/peewee) with peewee orm - a simple way to store web scraping output in a database 10 | - [ruff linter and code formatter](https://github.com/astral-sh/ruff) 11 | - [scrapy-redis](https://github.com/rmax/scrapy-redis) integration - a simple way to allow distributed spider runs 12 | - example spiders: google search and amazon.com 13 | 14 | ### Usage 15 | To use this example Scrapy project template, simply fork the repository and start writing your own code. 16 | Customize the spiders, pipelines, and items to suit your specific web scraping project needs. 17 | 18 | ### License: MIT 19 | -------------------------------------------------------------------------------- /scrapy_project/utils/utils.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse, urlunparse 2 | 3 | from peewee import DatabaseProxy 4 | from playhouse.db_url import connect 5 | from playhouse.postgres_ext import PostgresqlExtDatabase 6 | 7 | 8 | class SingletonMeta(type): 9 | _instance = None 10 | 11 | def __call__(cls, *args, **kwargs): 12 | if not cls._instance: 13 | cls._instance = super().__call__(*args, **kwargs) 14 | if ( 15 | not isinstance(cls._instance.obj, PostgresqlExtDatabase) 16 | and "db_uri" in kwargs 17 | ): 18 | db_uri = kwargs["db_uri"] 19 | parsed = urlparse(db_uri) 20 | if parsed.scheme == "postgres": 21 | parsed = parsed._replace(scheme="postgresext") 22 | db_uri = urlunparse(parsed) 23 | conn = connect(db_uri, unquote_password=True) 24 | cls._instance.initialize(conn) 25 | return cls._instance 26 | 27 | 28 | class CustomDatabaseProxy(DatabaseProxy, metaclass=SingletonMeta): 29 | def __init__(self, db_uri=None): 30 | super().__init__() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024, Web Scraping Solutions 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /scrapy_project/spiders/base_spider/product_page_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy_poet import callback_for 2 | 3 | from scrapy_project.page_objects.base_page_objects.product_page import ( 4 | ProductPage, 5 | ) 6 | from scrapy_redis.spiders import RedisSpider 7 | 8 | 9 | class ProductSpider(RedisSpider): 10 | """ 11 | input: redis queue 12 | output: product items 13 | """ 14 | 15 | name = "base_product_spider" 16 | 17 | custom_settings = { 18 | "SCHEDULER": "scrapy_redis.scheduler.Scheduler", 19 | "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter", 20 | "CONCURRENT_REQUESTS": 6, 21 | "CONCURRENT_REQUESTS_PER_DOMAIN": 6, 22 | "DOWNLOAD_DELAY": 0, 23 | "DOWNLOAD_TIMEOUT": 120, 24 | "CLOSESPIDER_ITEMCOUNT": 15000, 25 | "CLOSESPIDER_TIMEOUT": 3600 * 3, 26 | "RETRY_HTTP_CODES": [], 27 | "HTTPERROR_ALLOWED_CODES": [404], 28 | "DOWNLOADER_MIDDLEWARES": { 29 | "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90, 30 | 'scrapy_proxies.RandomProxy': 100, 31 | "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110, 32 | "scrapy_poet.InjectionMiddleware": 543, 33 | }, 34 | "ITEM_PIPELINES": { 35 | "scrapy_project.pipelines.EcommercePricesDBPipeline": 100, 36 | }, 37 | } 38 | 39 | redis_batch_size = 50 40 | redis_key = "redis-product-spider:start_urls" 41 | 42 | parse = callback_for(ProductPage) 43 | -------------------------------------------------------------------------------- /scrapy_project/items.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ProductItem(scrapy.Item): 5 | vendor = scrapy.Field() 6 | status = scrapy.Field() 7 | created = scrapy.Field() 8 | updated = scrapy.Field() 9 | 10 | # General Product Information 11 | availability = scrapy.Field() 12 | color = scrapy.Field() 13 | currency = scrapy.Field() 14 | currencyRaw = scrapy.Field() 15 | productId = scrapy.Field() 16 | 17 | # Product Identification Numbers (GTIN) 18 | gtin = scrapy.Field() 19 | 20 | # Product Images 21 | images = scrapy.Field() 22 | mainImage = scrapy.Field() 23 | 24 | # Product Information 25 | mpn = scrapy.Field() 26 | name = scrapy.Field() 27 | available_quantity = scrapy.Field() 28 | price = scrapy.Field() 29 | regularPrice = scrapy.Field() 30 | size = scrapy.Field() 31 | sku = scrapy.Field() 32 | style = scrapy.Field() 33 | 34 | # Additional product properties 35 | additionalProperties = scrapy.Field() 36 | 37 | # Product URLs 38 | url = scrapy.Field() 39 | canonicalUrl = scrapy.Field() 40 | 41 | # Product Rating 42 | aggregateRating = scrapy.Field() 43 | 44 | # Product Brand Information 45 | brand = scrapy.Field() 46 | 47 | # Breadcrumbs (navigation path) 48 | breadcrumbs = scrapy.Field() 49 | 50 | # Product Features 51 | features = scrapy.Field() 52 | 53 | # Product Description 54 | description = scrapy.Field() 55 | descriptionHtml = scrapy.Field() 56 | 57 | # Product Options 58 | variants = scrapy.Field() 59 | 60 | # Additional metadata 61 | metadata = scrapy.Field() 62 | -------------------------------------------------------------------------------- /scrapy_project/pipelines/base_pipeline/base_pipelines.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | from itemadapter import ItemAdapter 5 | 6 | from scrapy_project.models import connect_to_db 7 | 8 | 9 | class BaseDBPipeline: 10 | 
max_items = 10000 11 | items = [] 12 | total = 0 13 | 14 | def __init__(self, db_uri): 15 | self.db_handle = connect_to_db(db_uri) 16 | 17 | @classmethod 18 | def from_settings(cls, settings): 19 | db_uri = settings.get("DATABASE_URI") 20 | params = { 21 | "db_uri": db_uri, 22 | } 23 | return cls(**params) 24 | 25 | @classmethod 26 | def from_crawler(cls, crawler): 27 | return cls.from_settings(crawler.settings) 28 | 29 | def process_item(self, item, spider): 30 | item_for_db = self.prepare_item( 31 | copy.deepcopy(item) 32 | ) 33 | item_for_db.pop("parent_url", None) 34 | 35 | self.total += 1 36 | self.items.append(ItemAdapter(item_for_db).asdict()) 37 | 38 | if self.total > self.max_items: 39 | self.flush_data() 40 | return item 41 | 42 | def close_spider(self, spider): 43 | self.flush_data() 44 | 45 | def flush_data(self): 46 | with self.db_handle.atomic() as transaction: 47 | try: 48 | self.insert_to_db(self.items) 49 | except Exception as e: 50 | logging.error(f"Error in PostgreSQLItemPipeline: {e}") 51 | transaction.commit() 52 | 53 | self.items = [] 54 | self.total = 0 55 | return None 56 | 57 | def insert_to_db(self, items): 58 | pass 59 | 60 | def prepare_item(self, item): 61 | return item 62 | -------------------------------------------------------------------------------- /scrapy_project/models.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from peewee import * 4 | from playhouse.postgres_ext import JSONField 5 | 6 | from scrapy_project.utils.utils import CustomDatabaseProxy 7 | 8 | 9 | class BaseModel(Model): 10 | class Meta: 11 | database = CustomDatabaseProxy() 12 | 13 | 14 | class ProductItemModel(BaseModel): 15 | vendor = TextField(null=False) # str 16 | status = TextField(null=False) # str 17 | created = DateTimeField(null=False, default=datetime.datetime.now()) 18 | updated = DateTimeField(null=True) 19 | 20 | # General Product Information 21 | availability = TextField(default="InStock", null=True) # str 22 | color = TextField(null=True) # str 23 | currency = TextField(null=True) # str 24 | currencyRaw = TextField(null=True) # str 25 | productId = TextField(null=True) # str 26 | 27 | # Product Identification Numbers (GTIN) 28 | gtin = JSONField(null=True) # List[Dict[str, str]] 29 | 30 | # Product Images 31 | images = JSONField(null=True) # List[Dict[str, str]] 32 | mainImage = JSONField(null=True) # Dict[str, Any] 33 | 34 | # Product Information 35 | mpn = TextField(null=True) # str Manufacture Product Number 36 | name = TextField(null=True) # str Product Name 37 | available_quantity = TextField( 38 | null=True 39 | ) # str How many products are available to order 40 | price = TextField(null=True) # str Product Price 41 | regularPrice = TextField(null=True) # str Regular product price 42 | size = TextField(null=True) # str Product Size 43 | sku = TextField(null=True) # str Product Article 44 | style = TextField(null=True) # str Product Style 45 | 46 | # Additional product properties 47 | additionalProperties = JSONField(null=True) # List[Dict[str, Any]] 48 | 49 | # Product URLs 50 | url = TextField(null=True) # str 51 | canonicalUrl = TextField(null=True) # str 52 | 53 | # Product Rating 54 | aggregateRating = JSONField(null=True) # Dict[str, Any] 55 | 56 | # Product Brand Information 57 | brand = JSONField(null=True) # Dict[str, Any] 58 | 59 | # Breadcrumbs (navigation path) 60 | breadcrumbs = JSONField(null=True) # List[Dict[str, Any]] 61 | 62 | # Product Features 63 | features = JSONField(null=True) # 
List[Dict[str, Any]] 64 | 65 | # Product Description 66 | description = TextField(null=True) # str 67 | descriptionHtml = TextField(null=True) # str 68 | 69 | # Product Options 70 | variants = JSONField(null=True) # List[Dict[str, Any]] 71 | 72 | # Additional metadata 73 | metadata = JSONField(null=True) # Dict[str, Any] 74 | 75 | class Meta: 76 | db_table = "product_item" 77 | 78 | 79 | def connect_to_db(db_uri): 80 | db_handle = CustomDatabaseProxy(db_uri=db_uri) 81 | db_handle.create_tables([ProductItemModel]) # table creation example 82 | return db_handle 83 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/base_page_objects/product_page.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import web_poet 4 | import attr 5 | 6 | from scrapy_project.items import ProductItem 7 | 8 | 9 | @attr.define 10 | class ProductPage(web_poet.WebPage): 11 | page_params: web_poet.PageParams 12 | http: web_poet.HttpClient 13 | 14 | def to_item(self): 15 | item = ProductItem() 16 | item["vendor"] = self.get_vendor() 17 | item["status"] = self.get_status() 18 | 19 | item["updated"] = self.get_updated() 20 | 21 | # General Product Information 22 | item["availability"] = self.get_availability() 23 | item["color"] = self.get_color() 24 | item["currency"] = self.get_currency() 25 | item["currencyRaw"] = self.get_currencyRaw() 26 | item["productId"] = self.get_productId() 27 | 28 | # Product Identification Numbers (GTIN) 29 | item["gtin"] = self.get_gtin() 30 | 31 | # Product Images 32 | item["images"] = self.get_images() 33 | item["mainImage"] = self.get_mainImage() 34 | 35 | # Product Information 36 | item["mpn"] = self.get_mpn() 37 | item["name"] = self.get_name() 38 | item["available_quantity"] = self.get_available_quantity() 39 | item["price"] = self.get_price() 40 | item["regularPrice"] = self.get_regularPrice() 41 | item["size"] = self.get_size() 42 | item["sku"] = self.get_sku() 43 | item["style"] = self.get_style() 44 | 45 | # Additional product properties 46 | item["additionalProperties"] = self.get_additionalProperties() 47 | 48 | # Product URLs 49 | item["url"] = self.url 50 | item["canonicalUrl"] = self.get_canonicalUrl() 51 | 52 | # Product Rating 53 | item["aggregateRating"] = self.get_aggregateRating() 54 | 55 | # Product Brand Information 56 | item["brand"] = self.get_brand() 57 | 58 | # Breadcrumbs (navigation path) 59 | item["breadcrumbs"] = self.get_breadcrumbs() 60 | 61 | # Product Features 62 | item["features"] = self.get_features() 63 | 64 | # Product Description 65 | item["description"] = self.get_description() 66 | item["descriptionHtml"] = self.get_descriptionHtml() 67 | 68 | # Product Options 69 | item["variants"] = self.get_variants() 70 | 71 | # Additional metadata 72 | item["metadata"] = self.get_metadata() 73 | yield item 74 | 75 | def get_vendor(self): 76 | return None 77 | 78 | def get_status(self): 79 | return "NEW" 80 | 81 | def get_created(self): 82 | return None 83 | 84 | def get_updated(self): 85 | return None 86 | 87 | def get_availability(self): 88 | return None 89 | 90 | def get_color(self): 91 | return None 92 | 93 | def get_currency(self): 94 | return None 95 | 96 | def get_currencyRaw(self): 97 | return None 98 | 99 | def get_productId(self): 100 | return None 101 | 102 | def get_gtin(self): 103 | return None 104 | 105 | def get_images(self): 106 | return None 107 | 108 | def get_mainImage(self): 109 | return None 110 | 111 | def 
get_mpn(self): 112 | return None 113 | 114 | def get_name(self): 115 | return None 116 | 117 | def get_available_quantity(self): 118 | return None 119 | 120 | def get_price(self): 121 | return None 122 | 123 | def get_regularPrice(self): 124 | return None 125 | 126 | def get_size(self): 127 | return None 128 | 129 | def get_sku(self): 130 | return None 131 | 132 | def get_style(self): 133 | return None 134 | 135 | def get_additionalProperties(self): 136 | return None 137 | 138 | def get_canonicalUrl(self): 139 | return None 140 | 141 | def get_aggregateRating(self): 142 | return None 143 | 144 | def get_brand(self): 145 | return None 146 | 147 | def get_breadcrumbs(self): 148 | return None 149 | 150 | def get_features(self): 151 | return None 152 | 153 | def get_description(self): 154 | return None 155 | 156 | def get_descriptionHtml(self): 157 | return None 158 | 159 | def get_variants(self): 160 | return None 161 | 162 | def get_metadata(self): 163 | # Get current UTC time 164 | current_utc_time = datetime.utcnow() 165 | # Format the time in ISO 8601 format 166 | formatted_time = current_utc_time.strftime("%Y-%m-%dT%H:%M:%SZ") 167 | return {"dateDownloaded": formatted_time, "probability": 1} 168 | -------------------------------------------------------------------------------- /scrapy_project/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for scrapy_project project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | import os 10 | 11 | BOT_NAME = "scrapy_project" 12 | 13 | SPIDER_MODULES = ["scrapy_project.spiders"] 14 | NEWSPIDER_MODULE = "scrapy_project.spiders" 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | CONCURRENT_REQUESTS = 3 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 0 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 1 31 | # CONCURRENT_REQUESTS_PER_IP = 1 32 | 33 | # Disable cookies (enabled by default) 34 | COOKIES_ENABLED = True 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | DEFAULT_REQUEST_HEADERS = { 41 | "Upgrade-Insecure-Requests": "1", 42 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", 43 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 44 | } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | SPIDER_MIDDLEWARES = {} 49 | 50 | # Enable or disable downloader middlewares 51 | # 
See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 52 | DOWNLOADER_MIDDLEWARES = { 53 | "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90, 54 | "scrapy_proxies.RandomProxy": 100, 55 | "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110, 56 | "scrapy_poet.InjectionMiddleware": 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | EXTENSIONS = {} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | ITEM_PIPELINES = {} 66 | 67 | # Enable and configure the AutoThrottle extension (disabled by default) 68 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 69 | AUTOTHROTTLE_ENABLED = False 70 | # The initial download delay 71 | AUTOTHROTTLE_START_DELAY = 1 72 | # The maximum download delay to be set in case of high latencies 73 | AUTOTHROTTLE_MAX_DELAY = 0.8 74 | # The average number of requests Scrapy should be sending in parallel to 75 | # each remote server 76 | AUTOTHROTTLE_TARGET_CONCURRENCY = 16 77 | DOWNLOAD_TIMEOUT = 120 78 | # Enable showing throttling stats for every response received: 79 | # AUTOTHROTTLE_DEBUG = False 80 | 81 | # Enable and configure HTTP caching (disabled by default) 82 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 83 | HTTPCACHE_ENABLED = False 84 | HTTPCACHE_EXPIRATION_SECS = 0 85 | HTTPCACHE_DIR = "httpcache" 86 | HTTPCACHE_IGNORE_HTTP_CODES = [429, 500, 503, 504, 400, 403, 404, 408] 87 | HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 88 | 89 | from .scrapy_poet_overrides import _SCRAPY_POET_OVERRIDES 90 | 91 | SCRAPY_POET_OVERRIDES = _SCRAPY_POET_OVERRIDES 92 | 93 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 94 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 95 | # SCHEDULER_PERSIST = True 96 | 97 | REDIS_URL = os.environ.get("REDIS_URL") 98 | DATABASE_URI = os.environ.get("DATABASE_URI") 99 | 100 | # Retry many times since proxies often fail 101 | RETRY_TIMES = 1 102 | # Retry on most error codes since proxies fail for different reasons 103 | RETRY_HTTP_CODES = [429, 500, 503, 504, 400, 403, 404, 408] 104 | 105 | # Proxy mode 106 | # 0 = Every request has different proxy 107 | # 1 = Take only one proxy from the list and assign it to every requests 108 | # 2 = Put a custom proxy to use in the settings 109 | PROXY_MODE = 1 110 | # If proxy mode is 2 uncomment this sentence : 111 | CUSTOM_PROXY = "http://host1:port" 112 | 113 | SCRAPEOPS_API_KEY = os.environ.get("SCRAPEOPS_API_KEY") 114 | PROXY_LIST = [ 115 | f"http://scrapeops.country=us:{SCRAPEOPS_API_KEY}@proxy.scrapeops.io:5353", 116 | ] 117 | 118 | GOOGLE_SHEETS_CREDENTIALS = {} 119 | 120 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 121 | -------------------------------------------------------------------------------- /scrapy_project/spiders/google_search_scraper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC 3 | from typing import Any, Iterable 4 | from urllib import parse 5 | 6 | import scrapy 7 | from scrapy.http import Request, TextResponse 8 | 9 | from scrapy_project.settings import ( 10 | SCRAPEOPS_API_KEY, 11 | ) 12 | 13 | 14 | class GoogleSearchLinksSpider(scrapy.Spider, ABC): 15 | """ 16 | Collect links from google search, no older than 7 days. 17 | Spider gets initial domain links from google sheets by google api. 
18 | 19 | Use spider arguments to start gathering links of exact domain 20 | or provide the spider with args in case it runs on Zyte. 21 | Example: scrapy crawl google_search_links_parser -a domain_link="https://www.example.com" 22 | 23 | see https://scrapeops.io/docs/proxy-aggregator/advanced-functionality/auto-extract/ for reference 24 | """ 25 | 26 | name = "google_search_links_parser" 27 | 28 | custom_settings = { 29 | "DOWNLOADER_MIDDLEWARES": {}, 30 | "ITEM_PIPELINES": {}, 31 | "CONCURRENT_REQUESTS": 5, 32 | "DOWNLOAD_TIMEOUT": 120, 33 | "DOWNLOAD_DELAY": 0, 34 | "RETRY_TIMES": 1, 35 | } 36 | 37 | def start_requests(self) -> Iterable[Request]: 38 | domain_links = ["bbc.com", "cnn.com"] 39 | for domain_link in domain_links: 40 | yield from self.generate_request_to_google_search( 41 | domain=parse.urlparse(domain_link).netloc, 42 | initial_link=domain_link, 43 | ) 44 | 45 | def generate_request_to_google_search( 46 | self, domain: str, initial_link: str 47 | ) -> Iterable[Request]: 48 | url = self.get_google_url(page=0, domain=domain) 49 | yield scrapy.Request( 50 | url=url, 51 | cb_kwargs=dict( 52 | domain=domain, 53 | initial_link=initial_link, 54 | ), 55 | ) 56 | 57 | def parse(self, response: TextResponse, **cb_kwargs: Any) -> Any: 58 | data = response.json().get("data") 59 | if data: 60 | results = data.get("organic_results") or data.get("articles") 61 | logging.info(f"found {len(results)} results") 62 | for result in results: 63 | if not self.is_valid_link( 64 | initial_link=cb_kwargs.get("initial_link"), 65 | link=result.get("link"), 66 | ): 67 | continue 68 | yield { 69 | "url": result.get("link"), 70 | "domain": cb_kwargs.get("domain"), 71 | } 72 | page = self.get_page_number(data) 73 | if page > 0: 74 | url = self.get_google_url( 75 | page=page, domain=cb_kwargs.get("domain") 76 | ) 77 | yield scrapy.Request( 78 | url=url, 79 | cb_kwargs=cb_kwargs, 80 | ) 81 | else: 82 | logging.warning( 83 | "api.scraperapi.com did NOT return relevant results" 84 | ) 85 | 86 | @staticmethod 87 | def get_google_url(page: int, domain: str) -> str: 88 | """ 89 | Find results from specific dates: 90 | as_qdr=x 91 | Swap out x for the following to limit the search to only files first indexed in: 92 | d - the previous 24 hours 93 | w - the previous seven days 94 | m - the previous month 95 | y - past year 96 | mn - the previous n number of months. So m2 would be the previous two, m3 would be three, and so on. 
97 | Does work into double digits 98 | """ 99 | # feel free to edit google url to fit your use case 100 | google_url = ( 101 | f"https://www.google.com/search?q=site:{domain}&" 102 | f"as_qdr=w&tbm=nws&start={page}&num=100" 103 | ) 104 | url = ( 105 | f"https://proxy.scrapeops.io/v1/?api_key={SCRAPEOPS_API_KEY}" 106 | f"&url={parse.quote_plus(google_url)}" 107 | "&auto_extract=google" 108 | ) 109 | return url 110 | 111 | def get_page_number(self, data: dict) -> int: 112 | pagination = data.get("pagination") 113 | pagination_url = pagination.get("load_more_url") 114 | page = 0 115 | if pagination_url: 116 | page = pagination_url.split("start%3D")[1].split( 117 | "%26sa%3DN&autoparse=" 118 | )[0] 119 | else: 120 | if pagination.get("next_page_url"): 121 | page = (int(pagination.get("current_page")) + 1) * 10 122 | return int(page) 123 | 124 | @staticmethod 125 | def is_valid_link(initial_link, link): 126 | return ( 127 | link != initial_link 128 | and parse.urlparse(link).path 129 | and parse.urlparse(link).path != "/" 130 | ) 131 | -------------------------------------------------------------------------------- /scrapy_project/page_objects/website_specific_page_objects/amazon.py: -------------------------------------------------------------------------------- 1 | from scrapy_project.page_objects.base_page_objects import ProductPage 2 | from price_parser import Price 3 | 4 | 5 | class AmazonProductPage(ProductPage): 6 | """ 7 | https://www.amazon.com/High-Protein-Bars-thinkThin-Non-GMO/dp/B00VXQGKRM 8 | """ 9 | 10 | def get_vendor(self): 11 | return "amazon.com" 12 | 13 | def get_availability(self): 14 | product_is_available = self.xpath( 15 | "//div[@id='availability']/span/text()" 16 | ).get() 17 | # Check if any product is available 18 | if product_is_available: 19 | if any( 20 | s in product_is_available.lower().strip() 21 | for s in ["in stock", "usually ships within"] 22 | ): 23 | return "InStock" 24 | else: 25 | return "OutOfStock" 26 | else: 27 | return "OutOfStock" 28 | 29 | def get_color(self): 30 | return None 31 | 32 | def get_currency(self): 33 | price_string = self.get_current_price() 34 | if Price.fromstring(price_string).currency == "$": 35 | return "USD" 36 | else: 37 | return None 38 | 39 | def get_currencyRaw(self): 40 | price_string = self.get_current_price() 41 | return Price.fromstring(price_string).currency 42 | 43 | def get_productId(self): 44 | return self.get_canonicalUrl() 45 | 46 | def get_gtin(self): 47 | return [] 48 | 49 | def get_images(self): 50 | image_selectors = [ 51 | "//div[@id='altImages']//li//img/@src", 52 | "//div[@id='imgTagWrapperId']/img/@src", 53 | ] 54 | 55 | image_urls = next( 56 | ( 57 | urls 58 | for selector in image_selectors 59 | if (urls := self.xpath(selector).getall()) 60 | ), 61 | [], 62 | ) 63 | 64 | image_urls = [ 65 | ".".join(image_url.rsplit(".", 2)[::2]) 66 | for image_url in image_urls 67 | if "gif" not in image_url 68 | ] 69 | 70 | image_urls = list(set(image_urls)) 71 | return image_urls 72 | 73 | def get_mainImage(self): 74 | mainImage_url = self.xpath( 75 | "//div[@id='imgTagWrapperId']/img/@src" 76 | ).get() 77 | return mainImage_url 78 | 79 | def get_mpn(self): 80 | attributes = self.get_additionalProperties() 81 | mpn_column_names = ["Item model number", "Part Number"] 82 | return next( 83 | ( 84 | attributes.get(name) 85 | for name in mpn_column_names 86 | if name in attributes.keys() 87 | ), 88 | None, 89 | ) 90 | 91 | def get_name(self): 92 | return self.xpath("//span[@id='productTitle']/text()").get() 93 | 94 | def 
get_available_quantity(self): 95 | # there is no simple way to collect available quantity on amazon.com 96 | # if you really need this value - create a dedicated spider 97 | return None 98 | 99 | def get_current_price(self): 100 | if self.get_availability() != "InStock": 101 | return None 102 | 103 | selectors = [ 104 | "//div[@class='a-section a-spacing-micro']" 105 | "//span[@class='a-offscreen']/text()" 106 | "//span[contains(text(), 'List Price')]" 107 | "//span[@class='a-offscreen']/text()", 108 | "//span[@id='price_inside_buybox']/text()", 109 | ] 110 | 111 | for xpath in selectors: 112 | price = self.xpath(xpath).get() 113 | if price: 114 | return price 115 | return None 116 | 117 | def get_price(self): 118 | price_string = self.get_current_price() 119 | return Price.fromstring(price_string).amount_text 120 | 121 | def get_regularPrice(self): 122 | regularPrice_string = self.xpath( 123 | "//span[contains(text(),'List Price:')]" 124 | "/span/span[@class='a-offscreen']/text()" 125 | ).get() 126 | return Price.fromstring(regularPrice_string).amount_text 127 | 128 | def get_sku(self): 129 | return self.url.split("/dp/", 1)[1] 130 | 131 | def get_additionalProperties(self): 132 | additionalProperties = {} 133 | 134 | def extract_attributes( 135 | xpath_query, name_query, info_query, name_clean=True 136 | ): 137 | for row in self.xpath(xpath_query): 138 | attribute_name = row.xpath(name_query).get() 139 | if attribute_name: 140 | attribute_name = attribute_name.strip() 141 | if name_clean: 142 | attribute_name = attribute_name.split(":")[0] 143 | attribute_name = self.clean_string(attribute_name) 144 | if ( 145 | attribute_name != "Customer Reviews" 146 | ): # this attribute should be stored in a separate field 147 | attributes_info = row.xpath(info_query).getall() 148 | attributes_info = self.clean_string(attributes_info) 149 | additionalProperties[attribute_name] = attributes_info 150 | 151 | tech_details = self.xpath("//div[@id='prodDetails']//table//tr") 152 | if len(tech_details) == 0: 153 | extract_attributes( 154 | "//div[contains(@id, 'productOverview')]//tr", 155 | "./td[1]/span//text()", 156 | "./td[2]/span[contains(@class,'a-size-base')]//text()", 157 | ) 158 | else: 159 | extract_attributes( 160 | "//div[@id='prodDetails']//table//tr", 161 | "./th/text()", 162 | "./td//text()", 163 | name_clean=False, 164 | ) 165 | 166 | extract_attributes( 167 | "//div[@id='detailBullets_feature_div']" 168 | "/ul/li/span[@class='a-list-item']", 169 | "./span[1]//text()", 170 | "./span[2]/text()", 171 | ) 172 | 173 | return additionalProperties 174 | 175 | def clean_string(self, string): 176 | if isinstance(string, list): 177 | string = " ".join(string) 178 | return ( 179 | string.replace("\n", "") 180 | .replace("\u200e", "") 181 | .replace("\u200f", "") 182 | .strip() 183 | ) 184 | 185 | def get_canonicalUrl(self): 186 | return self.url.split("?")[0] 187 | 188 | def get_aggregateRating(self): 189 | ratingValue = float( 190 | self.xpath( 191 | "//span[contains(@class,'reviewCountTextLinkedHistogram')]" 192 | "//a/span/text()" 193 | ).get() 194 | ) 195 | 196 | number_str = "".join( 197 | filter( 198 | str.isdigit, 199 | self.xpath( 200 | "//a[@id='acrCustomerReviewLink']/span/text()" 201 | ).get(), 202 | ) 203 | ) 204 | reviewCount = int(number_str) 205 | return { 206 | "bestRating": 5, 207 | "ratingValue": ratingValue, 208 | "reviewCount": reviewCount, 209 | } 210 | 211 | def get_brand(self): 212 | additionalProperties = self.get_additionalProperties() 213 | return 
additionalProperties.get("Brand") or additionalProperties.get( 214 | "Manufacturer" 215 | ) 216 | 217 | def get_breadcrumbs(self): 218 | breadcrumb_elements = self.xpath( 219 | "//div[contains(@*, 'breadcrumbs')]//ul/li/span/a" 220 | ) 221 | breadcrumbs = [ 222 | { 223 | "url": element.xpath("./@href").get(), 224 | "name": element.xpath("./text()") 225 | .get() 226 | .strip() 227 | .replace("\n", ""), 228 | } 229 | for element in breadcrumb_elements 230 | ] 231 | return breadcrumbs 232 | 233 | def get_features(self): 234 | return [] 235 | 236 | def get_description(self): 237 | product_description = self.xpath( 238 | "//h3[contains(span, 'Product Description')]" 239 | "//following::p[1]/span/text()" 240 | ).getall() 241 | if not product_description: 242 | product_description = self.xpath( 243 | "//div[@id='productDescription']/p/span/text()" 244 | ).getall() 245 | 246 | return product_description 247 | 248 | def get_descriptionHtml(self): 249 | product_description_html = self.xpath( 250 | "//h3[contains(span, 'Product Description')]" 251 | "//following::p[1]/span" 252 | ).getall() 253 | if not product_description_html: 254 | product_description_html = self.xpath( 255 | "//div[@id='productDescription']/p/span" 256 | ).getall() 257 | return product_description_html 258 | 259 | def get_variants(self): 260 | return [] 261 | --------------------------------------------------------------------------------
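End-to-end smoke test (a minimal sketch, not part of the repository): assuming the requirements are installed and the `REDIS_URL` and `DATABASE_URI` environment variables point at reachable Redis and PostgreSQL instances, one product URL can be pushed onto the queue that `base_product_spider` consumes. The Amazon URL is the one referenced in the `AmazonProductPage` docstring and is used purely as an illustration.

```python
# Sketch: seed the scrapy-redis queue with a single product URL.
import json

import redis

from scrapy_project.settings import REDIS_URL

QUEUE_KEY = "redis-product-spider:start_urls"  # must match ProductSpider.redis_key

if __name__ == "__main__":
    with redis.from_url(url=REDIS_URL) as redis_connect:
        # same {"url": ...} JSON payload that scripts/populate_redis_queue.py pushes
        payload = json.dumps(
            {
                "url": "https://www.amazon.com/High-Protein-Bars-thinkThin-Non-GMO/dp/B00VXQGKRM"
            }
        )
        redis_connect.lpush(QUEUE_KEY, payload)
    # Then, from the project root:
    #   scrapy crawl base_product_spider
```

Because the URL matches the `amazon.com` `ApplyRule` pattern in `scrapy_poet_overrides.py`, scrapy-poet injects `AmazonProductPage` in place of the base `ProductPage`, and the scraped item ends up in the `product_item` table via `EcommercePricesDBPipeline`.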