├── LICENSE
├── MCPfiles
│   ├── Camoufox_template.py
│   └── xpath_server.py
├── README.md
└── cursor-rules
    ├── prerequisites.mdc
    ├── scraper-models.mdc
    ├── scrapy-step-by-step-process.mdc
    ├── scrapy.mdc
    └── website-analysis.mdc
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 The Web Scraping Club
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MCPfiles/Camoufox_template.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Template scraper: the MCP tool write_camoufox_scraper reads this file and adapts it to the target website.
3 | # When writing the scraper, modify the output field list to match the template you're using.
4 | from camoufox.sync_api import Camoufox
5 | import time
6 | from random import randrange
7 | import random
8 | from scrapy.http import HtmlResponse
9 | import csv
10 | from datetime import datetime
11 | 
12 | 
13 | def scroll_down(page):
14 |     """A method for scrolling the page."""
15 |     # Scroll down in fixed steps, pausing a few seconds between wheel events.
16 |     i = 0
17 |     while i < 20:
18 |         page.mouse.wheel(0, 20000)
19 |         interval = randrange(3, 5)
20 |         time.sleep(interval)
21 |         print(i)
22 |         i = i + 1
23 | 
24 | 
25 | def accept_cookies(page):
26 |     try:
27 |         page.locator('xpath=//button[@id="onetrust-accept-btn-handler"]').click()
28 |     except Exception:
29 |         pass
30 | url = "https://REPLACE-WITH-START-URL"  # placeholder: set the listing page URL before running this template
31 | with Camoufox(humanize=True,
32 |               geoip=True
33 |               ) as browser:
34 |     page = browser.new_page()
35 |     page.goto(url)
36 |     page.wait_for_load_state()
37 |     page.locator('xpath=//button[@id="collection-button"]').click()
38 |     interval = randrange(5, 10)
39 |     time.sleep(interval)
40 |     accept_cookies(page)
41 | 
42 |     html_page = page.content()
43 |     response_sel = HtmlResponse(url=page.url, body=html_page, encoding='utf-8')
44 |     articles = response_sel.xpath('//div[@class="product-item"]')
45 |     print(len(articles))
46 |     with open("output.txt", "a") as file:
47 |         csv_file = csv.writer(file, delimiter="|")
48 | 
49 |         for product in articles:
50 |             price = product.xpath('.//span[contains(@class, "price")]/text()').extract()[0].strip()
51 |             fullprice = price
52 |             product_code = product.xpath('.//div[@class="product-item-meta"]/@id').extract()[0].split('-')[-1]
53 |             print(product_code)
54 |             currency = 'EUR'
55 |             country = 'ITA'
56 |             product_url = "https://www.hermes.com" + product.xpath('.//a/@href').extract()[0].strip()
57 |             brand = website = 'HERMES'
58 |             #print(brand)
59 |             date = (datetime.now()).strftime("%Y%m%d")
60 |             #print(date)
61 |             try:
62 |                 gender = response_sel.xpath('//span[@class="header-title-parent"]/text()').extract()[0]
63 |             except Exception:
64 |                 gender = 'n.a.'
65 |             try:
66 |                 category = response_sel.xpath('//span[@class="header-result-title"]/text()').extract()[0]
67 |             except Exception:
68 |                 category = 'n.a.'
69 | 
70 |             #print(product.xpath('.//figure').extract()[0])
71 |             imageurl = "https:" + product.xpath('.//img/@src').extract()[0]
72 |             title = product.xpath('.//span[@class="product-item-name"]/text()').extract()[0].strip()
73 |             csv_file.writerow([product_code, gender, brand, category, fullprice, price, currency, country, date, product_url, imageurl, title, website])
74 |     # no explicit file.close() needed: the with-block closes the file
75 | 
--------------------------------------------------------------------------------
/MCPfiles/xpath_server.py:
--------------------------------------------------------------------------------
1 | from mcp.server.fastmcp import FastMCP
2 | import asyncio
3 | from camoufox.async_api import AsyncCamoufox
4 | import time
5 | import os
6 | import json
7 | import re
8 | 
9 | mcp = FastMCP("Scrapy XPath Generator")
10 | CAMOUFOX_FILE_PATH = "/PATH/TO/camoufox_template.py"
11 | HTML_FILE_PATH = "/PATH/TO/saved_page.html"  # placeholder: where fetch_page_content is expected to save the HTML (used by generate_xpaths)
12 | @mcp.tool()
13 | async def fetch_page_content(url: str, html_file_path: str, cookies_file_path: str) -> str:
14 |     """Fetch page HTML using the Camoufox stealth browser. Save the HTML code in the path specified."""
15 |     global latest_html
16 | 
17 |     print(f"[DEBUG] Fetching URL: {url}")
18 |     try:
19 |         async with AsyncCamoufox(humanize=True) as browser:
20 |             page = await browser.new_page()
21 |             await page.goto(url)
22 |             await asyncio.sleep(10)  # non-blocking wait for late-loading content
23 |             latest_html = await page.content()
24 |             cookies = await page.context.cookies()
25 |             with open(html_file_path, "w", encoding="utf-8") as f:
26 |                 f.write(latest_html)
27 |             with open(cookies_file_path, "w", encoding="utf-8") as f:
28 |                 json.dump(cookies, f, indent=2)
29 |             print("[DEBUG] HTML stored for later use")
30 |             return "HTML fetched and stored successfully."
31 |     except Exception as e:
32 |         print(f"[ERROR] {e}")
33 |         return f"Error fetching page: {str(e)}"
34 | 
35 | @mcp.tool()
36 | def generate_xpaths(template: str) -> dict:
37 |     """Write XPATH selectors for the requested fields using the downloaded HTML file."""
38 | 
39 |     if not os.path.exists(HTML_FILE_PATH):
40 |         return {"error": "No HTML file found. Run fetch_page_content() first."}
41 | 
42 |     if template.lower() == "plp":
43 |         fields = "product title, product link, product price, product image, product code"
44 |     elif template.lower() == "pdp":
45 |         fields = "product title, product price, product description, product image, product color, product size, product code"
46 |     else:
47 |         return {"error": "Unknown template type"}
48 | 
49 |     # Return the requested fields so Cursor can analyze the stored HTML and write the selectors
50 |     return {
51 |         "message": "Print the XPath expressions for the requested fields using the variable latest_html.",
52 |         "requested_fields": fields
53 |     }
54 | 
55 | @mcp.tool()
56 | def write_camoufox_scraper(template: str, url: str, html_file_path: str) -> dict:
57 |     """Reads the file Camoufox_template.py and uses it to write a new Camoufox scraper with the requested fields, starting from the given URL. Save the HTML code in the path specified."""
58 |     print(f"[DEBUG] Writing scraper for template: {template} and URL: {url}. Saving the file in the path {html_file_path}")
59 |     with open(CAMOUFOX_FILE_PATH, "r", encoding="utf-8") as f:
60 |         template_code = f.read()
61 |     return {"message": "Using this template, write a working scraper with the requested fields and starting URL", "template_code": template_code}
62 | 
63 | @mcp.tool()
64 | def strip_css(html_input_file: str, html_output_file: str) -> dict:
65 |     # Read the HTML file
66 |     with open(html_input_file, 'r', encoding='utf-8') as file:
67 |         html_content = file.read()
68 | 
69 |     # Remove style tags and their content
70 |     html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)
71 | 
72 |     # Remove CSS emotion attributes
73 |     html_content = re.sub(r'data-emotion="css[^"]*"', '', html_content)
74 | 
75 |     # Remove class attributes with CSS references
76 |     html_content = re.sub(r'class="css-[^"]*"', '', html_content)
77 | 
78 |     # Write the cleaned HTML to a new file
79 |     with open(html_output_file, 'w', encoding='utf-8') as file:
80 |         file.write(html_content)
81 | 
82 |     return {"message": f"CSS stripped successfully. New file created: {html_output_file}"}
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     # Initialize and run the server
87 |     mcp.run(transport='stdio')
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Cursor-Scraping-Assistant
2 | 
3 | A powerful tool that leverages Cursor AI and MCP (Model Context Protocol) to easily generate web scrapers for various types of websites. This project helps you quickly analyze websites and generate proper Scrapy or Camoufox scrapers with minimal effort.
4 | 
5 | ## Project Overview
6 | 
7 | This project contains two main components:
8 | 
9 | 1. **Cursor Rules** - A set of rules that teach Cursor AI how to analyze websites and create different types of Scrapy spiders
10 | 2.
**MCP Tools** - A collection of Model Context Protocol tools that enhance Cursor's capabilities for web scraping tasks 11 | 12 | ## Prerequisites 13 | 14 | - [Cursor AI](https://cursor.sh/) installed 15 | - Python 3.10+ installed 16 | - Basic knowledge of web scraping concepts 17 | 18 | ## Installation 19 | 20 | Clone this repository to your local machine: 21 | 22 | ```bash 23 | git clone https://github.com/TheWebScrapingClub/AI-Cursor-Scraping-Assistant.git 24 | cd AI-Cursor-Scraping-Assistant 25 | ``` 26 | 27 | Install the required dependencies: 28 | 29 | ```bash 30 | pip install mcp camoufox scrapy 31 | ``` 32 | 33 | If you plan to use Camoufox, you'll need to fetch its browser binary: 34 | 35 | ```bash 36 | python -m camoufox fetch 37 | ``` 38 | 39 | ## Setup 40 | 41 | ### Setting Up MCP Server 42 | 43 | The MCP server provides tools that help Cursor AI analyze web pages and generate XPath selectors. To start the MCP server: 44 | 45 | 1. Navigate to the MCPfiles directory: 46 | ```bash 47 | cd MCPfiles 48 | ``` 49 | 50 | 2. Update the `CAMOUFOX_FILE_PATH` in `xpath_server.py` to point to your local `Camoufox_template.py` file. 51 | 52 | 3. Start the MCP server: 53 | ```bash 54 | python xpath_server.py 55 | ``` 56 | 57 | 4. In Cursor, connect to the MCP server by configuring it in the settings or using the MCP panel. 58 | 59 | ### Cursor Rules 60 | 61 | The cursor-rules directory contains rules that teach Cursor AI how to analyze websites and create different types of scrapers. These rules are automatically loaded when you open the project in Cursor. 62 | 63 | ## Detailed Cursor Rules Explanation 64 | 65 | The `cursor-rules` directory contains a set of MDC (Markdown Configuration) files that guide Cursor's behavior when creating web scrapers: 66 | 67 | ### `prerequisites.mdc` 68 | This rule handles initial setup tasks before creating any scrapers: 69 | - Gets the full path of the current project using `pwd` 70 | - Stores the path in context for later use by other rules 71 | - Confirms the execution of preliminary actions before proceeding 72 | 73 | ### `website-analysis.mdc` 74 | This comprehensive rule guides Cursor through website analysis: 75 | - Identifies the type of Scrapy spider to build (PLP, PDP, etc.) 76 | - Fetches and stores homepage HTML and cookies 77 | - Strips CSS using the MCP tool to simplify HTML analysis 78 | - Checks cookies for anti-bot protection (Akamai, Datadome, PerimeterX, etc.) 
79 | - For PLP scrapers: fetches category pages, analyzes structure, looks for JSON data 80 | - For PDP scrapers: fetches product pages, analyzes structure, looks for JSON data 81 | - Detects schema.org markup and modern frameworks like Next.js 82 | 83 | ### `scrapy-step-by-step-process.mdc` 84 | This rule provides the execution flow for creating scrapers: 85 | - Outlines the sequence of steps to follow 86 | - References other rule files in the correct order 87 | - Ensures prerequisite actions are completed before scraper creation 88 | - Guides Cursor to analyze the website before generating code 89 | 90 | ### `scrapy.mdc` 91 | This extensive rule contains Scrapy best practices: 92 | - Defines recommended code organization and directory structure 93 | - Details file naming conventions and module organization 94 | - Provides component architecture guidelines 95 | - Offers strategies for code splitting and reuse 96 | - Includes performance optimization recommendations 97 | - Covers security practices, error handling, and logging 98 | - Provides specific syntax examples and code snippets 99 | 100 | ### `scraper-models.mdc` 101 | This rule defines the different types of scrapers that can be created: 102 | - **E-commerce PLP**: Details the data structure, field definitions, and implementation steps 103 | - **E-commerce PDP**: Details the data structure, field definitions, and implementation steps 104 | - Field mapping guidelines for all scraper types 105 | - Step-by-step instructions for creating each type of scraper 106 | - Default settings recommendations 107 | - Anti-bot countermeasures for different protection systems 108 | 109 | ## Usage 110 | 111 | Here's how to use the AI-Cursor-Scraping-Assistant: 112 | 113 | 1. Open the project in Cursor AI 114 | 2. Make sure the MCP server is running 115 | 3. Ask Cursor to create a scraper with a prompt like: 116 | ``` 117 | Write an e-commerce PLP scraper for the website gucci.com 118 | ``` 119 | 120 | Cursor will then: 121 | 1. Analyze the website structure 122 | 2. Check for anti-bot protection 123 | 3. Extract the relevant HTML elements 124 | 4. Generate a complete Scrapy spider based on the website type 125 | 126 | ## Available Scraper Types 127 | 128 | You can request different types of scrapers: 129 | 130 | - **E-commerce PLP (Product Listing Page)** - Scrapes product catalogs/category pages 131 | - **E-commerce PDP (Product Detail Page)** - Scrapes detailed product information 132 | 133 | For example: 134 | ``` 135 | Write an e-commerce PDP scraper for nike.com 136 | ``` 137 | 138 | ## Advanced Usage 139 | 140 | ### Camoufox Integration 141 | 142 | The project includes a Camoufox template for creating stealth scrapers that can bypass certain anti-bot measures. The MCP tools help you: 143 | 144 | 1. Fetch page content using Camoufox 145 | 2. Generate XPath selectors for the desired elements 146 | 3. Create a complete Camoufox scraper based on the template 147 | 148 | ### Custom Scrapers 149 | 150 | You can extend the functionality by adding new scraper types to the cursor-rules files. The modular design allows for easy customization. 
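### Registering the MCP Server via mcp.json

If you prefer a file-based setup to the MCP panel, Cursor can also load MCP servers from an `mcp.json` configuration. The snippet below is a minimal sketch; the exact file location (a project-level `.cursor/mcp.json` or the global Cursor configuration) and the Python command path are assumptions you should adapt to your environment:

```json
{
  "mcpServers": {
    "xpath-generator": {
      "command": "python",
      "args": ["/absolute/path/to/AI-Cursor-Scraping-Assistant/MCPfiles/xpath_server.py"]
    }
  }
}
```

Because `xpath_server.py` uses the stdio transport, Cursor can launch and manage the server process itself once the entry is configured.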
151 | 152 | ## Project Structure 153 | 154 | ``` 155 | AI-Cursor-Scraping-Assistant/ 156 | ├── MCPfiles/ 157 | │ ├── xpath_server.py # MCP server with web scraping tools 158 | │ └── Camoufox_template.py # Template for Camoufox scrapers 159 | ├── cursor-rules/ 160 | │ ├── website-analysis.mdc # Rules for analyzing websites 161 | │ ├── scrapy.mdc # Best practices for Scrapy 162 | │ ├── scrapy-step-by-step-process.mdc # Guide for creating scrapers 163 | │ ├── scraper-models.mdc # Templates for different scraper types 164 | │ └── prerequisites.mdc # Setup requirements 165 | └── README.md 166 | ``` 167 | 168 | ## TODO: Future Enhancements 169 | 170 | The following features are planned for future development: 171 | 172 | ### Proxy Integration 173 | - Add proxy support when requested by the operator 174 | - Implement proxy rotation strategies 175 | - Support for different proxy providers 176 | - Handle proxy authentication 177 | - Integrate with popular proxy services 178 | 179 | ### Improved XPath Generation and Validation 180 | - Add validation mechanisms for generated XPath selectors 181 | - Implement feedback loop for selector refinement 182 | - Control flow management for reworking selectors 183 | - Auto-correction of problematic selectors 184 | - Handle edge cases like dynamic content and AJAX loading 185 | 186 | ### Other Planned Features 187 | - Support for more scraper types (news sites, social media, etc.) 188 | - Integration with additional anti-bot bypass techniques 189 | - Enhanced JSON extraction capabilities 190 | - Support for more complex navigation patterns 191 | - Multi-page scraping optimizations 192 | 193 | ## References 194 | 195 | This project is based on articles from The Web Scraping Club: 196 | 197 | - [Claude & Cursor AI Scraping Assistant](https://substack.thewebscraping.club/p/claude-cursor-ai-scraping-assistant) 198 | - [Cursor MCP Web Scraping Assistant](https://substack.thewebscraping.club/p/cursor-mcp-web-scraping-assistant) 199 | 200 | For more information on web scraping techniques and best practices, visit [The Web Scraping Club](https://thewebscrapingclub.com). 201 | 202 | ## Contributing 203 | 204 | Contributions are welcome! Please feel free to submit a Pull Request. 205 | 206 | ## License 207 | 208 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /cursor-rules/prerequisites.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | --- 7 | description: This rule provides the action that should be taken before starting implementing a Scrapy spider. 8 | globs: **/*.py 9 | --- 10 | # Step by step prerequisites actions to perform 11 | - Get with the pwd command the full path of the current project. Store in the context the current full path of the project. After you read it, please confirm in chat you did this step. 12 | - Once you performed every prerequisite, write in chat that you performed all the prerequisite actions. 13 | 14 | 15 | -------------------------------------------------------------------------------- /cursor-rules/scraper-models.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | --- 7 | description: This rule provides the description of the possible scraper types that can be created. 
8 | globs: **/*.py
9 | ---
10 | # Scraper types
11 | Here's a list of the possible scraper types and their data structures.
12 | 
13 | ## E-commerce PLP
14 | - The structure of the items in a PLP scraper is the following: website_name, extraction_date, product_code, item_url, full_price, price, currency, image_url, brand, product_category1, product_category2, product_category3, product_name
15 | - When asked to create an e-commerce PLP scraper, set up the items.py file and the scraper output according to this data structure (a minimal items.py sketch is shown after the field-mapping rules below).
16 | - PLP pages are the product list pages of an e-commerce site, also called catalogue pages. An e-commerce PLP scraper should crawl the whole product catalogue without entering the product detail pages.
17 | - The scraper will usually start from a home page.
18 | 
19 | 
20 | ## E-commerce PDP
21 | - The structure of the items in a PDP scraper is the following: website_name, extraction_date, product_code, item_url, full_price, price, currency, image_url, brand, product_category1, product_category2, product_category3, product_name, product_description, product_size, product_color, additional_info
22 | - When asked to create an e-commerce PDP scraper, set up the items.py file and the scraper output according to this data structure.
23 | - PDP pages are product detail pages, the final leaf of an e-commerce website. An e-commerce PDP scraper takes a list of PDP URLs as input and doesn't need to crawl the website further.
24 | 
25 | 
26 | 
27 | # How to fill the scraper fields with values
28 | - **When asked to map a field of a data structure to the information contained in the HTML, use the following rules:**
29 | - **website_name**: a fixed value for each scraper, usually the website's name in upper case. If in doubt, ask the operator
30 | - **extraction_date**: fixed value for the whole execution, in YYYY-MM-DD format. Use the datetime library
31 | - **product_code**: the code that identifies every single product on the website
32 | - **item_url**: URL of the page containing the details of the product. For the PDP data structure, it corresponds to response.url
33 | - **full_price**: price before discounts. If there's no discount on the item, it's the selling price.
34 | - **price**: final selling price after discounts. If there's no discount, it's the selling price.
35 | - **currency**: ISO3 code for the currency, a fixed value for the whole scraper. Detect the currency from the HTML and use the ISO code to populate the field
36 | - **brand**: brand or producer of the product sold on the website
37 | - **product_category1**: first level of product categorization, usually the first level of the page's breadcrumb, if any.
38 | - **product_category2**: second level of product categorization, usually the second level of the page's breadcrumb, if any.
39 | - **product_category3**: third level of product categorization, usually the third level of the page's breadcrumb, if any.
40 | - **product_name**: name of the product as shown on the pages
41 | - In any case and for any field of a scraper, do not hardcode values; always find a selector to get the correct one.
42 | - Always print in output every field of the scraper, even if it's empty.
43 | 
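A minimal items.py sketch matching the two data structures above could look like this (class names are illustrative; the PDP item simply extends the PLP one with the detail-level fields):

```python
import scrapy


class PlpItem(scrapy.Item):
    # Fields of the e-commerce PLP data structure
    website_name = scrapy.Field()
    extraction_date = scrapy.Field()
    product_code = scrapy.Field()
    item_url = scrapy.Field()
    full_price = scrapy.Field()
    price = scrapy.Field()
    currency = scrapy.Field()
    image_url = scrapy.Field()
    brand = scrapy.Field()
    product_category1 = scrapy.Field()
    product_category2 = scrapy.Field()
    product_category3 = scrapy.Field()
    product_name = scrapy.Field()


class PdpItem(PlpItem):
    # The PDP structure adds detail-level fields on top of the PLP structure
    product_description = scrapy.Field()
    product_size = scrapy.Field()
    product_color = scrapy.Field()
    additional_info = scrapy.Field()
```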
44 | # How to create an e-commerce PLP scraper
45 | Follow this step-by-step guide to create the scraper code. Before doing so, be sure you have read and executed all the commands in the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule. A minimal spider sketch follows this list.
46 | - **Create a Scrapy project**: create a Scrapy project named after the website. Follow the file structure described in the [scrapy.mdc](mdc:.cursor/rules/scrapy.mdc) file
47 | - **Download the HTML of the home page**: download the home page of the website to the file homepage.html, using the full path retrieved before
48 | - **Read the HTML of the home page**: after downloading the home page HTML, read the file homepage.html to understand where the URLs of the product categories are located
49 | - **Ask in chat for the URL of a category**: after reading the homepage.html file, ask in chat for the URL of a product category.
50 | - **Write the parse method**: write the XPATH selectors needed to extract the URL given in chat and all similar URLs. Do not extract only the URL passed, but all the URLs at the same level of hierarchy in the HTML. For each URL, call a method named parse_category
51 | - **Download the HTML of the product category page**: download the HTML of the product category page to a file called product_category.html in the full path retrieved in the previous steps
52 | - **Interpret the HTML file**: strip all the CSS code from the file. Check whether there is a complete JSON inside the HTML page that can be used instead of parsing the HTML. Write in the chat whether you found it or not.
53 | - **Process the JSON, if found**: if a JSON is found, fill the output fields with the values extracted from it. Use the json Python package.
54 | - **Read the product category HTML**: if no JSON is found, read the file product_category.html to find where the data needed to fill all the output fields is located
55 | - **Write the XPATH selectors of the items**: if no JSON is found, write the XPATH selectors needed to extract the information for all the fields of the output items, based on the HTML of the product_category.html file
56 | - **Write the code of the parse_category method**: now you can complete the parse_category method, using the XPATH selectors or the JSON fields to retrieve the data needed for the items
57 | 
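The following sketch illustrates the parse/parse_category flow described above. It assumes the PlpItem class from the items.py sketch earlier and a hypothetical project module name; the start URL and all XPath expressions are placeholders that must be replaced with selectors derived from the target site's HTML:

```python
from datetime import datetime

import scrapy

from myproject.items import PlpItem  # hypothetical project/module name


class ExamplePlpSpider(scrapy.Spider):
    name = "example_plp"
    start_urls = ["https://www.example.com/"]  # placeholder home page

    def parse(self, response):
        # Extract all category URLs at the same level of hierarchy as the one given in chat
        for href in response.xpath('//nav//a[@class="category-link"]/@href').getall():  # placeholder XPath
            yield response.follow(href, callback=self.parse_category)

    def parse_category(self, response):
        # One item per product card on the listing page (placeholder selectors)
        for product in response.xpath('//div[@class="product-item"]'):
            item = PlpItem()
            item["website_name"] = "EXAMPLE"  # fixed value per scraper
            item["extraction_date"] = datetime.now().strftime("%Y-%m-%d")
            item["product_code"] = product.xpath('./@data-product-id').get()
            item["item_url"] = response.urljoin(product.xpath('.//a/@href').get() or "")
            item["full_price"] = product.xpath('.//span[@class="full-price"]/text()').get()
            item["price"] = product.xpath('.//span[@class="price"]/text()').get()
            item["currency"] = "EUR"  # detect from the HTML in a real spider
            item["image_url"] = product.xpath('.//img/@src').get()
            item["brand"] = product.xpath('.//span[@class="brand"]/text()').get()
            item["product_category1"] = response.xpath('//ol[@class="breadcrumb"]/li[1]/a/text()').get()
            item["product_category2"] = response.xpath('//ol[@class="breadcrumb"]/li[2]/a/text()').get()
            item["product_category3"] = response.xpath('//ol[@class="breadcrumb"]/li[3]/a/text()').get()
            item["product_name"] = product.xpath('.//span[@class="product-name"]/text()').get()
            yield item
```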
58 | # How to create an e-commerce PDP scraper
59 | Follow this step-by-step guide to create the scraper code.
60 | - **Create a Scrapy project**: create a Scrapy project named after the website. Follow the file structure described in the @scrapy.mdc file
61 | - **Download the HTML of a PDP page**: download the HTML of a PDP page of the website to the file pdp_page.html, using the full path retrieved before
62 | - **Read the HTML of the PDP page**: after downloading the PDP page HTML, read the file pdp_page.html to understand where the information needed to fill the fields of the e-commerce PDP data structure is located
63 | - **Write the parse_product method**: write the XPATH selectors needed to extract the information. For each URL, yield an item in output
64 | - **Create the final structure**: the e-commerce PDP scraper receives its PDP pages from a file called input.txt. When asked to write a PDP scraper, first load the URLs contained in the input.txt file into a list called urls, then iterate over it. For each URL, call the method parse_product, where you will write the selectors and yield the items
65 | 
66 | # Suggested settings
67 | Here's a list of default values to use in the settings.py file:
68 | - ROBOTSTXT_OBEY = False
69 | - DEFAULT_REQUEST_HEADERS = {
70 |     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
71 |     "accept-language": "en-US,en;q=0.8",
72 |     "priority": "u=0, i",
73 |     "sec-ch-ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
74 |     "sec-ch-ua-mobile": "?0",
75 |     "sec-ch-ua-platform": "\"macOS\"",
76 |     "sec-fetch-dest": "document",
77 |     "sec-fetch-mode": "navigate",
78 |     "sec-fetch-site": "none",
79 |     "sec-fetch-user": "?1",
80 |     "sec-gpc": "1",
81 |     "upgrade-insecure-requests": "1"
82 | }
83 | - CONCURRENT_REQUESTS = 4
84 | - USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
85 | 
86 | # Anti-bot countermeasures
87 | - If Akamai is protecting the website, enable scrapy_impersonate on the scraper using these custom options:
88 | custom_settings = {
89 |     "DOWNLOAD_HANDLERS": {
90 |         "http": "scrapy_impersonate.ImpersonateDownloadHandler",
91 |         "https": "scrapy_impersonate.ImpersonateDownloadHandler",
92 |     },
93 |     "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
94 | }
95 | and then add 'impersonate': 'chrome110' to the meta values of each request.
96 | 
--------------------------------------------------------------------------------
/cursor-rules/scrapy-step-by-step-process.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides a step-by-step guide to follow for a successful Scrapy project. Read and implement the rules contained in this file first.
8 | globs: **/*.py
9 | ---
10 | 
11 | # Scrapy step-by-step guide for a successful scraper creation
12 | This document contains a step-by-step guide for a successful Scrapy scraper creation. It is the first rule file that must be considered, read and executed.
13 | Here is the list of actions to take for a successful scraper creation:
14 | - Read and execute the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule file.
15 | - Read and execute the [website-analysis.mdc](mdc:.cursor/rules/website-analysis.mdc) rule on the URL passed.
16 | - Read the best practices for creating a Scrapy spider contained in the file @scrapy.mdc.
17 | - Once requested to create a scraper, follow the instructions in the file [scraper-models.mdc](mdc:.cursor/rules/scraper-models.mdc) for the scraper type requested.
18 | 
19 | 
--------------------------------------------------------------------------------
/cursor-rules/scrapy.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides comprehensive best practices for Scrapy development, including code organization, performance, security, testing, and common pitfalls to avoid. It aims to guide developers in building robust, efficient, and maintainable web scraping applications with Scrapy.
8 | globs: **/*.py
9 | ---
10 | # Scrapy Best Practices
11 | 
12 | This document outlines the recommended best practices for developing Scrapy web scraping applications.
Following these guidelines will help you create robust, efficient, secure, and maintainable scrapers. 13 | 14 | Before start developing a Scrapy spider, follow the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule. 15 | 16 | Only after reading the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) files and executing the steps included in it, you can start building the Scrapy spider. 17 | 18 | When asked to build a Scrapy spider, read the [scraper-models.mdc](mdc:.cursor/rules/scraper-models.mdc) rule for understanding the type of scraper to create. 19 | 20 | Never change the starting URL of the scraper, even in case of errors during the process, but always use the one given by the user. 21 | 22 | ## 1. Code Organization and Structure 23 | 24 | ### 1.1. Directory Structure 25 | 26 | - **Project Root:** Contains `scrapy.cfg`, project directory, and any `README.md`, `LICENSE`, or other project-level files. 27 | - **Project Directory (e.g., `my_project`):** 28 | - `__init__.py`: Marks the directory as a Python package. 29 | - `items.py`: Defines the data structures (Scrapy Items) for the scraped data. 30 | - `middlewares.py`: Contains the Scrapy middleware components, used for request/response processing. 31 | - `pipelines.py`: Defines the data processing pipelines, used for cleaning, validating, and storing the scraped data. 32 | - `settings.py`: Configures the Scrapy project, including settings for pipelines, middleware, concurrency, etc. 33 | - `spiders/`: 34 | - `__init__.py`: Marks the directory as a Python package. 35 | - `my_spider.py`: Contains the spider definitions (Scrapy Spiders) responsible for crawling and scraping data. 36 | 37 | Example: 38 | 39 | 40 | my_project/ 41 | ├── scrapy.cfg 42 | ├── my_project/ 43 | │ ├── __init__.py 44 | │ ├── items.py 45 | │ ├── middlewares.py 46 | │ ├── pipelines.py 47 | │ ├── settings.py 48 | │ └── spiders/ 49 | │ ├── __init__.py 50 | │ └── my_spider.py 51 | └── README.md 52 | 53 | 54 | ### 1.2. File Naming Conventions 55 | 56 | - **Spider Files:** `spider_name.py` (e.g., `product_spider.py`, `news_spider.py`) 57 | - **Item Files:** `items.py` (standard naming) 58 | - **Middleware Files:** `middlewares.py` (standard naming) 59 | - **Pipeline Files:** `pipelines.py` (standard naming) 60 | - **Settings Files:** `settings.py` (standard naming) 61 | 62 | ### 1.3. Module Organization 63 | 64 | - **Small Projects:** All spiders can reside in the `spiders/` directory. 65 | - **Large Projects:** Consider organizing spiders into subdirectories based on the target website or data type (e.g., `spiders/news/`, `spiders/ecommerce/`). 66 | - **Custom Modules:** Create custom modules (e.g., `utils/`, `lib/`) for reusable code, helper functions, and custom classes. 67 | 68 | ### 1.4. Component Architecture 69 | 70 | - **Spiders:** Focus on crawling and extracting raw data. 71 | - **Items:** Define the structure of the scraped data. 72 | - **Pipelines:** Handle data cleaning, validation, transformation, and storage. 73 | - **Middleware:** Manage request/response processing, error handling, and proxy management. 74 | 75 | ### 1.5. Code Splitting 76 | 77 | - **Separate Concerns:** Keep spiders lean and focused on crawling logic. Move data processing and storage to pipelines. 78 | - **Reusable Components:** Extract common functionality (e.g., custom item loaders, helper functions) into separate modules or classes. 79 | - **Configuration:** Use `settings.py` to manage project-wide configuration and avoid hardcoding values in spiders. 
80 | - **Modular Middleware:** Create small, focused middleware components for specific tasks (e.g., user agent rotation, request retries).
81 | 
--------------------------------------------------------------------------------
/cursor-rules/website-analysis.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides a step-by-step guide to analyze a website and its code, in order to write a better Scrapy scraper.
8 | globs: **/*.py
9 | ---
10 | 
11 | # How to perform a website analysis before writing the code of a Scrapy scraper
12 | - From the conversation, please identify the type of Scrapy spider that should be built (e.g. e-commerce PLP, e-commerce PDP, etc.) and confirm it by writing it in the chat
13 | - Fetch the home page HTML and store it in the absolute path of the project, naming the file homepage.html, and store the cookies in the cookies.txt file
14 | - Using the MCP tool strip_css, strip the CSS from the file homepage.html and save the new version to the file homepage_stripped.html
15 | - Read the file cookies.txt and look for traces of anti-bot software, like Akamai, Datadome, PerimeterX and so on. For every cookie name, check whether it can be attributed to an anti-bot solution.
16 | - If Akamai is found among the anti-bots, you'll need to add the scrapy_impersonate package when creating your scraper to make it work
17 | - If Datadome or Kasada are found among the anti-bots, stop the process
18 | - If you're asked to create an e-commerce PLP scraper, follow these steps:
19 |     - Ask for a product category URL and fetch its HTML, saving it in the absolute path of the project with the name category.html. Store the cookies in the cookies_category.txt file.
20 |     - Using the MCP tool strip_css, strip the CSS from the file category.html and save the new version to the file category_stripped.html
21 |     - Read the file homepage_stripped.html and look for any well-formatted JSON you can use to get all the product category URLs, including the one just passed. Look for schema.org but also for common frameworks like Next.js. Be careful: schema.org JSON does not include the full price of a product before discounts, so you'll need to look for it elsewhere.
22 |     - Read the file category_stripped.html and look for any well-formatted JSON you can use to read the product details of every product on the category page. If there is one, save this JSON in a file called catalog.json
23 | - If you're asked to create a PDP scraper, follow these steps:
24 |     - Ask for a product page and fetch its HTML, saving it in the absolute path of the project with the name product.html. Store the cookies in the cookies_product.txt file.
25 |     - Using the MCP tool strip_css, strip the CSS from the file product.html and save the new version to the file product_stripped.html
26 |     - Read the file product_stripped.html and look for any well-formatted JSON you can use to read the details of the product on the page. If there is one, save this JSON in a file called product.json. Look for schema.org but also for common frameworks like Next.js. Be careful: schema.org JSON does not include the full price of a product before discounts, so you'll need to look for it elsewhere.
27 | 
--------------------------------------------------------------------------------
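As a practical companion to the website-analysis rule above, here is a hedged sketch of its two manual checks: scanning the saved cookies for names commonly associated with anti-bot vendors, and pulling schema.org JSON-LD blocks out of a stripped HTML file. The cookie-name fragments are indicative defaults often seen for these vendors, not an exhaustive or guaranteed mapping, and the file names assume the conventions used in the rule (cookies.txt written as JSON by fetch_page_content, homepage_stripped.html produced by strip_css):

```python
import json
import re

# Cookie-name fragments commonly associated with anti-bot vendors (indicative, not exhaustive)
ANTIBOT_COOKIE_HINTS = {
    "Akamai": ["_abck", "bm_sz", "ak_bmsc", "bm_sv"],
    "Datadome": ["datadome"],
    "PerimeterX": ["_px"],
    "Kasada": ["kp_uid"],
}


def detect_antibot(cookies_file: str) -> set:
    """Return the anti-bot vendors whose typical cookie names appear in the saved cookies."""
    with open(cookies_file, "r", encoding="utf-8") as f:
        cookies = json.load(f)  # list of cookie dicts as saved by fetch_page_content
    names = [c.get("name", "").lower() for c in cookies]
    found = set()
    for vendor, hints in ANTIBOT_COOKIE_HINTS.items():
        if any(hint in name for hint in hints for name in names):
            found.add(vendor)
    return found


def extract_json_ld(html_file: str) -> list:
    """Collect schema.org JSON-LD blocks from a (stripped) HTML file."""
    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()
    blocks = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    data = []
    for block in blocks:
        try:
            data.append(json.loads(block))
        except json.JSONDecodeError:
            pass  # ignore malformed blocks
    return data


if __name__ == "__main__":
    print("Anti-bot hints:", detect_antibot("cookies.txt"))
    print("JSON-LD blocks found:", len(extract_json_ld("homepage_stripped.html")))
```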