├── LICENSE
├── MCPfiles
│   ├── Camoufox_template.py
│   └── xpath_server.py
├── README.md
└── cursor-rules
    ├── prerequisites.mdc
    ├── scraper-models.mdc
    ├── scrapy-step-by-step-process.mdc
    ├── scrapy.mdc
    └── website-analysis.mdc
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 The Web Scraping Club
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MCPfiles/Camoufox_template.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Template scraper: the MCP tool write_camoufox_scraper reads this file and adapts it to the target website.
3 | # When writing the scraper, modify the output field list to match the template you're using.
4 | from camoufox.sync_api import Camoufox
5 | import time
6 | from random import randrange
7 | import random
8 | from scrapy.http import HtmlResponse
9 | import csv
10 | from datetime import datetime
11 | 
12 | 
13 | def scroll_down(page):
14 |     """A method for scrolling the page."""
15 |     # Scroll down in fixed steps, pausing a few seconds between wheel events.
16 |     i = 0
17 |     while i < 20:
18 |         page.mouse.wheel(0, 20000)
19 |         interval = randrange(3, 5)
20 |         time.sleep(interval)
21 |         print(i)
22 |         i = i + 1
23 | 
24 | 
25 | def accept_cookies(page):
26 |     try:
27 |         page.locator('xpath=//button[@id="onetrust-accept-btn-handler"]').click()
28 |     except Exception:
29 |         pass
30 | url = "https://REPLACE-WITH-START-URL"  # placeholder: set the listing page URL before running this template
31 | with Camoufox(humanize=True,
32 |               geoip=True
33 |               ) as browser:
34 |     page = browser.new_page()
35 |     page.goto(url)
36 |     page.wait_for_load_state()
37 |     page.locator('xpath=//button[@id="collection-button"]').click()
38 |     interval = randrange(5, 10)
39 |     time.sleep(interval)
40 |     accept_cookies(page)
41 | 
42 |     html_page = page.content()
43 |     response_sel = HtmlResponse(url=page.url, body=html_page, encoding='utf-8')
44 |     articles = response_sel.xpath('//div[@class="product-item"]')
45 |     print(len(articles))
46 |     with open("output.txt", "a") as file:
47 |         csv_file = csv.writer(file, delimiter="|")
48 | 
49 |         for product in articles:
50 |             price = product.xpath('.//span[contains(@class, "price")]/text()').extract()[0].strip()
51 |             fullprice = price
52 |             product_code = product.xpath('.//div[@class="product-item-meta"]/@id').extract()[0].split('-')[-1]
53 |             print(product_code)
54 |             currency = 'EUR'
55 |             country = 'ITA'
56 |             product_url = "https://www.hermes.com" + product.xpath('.//a/@href').extract()[0].strip()
57 |             brand = website = 'HERMES'
58 |             #print(brand)
59 |             date = (datetime.now()).strftime("%Y%m%d")
60 |             #print(date)
61 |             try:
62 |                 gender = response_sel.xpath('//span[@class="header-title-parent"]/text()').extract()[0]
63 |             except Exception:
64 |                 gender = 'n.a.'
65 |             try:
66 |                 category = response_sel.xpath('//span[@class="header-result-title"]/text()').extract()[0]
67 |             except Exception:
68 |                 category = 'n.a.'
69 | 
70 |             #print(product.xpath('.//figure').extract()[0])
71 |             imageurl = "https:" + product.xpath('.//img/@src').extract()[0]
72 |             title = product.xpath('.//span[@class="product-item-name"]/text()').extract()[0].strip()
73 |             csv_file.writerow([product_code, gender, brand, category, fullprice, price, currency, country, date, product_url, imageurl, title, website])
74 |     # no explicit file.close() needed: the with-block closes the file
75 | 
--------------------------------------------------------------------------------
/MCPfiles/xpath_server.py:
--------------------------------------------------------------------------------
1 | from mcp.server.fastmcp import FastMCP
2 | import asyncio
3 | from camoufox.async_api import AsyncCamoufox
4 | import time
5 | import os
6 | import json
7 | import re
8 | 
9 | mcp = FastMCP("Scrapy XPath Generator")
10 | CAMOUFOX_FILE_PATH = "/PATH/TO/camoufox_template.py"
11 | HTML_FILE_PATH = "/PATH/TO/saved_page.html"  # placeholder: where fetch_page_content is expected to save the HTML (used by generate_xpaths)
12 | @mcp.tool()
13 | async def fetch_page_content(url: str, html_file_path: str, cookies_file_path: str) -> str:
14 |     """Fetch page HTML using the Camoufox stealth browser. Save the HTML code in the path specified."""
15 |     global latest_html
16 | 
17 |     print(f"[DEBUG] Fetching URL: {url}")
18 |     try:
19 |         async with AsyncCamoufox(humanize=True) as browser:
20 |             page = await browser.new_page()
21 |             await page.goto(url)
22 |             await asyncio.sleep(10)  # non-blocking wait for late-loading content
23 |             latest_html = await page.content()
24 |             cookies = await page.context.cookies()
25 |             with open(html_file_path, "w", encoding="utf-8") as f:
26 |                 f.write(latest_html)
27 |             with open(cookies_file_path, "w", encoding="utf-8") as f:
28 |                 json.dump(cookies, f, indent=2)
29 |             print("[DEBUG] HTML stored for later use")
30 |             return "HTML fetched and stored successfully."
31 |     except Exception as e:
32 |         print(f"[ERROR] {e}")
33 |         return f"Error fetching page: {str(e)}"
34 | 
35 | @mcp.tool()
36 | def generate_xpaths(template: str) -> dict:
37 |     """Write XPATH selectors for the requested fields using the downloaded HTML file."""
38 | 
39 |     if not os.path.exists(HTML_FILE_PATH):
40 |         return {"error": "No HTML file found. Run fetch_page_content() first."}
41 | 
42 |     if template.lower() == "plp":
43 |         fields = "product title, product link, product price, product image, product code"
44 |     elif template.lower() == "pdp":
45 |         fields = "product title, product price, product description, product image, product color, product size, product code"
46 |     else:
47 |         return {"error": "Unknown template type"}
48 | 
49 |     # Return the requested fields so Cursor can analyze the stored HTML and write the selectors
50 |     return {
51 |         "message": "Print the XPath expressions for the requested fields using the variable latest_html.",
52 |         "requested_fields": fields
53 |     }
54 | 
55 | @mcp.tool()
56 | def write_camoufox_scraper(template: str, url: str, html_file_path: str) -> dict:
57 |     """Reads the file Camoufox_template.py and uses it to write a new Camoufox scraper with the requested fields, starting from the given URL. Save the HTML code in the path specified."""
58 |     print(f"[DEBUG] Writing scraper for template: {template} and URL: {url}. Saving the file in the path {html_file_path}")
59 |     with open(CAMOUFOX_FILE_PATH, "r", encoding="utf-8") as f:
60 |         template_code = f.read()
61 |     return {"message": "Using this template, write a working scraper with the requested fields and starting URL", "template_code": template_code}
62 | 
63 | @mcp.tool()
64 | def strip_css(html_input_file: str, html_output_file: str) -> dict:
65 |     # Read the HTML file
66 |     with open(html_input_file, 'r', encoding='utf-8') as file:
67 |         html_content = file.read()
68 | 
69 |     # Remove style tags and their content
70 |     html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)
71 | 
72 |     # Remove CSS emotion attributes
73 |     html_content = re.sub(r'data-emotion="css[^"]*"', '', html_content)
74 | 
75 |     # Remove class attributes with CSS references
76 |     html_content = re.sub(r'class="css-[^"]*"', '', html_content)
77 | 
78 |     # Write the cleaned HTML to a new file
79 |     with open(html_output_file, 'w', encoding='utf-8') as file:
80 |         file.write(html_content)
81 | 
82 |     return {"message": f"CSS stripped successfully. New file created: {html_output_file}"}
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     # Initialize and run the server
87 |     mcp.run(transport='stdio')
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Cursor-Scraping-Assistant
2 | 
3 | A powerful tool that leverages Cursor AI and MCP (Model Context Protocol) to easily generate web scrapers for various types of websites. This project helps you quickly analyze websites and generate proper Scrapy or Camoufox scrapers with minimal effort.
4 | 
5 | ## Project Overview
6 | 
7 | This project contains two main components:
8 | 
9 | 1. **Cursor Rules** - A set of rules that teach Cursor AI how to analyze websites and create different types of Scrapy spiders
10 | 2.
**MCP Tools** - A collection of Model Context Protocol tools that enhance Cursor's capabilities for web scraping tasks 11 | 12 | ## Prerequisites 13 | 14 | - [Cursor AI](https://cursor.sh/) installed 15 | - Python 3.10+ installed 16 | - Basic knowledge of web scraping concepts 17 | 18 | ## Installation 19 | 20 | Clone this repository to your local machine: 21 | 22 | ```bash 23 | git clone https://github.com/TheWebScrapingClub/AI-Cursor-Scraping-Assistant.git 24 | cd AI-Cursor-Scraping-Assistant 25 | ``` 26 | 27 | Install the required dependencies: 28 | 29 | ```bash 30 | pip install mcp camoufox scrapy 31 | ``` 32 | 33 | If you plan to use Camoufox, you'll need to fetch its browser binary: 34 | 35 | ```bash 36 | python -m camoufox fetch 37 | ``` 38 | 39 | ## Setup 40 | 41 | ### Setting Up MCP Server 42 | 43 | The MCP server provides tools that help Cursor AI analyze web pages and generate XPath selectors. To start the MCP server: 44 | 45 | 1. Navigate to the MCPfiles directory: 46 | ```bash 47 | cd MCPfiles 48 | ``` 49 | 50 | 2. Update the `CAMOUFOX_FILE_PATH` in `xpath_server.py` to point to your local `Camoufox_template.py` file. 51 | 52 | 3. Start the MCP server: 53 | ```bash 54 | python xpath_server.py 55 | ``` 56 | 57 | 4. In Cursor, connect to the MCP server by configuring it in the settings or using the MCP panel. 58 | 59 | ### Cursor Rules 60 | 61 | The cursor-rules directory contains rules that teach Cursor AI how to analyze websites and create different types of scrapers. These rules are automatically loaded when you open the project in Cursor. 62 | 63 | ## Detailed Cursor Rules Explanation 64 | 65 | The `cursor-rules` directory contains a set of MDC (Markdown Configuration) files that guide Cursor's behavior when creating web scrapers: 66 | 67 | ### `prerequisites.mdc` 68 | This rule handles initial setup tasks before creating any scrapers: 69 | - Gets the full path of the current project using `pwd` 70 | - Stores the path in context for later use by other rules 71 | - Confirms the execution of preliminary actions before proceeding 72 | 73 | ### `website-analysis.mdc` 74 | This comprehensive rule guides Cursor through website analysis: 75 | - Identifies the type of Scrapy spider to build (PLP, PDP, etc.) 76 | - Fetches and stores homepage HTML and cookies 77 | - Strips CSS using the MCP tool to simplify HTML analysis 78 | - Checks cookies for anti-bot protection (Akamai, Datadome, PerimeterX, etc.) 
79 | - For PLP scrapers: fetches category pages, analyzes structure, looks for JSON data 80 | - For PDP scrapers: fetches product pages, analyzes structure, looks for JSON data 81 | - Detects schema.org markup and modern frameworks like Next.js 82 | 83 | ### `scrapy-step-by-step-process.mdc` 84 | This rule provides the execution flow for creating scrapers: 85 | - Outlines the sequence of steps to follow 86 | - References other rule files in the correct order 87 | - Ensures prerequisite actions are completed before scraper creation 88 | - Guides Cursor to analyze the website before generating code 89 | 90 | ### `scrapy.mdc` 91 | This extensive rule contains Scrapy best practices: 92 | - Defines recommended code organization and directory structure 93 | - Details file naming conventions and module organization 94 | - Provides component architecture guidelines 95 | - Offers strategies for code splitting and reuse 96 | - Includes performance optimization recommendations 97 | - Covers security practices, error handling, and logging 98 | - Provides specific syntax examples and code snippets 99 | 100 | ### `scraper-models.mdc` 101 | This rule defines the different types of scrapers that can be created: 102 | - **E-commerce PLP**: Details the data structure, field definitions, and implementation steps 103 | - **E-commerce PDP**: Details the data structure, field definitions, and implementation steps 104 | - Field mapping guidelines for all scraper types 105 | - Step-by-step instructions for creating each type of scraper 106 | - Default settings recommendations 107 | - Anti-bot countermeasures for different protection systems 108 | 109 | ## Usage 110 | 111 | Here's how to use the AI-Cursor-Scraping-Assistant: 112 | 113 | 1. Open the project in Cursor AI 114 | 2. Make sure the MCP server is running 115 | 3. Ask Cursor to create a scraper with a prompt like: 116 | ``` 117 | Write an e-commerce PLP scraper for the website gucci.com 118 | ``` 119 | 120 | Cursor will then: 121 | 1. Analyze the website structure 122 | 2. Check for anti-bot protection 123 | 3. Extract the relevant HTML elements 124 | 4. Generate a complete Scrapy spider based on the website type 125 | 126 | ## Available Scraper Types 127 | 128 | You can request different types of scrapers: 129 | 130 | - **E-commerce PLP (Product Listing Page)** - Scrapes product catalogs/category pages 131 | - **E-commerce PDP (Product Detail Page)** - Scrapes detailed product information 132 | 133 | For example: 134 | ``` 135 | Write an e-commerce PDP scraper for nike.com 136 | ``` 137 | 138 | ## Advanced Usage 139 | 140 | ### Camoufox Integration 141 | 142 | The project includes a Camoufox template for creating stealth scrapers that can bypass certain anti-bot measures. The MCP tools help you: 143 | 144 | 1. Fetch page content using Camoufox 145 | 2. Generate XPath selectors for the desired elements 146 | 3. Create a complete Camoufox scraper based on the template 147 | 148 | ### Custom Scrapers 149 | 150 | You can extend the functionality by adding new scraper types to the cursor-rules files. The modular design allows for easy customization. 
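### Registering the MCP Server via mcp.json

If you prefer a file-based setup to the MCP panel, Cursor can also load MCP servers from an `mcp.json` configuration. The snippet below is a minimal sketch; the exact file location (a project-level `.cursor/mcp.json` or the global Cursor configuration) and the Python command path are assumptions you should adapt to your environment:

```json
{
  "mcpServers": {
    "xpath-generator": {
      "command": "python",
      "args": ["/absolute/path/to/AI-Cursor-Scraping-Assistant/MCPfiles/xpath_server.py"]
    }
  }
}
```

Because `xpath_server.py` uses the stdio transport, Cursor can launch and manage the server process itself once the entry is configured.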
151 | 152 | ## Project Structure 153 | 154 | ``` 155 | AI-Cursor-Scraping-Assistant/ 156 | ├── MCPfiles/ 157 | │ ├── xpath_server.py # MCP server with web scraping tools 158 | │ └── Camoufox_template.py # Template for Camoufox scrapers 159 | ├── cursor-rules/ 160 | │ ├── website-analysis.mdc # Rules for analyzing websites 161 | │ ├── scrapy.mdc # Best practices for Scrapy 162 | │ ├── scrapy-step-by-step-process.mdc # Guide for creating scrapers 163 | │ ├── scraper-models.mdc # Templates for different scraper types 164 | │ └── prerequisites.mdc # Setup requirements 165 | └── README.md 166 | ``` 167 | 168 | ## TODO: Future Enhancements 169 | 170 | The following features are planned for future development: 171 | 172 | ### Proxy Integration 173 | - Add proxy support when requested by the operator 174 | - Implement proxy rotation strategies 175 | - Support for different proxy providers 176 | - Handle proxy authentication 177 | - Integrate with popular proxy services 178 | 179 | ### Improved XPath Generation and Validation 180 | - Add validation mechanisms for generated XPath selectors 181 | - Implement feedback loop for selector refinement 182 | - Control flow management for reworking selectors 183 | - Auto-correction of problematic selectors 184 | - Handle edge cases like dynamic content and AJAX loading 185 | 186 | ### Other Planned Features 187 | - Support for more scraper types (news sites, social media, etc.) 188 | - Integration with additional anti-bot bypass techniques 189 | - Enhanced JSON extraction capabilities 190 | - Support for more complex navigation patterns 191 | - Multi-page scraping optimizations 192 | 193 | ## References 194 | 195 | This project is based on articles from The Web Scraping Club: 196 | 197 | - [Claude & Cursor AI Scraping Assistant](https://substack.thewebscraping.club/p/claude-cursor-ai-scraping-assistant) 198 | - [Cursor MCP Web Scraping Assistant](https://substack.thewebscraping.club/p/cursor-mcp-web-scraping-assistant) 199 | 200 | For more information on web scraping techniques and best practices, visit [The Web Scraping Club](https://thewebscrapingclub.com). 201 | 202 | ## Contributing 203 | 204 | Contributions are welcome! Please feel free to submit a Pull Request. 205 | 206 | ## License 207 | 208 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /cursor-rules/prerequisites.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | --- 7 | description: This rule provides the action that should be taken before starting implementing a Scrapy spider. 8 | globs: **/*.py 9 | --- 10 | # Step by step prerequisites actions to perform 11 | - Get with the pwd command the full path of the current project. Store in the context the current full path of the project. After you read it, please confirm in chat you did this step. 12 | - Once you performed every prerequisite, write in chat that you performed all the prerequisite actions. 13 | 14 | 15 | -------------------------------------------------------------------------------- /cursor-rules/scraper-models.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | --- 7 | description: This rule provides the description of the possible scraper types that can be created. 
8 | globs: **/*.py
9 | ---
10 | # Scraper types
11 | Here's a list of the possible scraper types and their data structures.
12 | 
13 | ## E-commerce PLP
14 | - The structure of the items in a PLP scraper is the following: website_name, extraction_date, product_code, item_url, full_price, price, currency, image_url, brand, product_category1, product_category2, product_category3, product_name
15 | - When asked to create an e-commerce PLP scraper, set up the items.py file and the scraper output according to this data structure (a minimal items.py sketch is shown after the field-mapping rules below).
16 | - PLP pages are the product list pages of an e-commerce site, also called catalogue pages. An e-commerce PLP scraper should crawl the whole product catalogue without entering the product detail pages.
17 | - The scraper will usually start from a home page.
18 | 
19 | 
20 | ## E-commerce PDP
21 | - The structure of the items in a PDP scraper is the following: website_name, extraction_date, product_code, item_url, full_price, price, currency, image_url, brand, product_category1, product_category2, product_category3, product_name, product_description, product_size, product_color, additional_info
22 | - When asked to create an e-commerce PDP scraper, set up the items.py file and the scraper output according to this data structure.
23 | - PDP pages are product detail pages, the final leaf of an e-commerce website. An e-commerce PDP scraper takes a list of PDP URLs as input and doesn't need to crawl the website further.
24 | 
25 | 
26 | 
27 | # How to fill the scraper fields with values
28 | - **When asked to map a field of a data structure to the information contained in the HTML, use the following rules:**
29 | - **website_name**: a fixed value for each scraper, usually the website's name in upper case. If in doubt, ask the operator
30 | - **extraction_date**: fixed value for the whole execution, in YYYY-MM-DD format. Use the datetime library
31 | - **product_code**: the code that identifies every single product on the website
32 | - **item_url**: URL of the page containing the details of the product. For the PDP data structure, it corresponds to response.url
33 | - **full_price**: price before discounts. If there's no discount on the item, it's the selling price.
34 | - **price**: final selling price after discounts. If there's no discount, it's the selling price.
35 | - **currency**: ISO3 code for the currency, a fixed value for the whole scraper. Detect the currency from the HTML and use the ISO code to populate the field
36 | - **brand**: brand or producer of the product sold on the website
37 | - **product_category1**: first level of product categorization, usually the first level of the page's breadcrumb, if any.
38 | - **product_category2**: second level of product categorization, usually the second level of the page's breadcrumb, if any.
39 | - **product_category3**: third level of product categorization, usually the third level of the page's breadcrumb, if any.
40 | - **product_name**: name of the product as shown on the pages
41 | - In any case and for any field of a scraper, do not hardcode values; always find a selector to get the correct one.
42 | - Always print in output every field of the scraper, even if it's empty.
43 | 
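A minimal items.py sketch matching the two data structures above could look like this (class names are illustrative; the PDP item simply extends the PLP one with the detail-level fields):

```python
import scrapy


class PlpItem(scrapy.Item):
    # Fields of the e-commerce PLP data structure
    website_name = scrapy.Field()
    extraction_date = scrapy.Field()
    product_code = scrapy.Field()
    item_url = scrapy.Field()
    full_price = scrapy.Field()
    price = scrapy.Field()
    currency = scrapy.Field()
    image_url = scrapy.Field()
    brand = scrapy.Field()
    product_category1 = scrapy.Field()
    product_category2 = scrapy.Field()
    product_category3 = scrapy.Field()
    product_name = scrapy.Field()


class PdpItem(PlpItem):
    # The PDP structure adds detail-level fields on top of the PLP structure
    product_description = scrapy.Field()
    product_size = scrapy.Field()
    product_color = scrapy.Field()
    additional_info = scrapy.Field()
```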
44 | # How to create an e-commerce PLP scraper
45 | Follow this step-by-step guide to create the scraper code. Before doing so, be sure you have read and executed all the commands in the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule. A minimal spider sketch follows this list.
46 | - **Create a Scrapy project**: create a Scrapy project named after the website. Follow the file structure described in the [scrapy.mdc](mdc:.cursor/rules/scrapy.mdc) file
47 | - **Download the HTML of the home page**: download the home page of the website to the file homepage.html, using the full path retrieved before
48 | - **Read the HTML of the home page**: after downloading the home page HTML, read the file homepage.html to understand where the URLs of the product categories are located
49 | - **Ask in chat for the URL of a category**: after reading the homepage.html file, ask in chat for the URL of a product category.
50 | - **Write the parse method**: write the XPATH selectors needed to extract the URL given in chat and all similar URLs. Do not extract only the URL passed, but all the URLs at the same level of hierarchy in the HTML. For each URL, call a method named parse_category
51 | - **Download the HTML of the product category page**: download the HTML of the product category page to a file called product_category.html in the full path retrieved in the previous steps
52 | - **Interpret the HTML file**: strip all the CSS code from the file. Check whether there is a complete JSON inside the HTML page that can be used instead of parsing the HTML. Write in the chat whether you found it or not.
53 | - **Process the JSON, if found**: if a JSON is found, fill the output fields with the values extracted from it. Use the json Python package.
54 | - **Read the product category HTML**: if no JSON is found, read the file product_category.html to find where the data needed to fill all the output fields is located
55 | - **Write the XPATH selectors of the items**: if no JSON is found, write the XPATH selectors needed to extract the information for all the fields of the output items, based on the HTML of the product_category.html file
56 | - **Write the code of the parse_category method**: now you can complete the parse_category method, using the XPATH selectors or the JSON fields to retrieve the data needed for the items
57 | 
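The following sketch illustrates the parse/parse_category flow described above. It assumes the PlpItem class from the items.py sketch earlier and a hypothetical project module name; the start URL and all XPath expressions are placeholders that must be replaced with selectors derived from the target site's HTML:

```python
from datetime import datetime

import scrapy

from myproject.items import PlpItem  # hypothetical project/module name


class ExamplePlpSpider(scrapy.Spider):
    name = "example_plp"
    start_urls = ["https://www.example.com/"]  # placeholder home page

    def parse(self, response):
        # Extract all category URLs at the same level of hierarchy as the one given in chat
        for href in response.xpath('//nav//a[@class="category-link"]/@href').getall():  # placeholder XPath
            yield response.follow(href, callback=self.parse_category)

    def parse_category(self, response):
        # One item per product card on the listing page (placeholder selectors)
        for product in response.xpath('//div[@class="product-item"]'):
            item = PlpItem()
            item["website_name"] = "EXAMPLE"  # fixed value per scraper
            item["extraction_date"] = datetime.now().strftime("%Y-%m-%d")
            item["product_code"] = product.xpath('./@data-product-id').get()
            item["item_url"] = response.urljoin(product.xpath('.//a/@href').get() or "")
            item["full_price"] = product.xpath('.//span[@class="full-price"]/text()').get()
            item["price"] = product.xpath('.//span[@class="price"]/text()').get()
            item["currency"] = "EUR"  # detect from the HTML in a real spider
            item["image_url"] = product.xpath('.//img/@src').get()
            item["brand"] = product.xpath('.//span[@class="brand"]/text()').get()
            item["product_category1"] = response.xpath('//ol[@class="breadcrumb"]/li[1]/a/text()').get()
            item["product_category2"] = response.xpath('//ol[@class="breadcrumb"]/li[2]/a/text()').get()
            item["product_category3"] = response.xpath('//ol[@class="breadcrumb"]/li[3]/a/text()').get()
            item["product_name"] = product.xpath('.//span[@class="product-name"]/text()').get()
            yield item
```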
58 | # How to create an e-commerce PDP scraper
59 | Follow this step-by-step guide to create the scraper code.
60 | - **Create a Scrapy project**: create a Scrapy project named after the website. Follow the file structure described in the @scrapy.mdc file
61 | - **Download the HTML of a PDP page**: download the HTML of a PDP page of the website to the file pdp_page.html, using the full path retrieved before
62 | - **Read the HTML of the PDP page**: after downloading the PDP page HTML, read the file pdp_page.html to understand where the information needed to fill the fields of the e-commerce PDP data structure is located
63 | - **Write the parse_product method**: write the XPATH selectors needed to extract the information. For each URL, yield an item in output
64 | - **Create the final structure**: the e-commerce PDP scraper receives its PDP pages from a file called input.txt. When asked to write a PDP scraper, first load the URLs contained in the input.txt file into a list called urls, then iterate over it. For each URL, call the method parse_product, where you will write the selectors and yield the items
65 | 
66 | # Suggested settings
67 | Here's a list of default values to use in the settings.py file:
68 | - ROBOTSTXT_OBEY = False
69 | - DEFAULT_REQUEST_HEADERS = {
70 |     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
71 |     "accept-language": "en-US,en;q=0.8",
72 |     "priority": "u=0, i",
73 |     "sec-ch-ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
74 |     "sec-ch-ua-mobile": "?0",
75 |     "sec-ch-ua-platform": "\"macOS\"",
76 |     "sec-fetch-dest": "document",
77 |     "sec-fetch-mode": "navigate",
78 |     "sec-fetch-site": "none",
79 |     "sec-fetch-user": "?1",
80 |     "sec-gpc": "1",
81 |     "upgrade-insecure-requests": "1"
82 | }
83 | - CONCURRENT_REQUESTS = 4
84 | - USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
85 | 
86 | # Anti-bot countermeasures
87 | - If Akamai is protecting the website, enable scrapy_impersonate on the scraper using these custom options:
88 | custom_settings = {
89 |     "DOWNLOAD_HANDLERS": {
90 |         "http": "scrapy_impersonate.ImpersonateDownloadHandler",
91 |         "https": "scrapy_impersonate.ImpersonateDownloadHandler",
92 |     },
93 |     "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
94 | }
95 | and then add 'impersonate': 'chrome110' to the meta values of each request.
96 | 
--------------------------------------------------------------------------------
/cursor-rules/scrapy-step-by-step-process.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides a step-by-step guide to follow for a successful Scrapy project. Read and implement the rules contained in this file first.
8 | globs: **/*.py
9 | ---
10 | 
11 | # Scrapy step-by-step guide for a successful scraper creation
12 | This document contains a step-by-step guide for a successful Scrapy scraper creation. It is the first rule file that must be considered, read and executed.
13 | Here is the list of actions to take for a successful scraper creation:
14 | - Read and execute the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule file.
15 | - Read and execute the [website-analysis.mdc](mdc:.cursor/rules/website-analysis.mdc) rule on the URL passed.
16 | - Read the best practices for creating a Scrapy spider contained in the file @scrapy.mdc.
17 | - Once requested to create a scraper, follow the instructions in the file [scraper-models.mdc](mdc:.cursor/rules/scraper-models.mdc) for the scraper type requested.
18 | 
19 | 
--------------------------------------------------------------------------------
/cursor-rules/scrapy.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides comprehensive best practices for Scrapy development, including code organization, performance, security, testing, and common pitfalls to avoid. It aims to guide developers in building robust, efficient, and maintainable web scraping applications with Scrapy.
8 | globs: **/*.py
9 | ---
10 | # Scrapy Best Practices
11 | 
12 | This document outlines the recommended best practices for developing Scrapy web scraping applications.
Following these guidelines will help you create robust, efficient, secure, and maintainable scrapers. 13 | 14 | Before start developing a Scrapy spider, follow the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) rule. 15 | 16 | Only after reading the [prerequisites.mdc](mdc:.cursor/rules/prerequisites.mdc) files and executing the steps included in it, you can start building the Scrapy spider. 17 | 18 | When asked to build a Scrapy spider, read the [scraper-models.mdc](mdc:.cursor/rules/scraper-models.mdc) rule for understanding the type of scraper to create. 19 | 20 | Never change the starting URL of the scraper, even in case of errors during the process, but always use the one given by the user. 21 | 22 | ## 1. Code Organization and Structure 23 | 24 | ### 1.1. Directory Structure 25 | 26 | - **Project Root:** Contains `scrapy.cfg`, project directory, and any `README.md`, `LICENSE`, or other project-level files. 27 | - **Project Directory (e.g., `my_project`):** 28 | - `__init__.py`: Marks the directory as a Python package. 29 | - `items.py`: Defines the data structures (Scrapy Items) for the scraped data. 30 | - `middlewares.py`: Contains the Scrapy middleware components, used for request/response processing. 31 | - `pipelines.py`: Defines the data processing pipelines, used for cleaning, validating, and storing the scraped data. 32 | - `settings.py`: Configures the Scrapy project, including settings for pipelines, middleware, concurrency, etc. 33 | - `spiders/`: 34 | - `__init__.py`: Marks the directory as a Python package. 35 | - `my_spider.py`: Contains the spider definitions (Scrapy Spiders) responsible for crawling and scraping data. 36 | 37 | Example: 38 | 39 | 40 | my_project/ 41 | ├── scrapy.cfg 42 | ├── my_project/ 43 | │ ├── __init__.py 44 | │ ├── items.py 45 | │ ├── middlewares.py 46 | │ ├── pipelines.py 47 | │ ├── settings.py 48 | │ └── spiders/ 49 | │ ├── __init__.py 50 | │ └── my_spider.py 51 | └── README.md 52 | 53 | 54 | ### 1.2. File Naming Conventions 55 | 56 | - **Spider Files:** `spider_name.py` (e.g., `product_spider.py`, `news_spider.py`) 57 | - **Item Files:** `items.py` (standard naming) 58 | - **Middleware Files:** `middlewares.py` (standard naming) 59 | - **Pipeline Files:** `pipelines.py` (standard naming) 60 | - **Settings Files:** `settings.py` (standard naming) 61 | 62 | ### 1.3. Module Organization 63 | 64 | - **Small Projects:** All spiders can reside in the `spiders/` directory. 65 | - **Large Projects:** Consider organizing spiders into subdirectories based on the target website or data type (e.g., `spiders/news/`, `spiders/ecommerce/`). 66 | - **Custom Modules:** Create custom modules (e.g., `utils/`, `lib/`) for reusable code, helper functions, and custom classes. 67 | 68 | ### 1.4. Component Architecture 69 | 70 | - **Spiders:** Focus on crawling and extracting raw data. 71 | - **Items:** Define the structure of the scraped data. 72 | - **Pipelines:** Handle data cleaning, validation, transformation, and storage. 73 | - **Middleware:** Manage request/response processing, error handling, and proxy management. 74 | 75 | ### 1.5. Code Splitting 76 | 77 | - **Separate Concerns:** Keep spiders lean and focused on crawling logic. Move data processing and storage to pipelines. 78 | - **Reusable Components:** Extract common functionality (e.g., custom item loaders, helper functions) into separate modules or classes. 79 | - **Configuration:** Use `settings.py` to manage project-wide configuration and avoid hardcoding values in spiders. 
80 | - **Modular Middleware:** Create small, focused middleware components for specific tasks (e.g., user agent rotation, request retries).
81 | 
--------------------------------------------------------------------------------
/cursor-rules/website-analysis.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs:
4 | alwaysApply: true
5 | ---
6 | ---
7 | description: This rule provides a step-by-step guide to analyze a website and its code, in order to write a better Scrapy scraper.
8 | globs: **/*.py
9 | ---
10 | 
11 | # How to perform a website analysis before writing the code of a Scrapy scraper
12 | - From the conversation, please identify the type of Scrapy spider that should be built (e.g. e-commerce PLP, e-commerce PDP, etc.) and confirm it by writing it in the chat
13 | - Fetch the home page HTML and store it in the absolute path of the project, naming the file homepage.html, and store the cookies in the cookies.txt file
14 | - Using the MCP tool strip_css, strip the CSS from the file homepage.html and save the new version to the file homepage_stripped.html
15 | - Read the file cookies.txt and look for traces of anti-bot software, like Akamai, Datadome, PerimeterX and so on. For every cookie name, check whether it can be attributed to an anti-bot solution.
16 | - If Akamai is found among the anti-bots, you'll need to add the scrapy_impersonate package when creating your scraper to make it work
17 | - If Datadome or Kasada are found among the anti-bots, stop the process
18 | - If you're asked to create an e-commerce PLP scraper, follow these steps:
19 |     - Ask for a product category URL and fetch its HTML, saving it in the absolute path of the project with the name category.html. Store the cookies in the cookies_category.txt file.
20 |     - Using the MCP tool strip_css, strip the CSS from the file category.html and save the new version to the file category_stripped.html
21 |     - Read the file homepage_stripped.html and look for any well-formatted JSON you can use to get all the product category URLs, including the one just passed. Look for schema.org but also for common frameworks like Next.js. Be careful: schema.org JSON does not include the full price of a product before discounts, so you'll need to look for it elsewhere.
22 |     - Read the file category_stripped.html and look for any well-formatted JSON you can use to read the product details of every product on the category page. If there is one, save this JSON in a file called catalog.json
23 | - If you're asked to create a PDP scraper, follow these steps:
24 |     - Ask for a product page and fetch its HTML, saving it in the absolute path of the project with the name product.html. Store the cookies in the cookies_product.txt file.
25 |     - Using the MCP tool strip_css, strip the CSS from the file product.html and save the new version to the file product_stripped.html
26 |     - Read the file product_stripped.html and look for any well-formatted JSON you can use to read the details of the product on the page. If there is one, save this JSON in a file called product.json. Look for schema.org but also for common frameworks like Next.js. Be careful: schema.org JSON does not include the full price of a product before discounts, so you'll need to look for it elsewhere.
27 | 
--------------------------------------------------------------------------------
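As a practical companion to the website-analysis rule above, here is a hedged sketch of its two manual checks: scanning the saved cookies for names commonly associated with anti-bot vendors, and pulling schema.org JSON-LD blocks out of a stripped HTML file. The cookie-name fragments are indicative defaults often seen for these vendors, not an exhaustive or guaranteed mapping, and the file names assume the conventions used in the rule (cookies.txt written as JSON by fetch_page_content, homepage_stripped.html produced by strip_css):

```python
import json
import re

# Cookie-name fragments commonly associated with anti-bot vendors (indicative, not exhaustive)
ANTIBOT_COOKIE_HINTS = {
    "Akamai": ["_abck", "bm_sz", "ak_bmsc", "bm_sv"],
    "Datadome": ["datadome"],
    "PerimeterX": ["_px"],
    "Kasada": ["kp_uid"],
}


def detect_antibot(cookies_file: str) -> set:
    """Return the anti-bot vendors whose typical cookie names appear in the saved cookies."""
    with open(cookies_file, "r", encoding="utf-8") as f:
        cookies = json.load(f)  # list of cookie dicts as saved by fetch_page_content
    names = [c.get("name", "").lower() for c in cookies]
    found = set()
    for vendor, hints in ANTIBOT_COOKIE_HINTS.items():
        if any(hint in name for hint in hints for name in names):
            found.add(vendor)
    return found


def extract_json_ld(html_file: str) -> list:
    """Collect schema.org JSON-LD blocks from a (stripped) HTML file."""
    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()
    blocks = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    data = []
    for block in blocks:
        try:
            data.append(json.loads(block))
        except json.JSONDecodeError:
            pass  # ignore malformed blocks
    return data


if __name__ == "__main__":
    print("Anti-bot hints:", detect_antibot("cookies.txt"))
    print("JSON-LD blocks found:", len(extract_json_ld("homepage_stripped.html")))
```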