├── .gitignore ├── .vscode └── launch.json ├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yml ├── docs └── crawl4ai │ ├── 1 getting started - crawl4ai documentation.md │ ├── 10 user simulation - crawl4ai documentation.md │ ├── 111 json css - crawl4ai documentation.md │ ├── 112 llm strategy - crawl4ai documentation.md │ ├── 113 cosine strategy - crawl4ai documentation.md │ ├── 12 session crawling - crawl4ai documentation.md │ ├── 13 text chunking - crawl4ai documentation.md │ ├── 14 custom workflows - crawl4ai documentation.md │ ├── 2 advanced features - crawl4ai documentation.md │ ├── 3 browser setup - crawl4ai documentation.md │ ├── 4 proxy settings - crawl4ai documentation.md │ ├── 5 dynamic content - crawl4ai documentation.md │ ├── 6 magic mode - crawl4ai documentation.md │ ├── 7 content cleaning - crawl4ai documentation.md │ ├── 8 media handling - crawl4ai documentation.md │ ├── 9 link analysis - crawl4ai documentation.md │ ├── asyncwebcrawler - crawl4ai documentation.md │ ├── asyncwebcrawlerarun - crawl4ai documentation.md │ ├── browser configuration - crawl4ai documentation.md │ ├── chunking - crawl4ai documentation.md │ ├── content processing - crawl4ai documentation.md │ ├── content selection - crawl4ai documentation.md │ ├── cosine strategy - crawl4ai documentation.md │ ├── crawlresult - crawl4ai documentation.md │ ├── docker deployment - crawl4ai documentation.md │ ├── home - crawl4ai documentation.md │ ├── hooks auth - crawl4ai documentation.md │ ├── installation - crawl4ai documentation.md │ ├── json-css extractor advanced - crawl4ai documentation.md │ ├── json-css extractor basic - crawl4ai documentation.md │ ├── llm strategy - crawl4ai documentation.md │ ├── magic mode - crawl4ai documentation.md │ ├── output formats - crawl4ai documentation.md │ ├── overview - crawl4ai documentation.md │ ├── page interaction - crawl4ai documentation.md │ ├── parameters table - crawl4ai documentation.md │ ├── proxy security - crawl4ai documentation.md │ ├── quick start - crawl4ai documentation.md │ ├── session management - crawl4ai documentation.md │ ├── session management advanced - crawl4ai documentation.md │ ├── simple crawling - crawl4ai documentation.md │ └── strategies - crawl4ai documentation.md ├── main.py ├── poetry.lock ├── pyproject.toml └── static └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | output*/ 2 | output_*/ 3 | __pycache__ 4 | .env 5 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: FastAPI", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "module": "uvicorn", 12 | "args": [ 13 | "main:app", 14 | "--reload" 15 | ], 16 | "jinja": true 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python 3.10 as base image 2 | FROM python:3.10-slim 3 | 4 | # Install system dependencies 5 | RUN apt-get update && apt-get install -y default-libmysqlclient-dev build-essential pkg-config 6 | 7 | # Install Poetry 8 | RUN pip install poetry 9 | 10 | # Set working directory 11 | WORKDIR /app 12 | 13 | # Copy Poetry configuration files 14 | COPY pyproject.toml poetry.lock ./ 15 | 16 | # Configure Poetry to not create virtual environment inside container 17 | RUN poetry config virtualenvs.create false 18 | 19 | # Install dependencies 20 | RUN poetry install --no-dev --no-interaction --no-ansi 21 | 22 | RUN playwright install 23 | 24 | RUN playwright install-deps 25 | 26 | # Copy application files 27 | COPY main.py . 28 | COPY static/ static/ 29 | 30 | # Create directory for temporary files 31 | RUN mkdir -p /app/output 32 | 33 | # Expose port 8000 34 | EXPOSE 8000 35 | 36 | # Set the entry command 37 | CMD ["poetry", "run", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 f4ww4z 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawl4ai-frontend 2 | 3 | A FastAPI-based frontend for crawl4ai that provides a web interface and REST API for crawling websites and converting them to markdown format. 
4 | 5 | ## Features 6 | 7 | - Web interface for easy interaction 8 | - REST API for programmatic access 9 | - Recursive website crawling with configurable depth 10 | - Automatic conversion of web pages to markdown format 11 | - Background job processing with status tracking 12 | - Results downloadable as ZIP archives 13 | - Docker support for easy deployment 14 | 15 | ## Installation 16 | 17 | ### Local Installation 18 | 19 | 1. Ensure you have Python 3.10+ and Poetry installed 20 | 2. Clone the repository 21 | 3. Install dependencies: 22 | ```bash 23 | poetry install 24 | ``` 25 | 26 | ### Docker Installation 27 | 28 | 1. Ensure you have Docker installed 29 | 2. Build the image: 30 | ```bash 31 | docker build -t crawl4ai-frontend . 32 | ``` 33 | 34 | Or use docker-compose: 35 | ```bash 36 | docker-compose up -d 37 | ``` 38 | 39 | ## Usage 40 | 41 | ### Running Locally 42 | 43 | ```bash 44 | poetry run uvicorn main:app --host 0.0.0.0 --port 8000 45 | ``` 46 | 47 | Then open http://localhost:8000 in your browser. 48 | 49 | ### Running with Docker 50 | 51 | ```bash 52 | docker run -p 8000:8000 crawl4ai-frontend 53 | ``` 54 | 55 | Then open http://localhost:8000 in your browser. 56 | 57 | ## API Documentation 58 | 59 | ### Start a Crawl Job 60 | 61 | ```http 62 | POST /api/crawl 63 | ``` 64 | 65 | Request body: 66 | ```json 67 | { 68 | "url": "https://example.com", 69 | "limit": 10 70 | } 71 | ``` 72 | 73 | Response: 74 | ```json 75 | { 76 | "job_id": "uuid", 77 | "status": "starting", 78 | "progress": 0 79 | } 80 | ``` 81 | 82 | ### Check Job Status 83 | 84 | ```http 85 | GET /api/status/{job_id} 86 | ``` 87 | 88 | Response: 89 | ```json 90 | { 91 | "job_id": "uuid", 92 | "status": "processing|completed|failed", 93 | "progress": 5, 94 | "total_pages": 10, 95 | "current_url": "https://example.com/page" 96 | } 97 | ``` 98 | 99 | ### Download Results 100 | 101 | ```http 102 | GET /api/download/{job_id} 103 | ``` 104 | 105 | Returns a ZIP file containing the crawled pages in markdown format. 106 | 107 | ## Dependencies 108 | 109 | - Python 3.10+ 110 | - FastAPI 111 | - Crawl4AI 112 | - aiofiles 113 | - Poetry (for dependency management) 114 | 115 | ## Development 116 | 117 | For development, additional dependencies can be installed: 118 | ```bash 119 | poetry install --with dev 120 | ``` 121 | 122 | Development dependencies include: 123 | - autopep8 (code formatting) 124 | - djlint (HTML template linting) 125 | 126 | ## License 127 | 128 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 129 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | web: 3 | build: . 4 | ports: 5 | - "${PORT}:8000" 6 | volumes: 7 | - ./output:/app/output 8 | restart: unless-stopped 9 | -------------------------------------------------------------------------------- /docs/crawl4ai/1 getting started - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 1: Introduction to Crawl4AI and Basic Installation 4 | 5 | ### Quick Intro 6 | 7 | Walk through installation from PyPI, setup, and verification. Show how to install with options like `torch` or `transformer` for advanced capabilities. 
8 | 9 | Here's a condensed outline of the **Installation and Setup** video content: 10 | 11 | * * * 12 | 13 | 1) **Introduction to Crawl4AI**: Briefly explain that Crawl4AI is a powerful tool for web scraping, data extraction, and content processing, with customizable options for various needs. 14 | 15 | 2) **Installation Overview**: 16 | 17 | - **Basic Install**: Run `pip install crawl4ai` and `playwright install` (to set up browser dependencies). 18 | 19 | - **Optional Advanced Installs**: 20 | - `pip install crawl4ai[torch]` \- Adds PyTorch for clustering. 21 | - `pip install crawl4ai[transformer]` \- Adds support for LLM-based extraction. 22 | - `pip install crawl4ai[all]` \- Installs all features for complete functionality. 23 | 24 | 3) **Verifying the Installation**: 25 | 26 | - Walk through a simple test script to confirm the setup: 27 | 28 | 29 | 30 | ```hljs python 31 | import asyncio 32 | from crawl4ai import AsyncWebCrawler 33 | 34 | async def main(): 35 | async with AsyncWebCrawler(verbose=True) as crawler: 36 | result = await crawler.arun(url="https://www.example.com") 37 | print(result.markdown[:500]) # Show first 500 characters 38 | 39 | asyncio.run(main()) 40 | 41 | ``` 42 | 43 | - Explain that this script initializes the crawler and runs it on a test URL, displaying part of the extracted content to verify functionality. 44 | 45 | 4) **Important Tips**: 46 | 47 | - **Run** `playwright install` **after installation** to set up dependencies. 48 | - **For full performance** on text-related tasks, run `crawl4ai-download-models` after installing with `[torch]`, `[transformer]`, or `[all]` options. 49 | - If you encounter issues, refer to the documentation or GitHub issues. 50 | 51 | 5) **Wrap Up**: 52 | 53 | - Introduce the next topic in the series, which will cover Crawl4AI's browser configuration options (like choosing between `chromium`, `firefox`, and `webkit`). 54 | 55 | * * * 56 | 57 | This structure provides a concise, effective guide to get viewers up and running with Crawl4AI in minutes. 58 | 59 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/10 user simulation - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 10: Custom Headers, Identity, and User Simulation 4 | 5 | ### Quick Intro 6 | 7 | Teach how to use custom headers, user-agent strings, and simulate real user interactions. Demo: Set custom user-agent and headers to access a site that blocks typical crawlers. 8 | 9 | Here’s a concise outline for the **Custom Headers, Identity Management, and User Simulation** video: 10 | 11 | * * * 12 | 13 | ### **Custom Headers, Identity Management, & User Simulation** 14 | 15 | 1) **Why Customize Headers and Identity in Crawling**: 16 | 17 | - Websites often track request headers and browser properties to detect bots. Customizing headers and managing identity help make requests appear more human, improving access to restricted sites. 
18 | 19 | 2) **Setting Custom Headers**: 20 | 21 | - Customize HTTP headers to mimic genuine browser requests or meet site-specific requirements: 22 | 23 | 24 | 25 | ```hljs makefile 26 | headers = { 27 | "Accept-Language": "en-US,en;q=0.9", 28 | "X-Requested-With": "XMLHttpRequest", 29 | "Cache-Control": "no-cache" 30 | } 31 | crawler = AsyncWebCrawler(headers=headers) 32 | 33 | ``` 34 | 35 | - **Use Case**: Customize the `Accept-Language` header to simulate local user settings, or `Cache-Control` to bypass cache for fresh content. 36 | 37 | 3) **Setting a Custom User Agent**: 38 | 39 | - Some websites block requests from common crawler user agents. Setting a custom user agent string helps bypass these restrictions: 40 | 41 | 42 | 43 | ```hljs makefile 44 | crawler = AsyncWebCrawler( 45 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 46 | ) 47 | 48 | ``` 49 | 50 | - **Tip**: Use user-agent strings from popular browsers (e.g., Chrome, Firefox) to improve access and reduce detection risks. 51 | 52 | 4) **User Simulation for Human-like Behavior**: 53 | 54 | - Enable `simulate_user=True` to mimic natural user interactions, such as random timing and simulated mouse movements: 55 | 56 | 57 | 58 | ```hljs python 59 | result = await crawler.arun( 60 | url="https://example.com", 61 | simulate_user=True # Simulates human-like behavior 62 | ) 63 | 64 | ``` 65 | 66 | - **Behavioral Effects**: Adds subtle variations in interactions, making the crawler harder to detect on bot-protected sites. 67 | 68 | 5) **Navigator Overrides and Magic Mode for Full Identity Masking**: 69 | 70 | - Use `override_navigator=True` to mask automation indicators like `navigator.webdriver`, which websites check to detect bots: 71 | 72 | 73 | 74 | ```hljs python 75 | result = await crawler.arun( 76 | url="https://example.com", 77 | override_navigator=True # Masks bot-related signals 78 | ) 79 | 80 | ``` 81 | 82 | - **Combining with Magic Mode**: For a complete anti-bot setup, combine these identity options with `magic=True` for maximum protection: 83 | 84 | 85 | 86 | ```hljs python 87 | async with AsyncWebCrawler() as crawler: 88 | result = await crawler.arun( 89 | url="https://example.com", 90 | magic=True, # Enables all anti-bot detection features 91 | user_agent="Custom-Agent", # Custom agent with Magic Mode 92 | ) 93 | 94 | ``` 95 | 96 | - This setup includes all anti-detection techniques like navigator masking, random timing, and user simulation. 97 | 98 | 6) **Example: Comprehensive Setup for Identity Management**: 99 | 100 | - A full example combining custom headers, user-agent, and user simulation for a realistic browsing profile: 101 | 102 | 103 | 104 | ```hljs python 105 | async with AsyncWebCrawler( 106 | headers={"Accept-Language": "en-US", "Cache-Control": "no-cache"}, 107 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0", 108 | simulate_user=True 109 | ) as crawler: 110 | result = await crawler.arun(url="https://example.com/secure-page") 111 | print(result.markdown[:500]) # Display extracted content 112 | 113 | ``` 114 | 115 | - This example enables detailed customization for evading detection and accessing protected pages smoothly. 116 | 117 | 7) **Wrap Up & Next Steps**: 118 | 119 | - Recap the value of headers, user-agent customization, and simulation in bypassing bot detection. 
120 | - Tease the next video: **Extraction Strategies: JSON CSS, LLM, and Cosine** to dive into structured data extraction methods for high-quality content retrieval. 121 | 122 | * * * 123 | 124 | This outline equips users with tools for managing crawler identity and human-like behavior, essential for accessing bot-protected or restricted websites. 125 | 126 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/111 json css - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, covering all key aspects and supported structures in Crawl4AI: 2 | 3 | * * * 4 | 5 | ### **10.1 JSON-CSS Extraction Strategy** 6 | 7 | #### **1\. Introduction to JSON-CSS Extraction** 8 | 9 | - JSON-CSS Extraction is used for pulling structured data from pages with repeated patterns, like product listings, article feeds, or directories. 10 | - This strategy allows defining a schema with CSS selectors and data fields, making it easy to capture nested, list-based, or singular elements. 11 | 12 | #### **2\. Basic Schema Structure** 13 | 14 | - **Schema Fields**: The schema has two main components: 15 | - `baseSelector`: A CSS selector to locate the main elements you want to extract (e.g., each article or product block). 16 | - `fields`: Defines the data fields for each element, supporting various data types and structures. 17 | 18 | #### **3\. Simple Field Extraction** 19 | 20 | - **Example HTML**: 21 | 22 | 23 | 24 | ```hljs php-template 25 |
<div class="product">
26 |   <h2 class="title">Sample Product</h2>
27 |   <span class="price">$19.99</span>
28 |   <p class="description">This is a sample product.</p>
29 | </div>
30 | 31 | ``` 32 | 33 | - **Schema**: 34 | 35 | 36 | 37 | ```hljs graphql 38 | schema = { 39 | "baseSelector": ".product", 40 | "fields": [\ 41 | {"name": "title", "selector": ".title", "type": "text"},\ 42 | {"name": "price", "selector": ".price", "type": "text"},\ 43 | {"name": "description", "selector": ".description", "type": "text"}\ 44 | ] 45 | } 46 | 47 | ``` 48 | 49 | - **Explanation**: Each field captures text content from specified CSS selectors within each `.product` element. 50 | 51 | #### **4\. Supported Field Types: Text, Attribute, HTML, Regex** 52 | 53 | - **Field Type Options**: 54 | - `text`: Extracts visible text. 55 | - `attribute`: Captures an HTML attribute (e.g., `src`, `href`). 56 | - `html`: Extracts the raw HTML of an element. 57 | - `regex`: Allows regex patterns to extract part of the text. 58 | - **Example HTML** (including an image): 59 | 60 | 61 | 62 | 63 | ```hljs javascript 64 |
<div class="product">
65 |   <h2 class="title">Sample Product</h2>
66 |   <img class="product-image" src="image.jpg" alt="Product Image">
67 |   <span class="price">$19.99</span>
68 |   <p class="description">Limited time offer.</p>
69 | </div>
70 | 71 | ``` 72 | 73 | - **Schema**: 74 | 75 | 76 | 77 | ```hljs python 78 | schema = { 79 | "baseSelector": ".product", 80 | "fields": [\ 81 | {"name": "title", "selector": ".title", "type": "text"},\ 82 | {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"},\ 83 | {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"},\ 84 | {"name": "description_html", "selector": ".description", "type": "html"}\ 85 | ] 86 | } 87 | 88 | ``` 89 | 90 | - **Explanation**: 91 | - `attribute`: Extracts the `src` attribute from `.product-image`. 92 | - `regex`: Extracts the numeric part from `$19.99`. 93 | - `html`: Retrieves the full HTML of the description element. 94 | 95 | #### **5\. Nested Field Extraction** 96 | 97 | - **Use Case**: Useful when content contains sub-elements, such as an article with author details within it. 98 | - **Example HTML**: 99 | 100 | 101 | 102 | ```hljs php-template 103 |
<div class="article">
104 |   <h1 class="title">Sample Article</h1>
105 |   <div class="author">
106 |     <span class="name">John Doe</span>
107 |     <span class="bio">Writer and editor</span>
108 |   </div>
109 | </div>
110 | 111 | ``` 112 | 113 | - **Schema**: 114 | 115 | 116 | 117 | ```hljs graphql 118 | schema = { 119 | "baseSelector": ".article", 120 | "fields": [\ 121 | {"name": "title", "selector": ".title", "type": "text"},\ 122 | {"name": "author", "type": "nested", "selector": ".author", "fields": [\ 123 | {"name": "name", "selector": ".name", "type": "text"},\ 124 | {"name": "bio", "selector": ".bio", "type": "text"}\ 125 | ]}\ 126 | ] 127 | } 128 | 129 | ``` 130 | 131 | - **Explanation**: 132 | - `nested`: Extracts `name` and `bio` within `.author`, grouping the author details in a single `author` object. 133 | 134 | #### **6\. List and Nested List Extraction** 135 | 136 | - **List**: Extracts multiple elements matching the selector as a list. 137 | - **Nested List**: Allows lists within lists, useful for items with sub-lists (e.g., specifications for each product). 138 | - **Example HTML**: 139 | 140 | 141 | 142 | ```hljs php-template 143 |
<div class="product">
144 |   <h2 class="title">Product with Features</h2>
145 |   <ul class="features">
146 |     <li class="feature">Feature 1</li>
147 |     <li class="feature">Feature 2</li>
148 |     <li class="feature">Feature 3</li>
149 |   </ul>
150 | </div>
151 | 152 | ``` 153 | 154 | - **Schema**: 155 | 156 | 157 | 158 | ```hljs graphql 159 | schema = { 160 | "baseSelector": ".product", 161 | "fields": [\ 162 | {"name": "title", "selector": ".title", "type": "text"},\ 163 | {"name": "features", "type": "list", "selector": ".features .feature", "fields": [\ 164 | {"name": "feature", "type": "text"}\ 165 | ]}\ 166 | ] 167 | } 168 | 169 | ``` 170 | 171 | - **Explanation**: 172 | - `list`: Captures each `.feature` item within `.features`, outputting an array of features under the `features` field. 173 | 174 | #### **7\. Transformations for Field Values** 175 | 176 | - Transformations allow you to modify extracted values (e.g., converting to lowercase). 177 | - Supported transformations: `lowercase`, `uppercase`, `strip`. 178 | - **Example HTML**: 179 | 180 | 181 | 182 | ```hljs php-template 183 |
<div class="product">
184 |   <h2 class="title">Special Product</h2>
185 | </div>
186 | 187 | ``` 188 | 189 | - **Schema**: 190 | 191 | 192 | 193 | ```hljs graphql 194 | schema = { 195 | "baseSelector": ".product", 196 | "fields": [\ 197 | {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"}\ 198 | ] 199 | } 200 | 201 | ``` 202 | 203 | - **Explanation**: The `transform` property changes the `title` to uppercase, useful for standardized outputs. 204 | 205 | #### **8\. Full JSON-CSS Extraction Example** 206 | 207 | - Combining all elements in a single schema example for a comprehensive crawl: 208 | - **Example HTML**: 209 | 210 | 211 | 212 | ```hljs javascript 213 |
<div class="product">
214 |   <h2 class="title">Featured Product</h2>
215 |   <img class="product-image" src="product.jpg" alt="Product Image">
216 |   <span class="price">$99.99</span>
217 |   <p class="description">Best product of the year.</p>
223 | 224 | ``` 225 | 226 | - **Schema**: 227 | 228 | 229 | 230 | ```hljs python 231 | schema = { 232 | "baseSelector": ".product", 233 | "fields": [\ 234 | {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"},\ 235 | {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"},\ 236 | {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"},\ 237 | {"name": "description", "selector": ".description", "type": "html"},\ 238 | {"name": "features", "type": "list", "selector": ".features .feature", "fields": [\ 239 | {"name": "feature", "type": "text"}\ 240 | ]}\ 241 | ] 242 | } 243 | 244 | ``` 245 | 246 | - **Explanation**: This schema captures and transforms each aspect of the product, illustrating the JSON-CSS strategy’s versatility for structured extraction. 247 | 248 | #### **9\. Wrap Up & Next Steps** 249 | 250 | - Summarize JSON-CSS Extraction’s flexibility for structured, pattern-based extraction. 251 | - Tease the next video: **10.2 LLM Extraction Strategy**, focusing on using language models to extract data based on intelligent content analysis. 252 | 253 | * * * 254 | 255 | This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. 256 | 257 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/112 llm strategy - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine 4 | 5 | ### Quick Intro 6 | 7 | Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. 8 | 9 | Here’s a comprehensive outline for the **LLM Extraction Strategy** video, covering key details and example applications. 10 | 11 | * * * 12 | 13 | ### **10.2 LLM Extraction Strategy** 14 | 15 | #### **1\. Introduction to LLM Extraction Strategy** 16 | 17 | - The LLM Extraction Strategy leverages language models to interpret and extract structured data from complex web content. 18 | - Unlike traditional CSS selectors, this strategy uses natural language instructions and schemas to guide the extraction, ideal for unstructured or diverse content. 19 | - Supports **OpenAI**, **Azure OpenAI**, **HuggingFace**, and **Ollama** models, enabling flexibility with both proprietary and open-source providers. 20 | 21 | #### **2\. Key Components of LLM Extraction Strategy** 22 | 23 | - **Provider**: Specifies the LLM provider (e.g., OpenAI, HuggingFace, Azure). 24 | - **API Token**: Required for most providers, except Ollama (local LLM model). 25 | - **Instruction**: Custom extraction instructions sent to the model, providing flexibility in how the data is structured and extracted. 26 | - **Schema**: Optional, defines structured fields to organize extracted data into JSON format. 27 | - **Extraction Type**: Supports `"block"` for simpler text blocks or `"schema"` when a structured output format is required. 28 | - **Chunking Parameters**: Breaks down large documents, with options to adjust chunk size and overlap rate for more accurate extraction across lengthy texts. 29 | 30 | #### **3\. 
Basic Extraction Example: OpenAI Model Pricing** 31 | 32 | - **Goal**: Extract model names and their input and output fees from the OpenAI pricing page. 33 | - **Schema Definition**: 34 | - **Model Name**: Text for model identification. 35 | - **Input Fee**: Token cost for input processing. 36 | - **Output Fee**: Token cost for output generation. 37 | - **Schema**: 38 | 39 | 40 | 41 | 42 | ```hljs scss 43 | class OpenAIModelFee(BaseModel): 44 | model_name: str = Field(..., description="Name of the OpenAI model.") 45 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") 46 | output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") 47 | 48 | ``` 49 | 50 | - **Example Code**: 51 | 52 | 53 | 54 | 55 | ```hljs csharp 56 | async def extract_openai_pricing(): 57 | async with AsyncWebCrawler() as crawler: 58 | result = await crawler.arun( 59 | url="https://openai.com/api/pricing/", 60 | extraction_strategy=LLMExtractionStrategy( 61 | provider="openai/gpt-4o", 62 | api_token=os.getenv("OPENAI_API_KEY"), 63 | schema=OpenAIModelFee.schema(), 64 | extraction_type="schema", 65 | instruction="Extract model names and fees for input and output tokens from the page." 66 | ), 67 | bypass_cache=True 68 | ) 69 | print(result.extracted_content) 70 | 71 | ``` 72 | 73 | - **Explanation**: 74 | - The extraction strategy combines a schema and detailed instruction to guide the LLM in capturing structured data. 75 | - Each model’s name, input fee, and output fee are extracted in a JSON format. 76 | 77 | #### **4\. Knowledge Graph Extraction Example** 78 | 79 | - **Goal**: Extract entities and their relationships from a document for use in a knowledge graph. 80 | - **Schema Definition**: 81 | - **Entities**: Individual items with descriptions (e.g., people, organizations). 82 | - **Relationships**: Connections between entities, including descriptions and relationship types. 83 | - **Schema**: 84 | 85 | 86 | 87 | 88 | ```hljs yaml 89 | class Entity(BaseModel): 90 | name: str 91 | description: str 92 | 93 | class Relationship(BaseModel): 94 | entity1: Entity 95 | entity2: Entity 96 | description: str 97 | relation_type: str 98 | 99 | class KnowledgeGraph(BaseModel): 100 | entities: List[Entity] 101 | relationships: List[Relationship] 102 | 103 | ``` 104 | 105 | - **Example Code**: 106 | 107 | 108 | 109 | 110 | ```hljs csharp 111 | async def extract_knowledge_graph(): 112 | extraction_strategy = LLMExtractionStrategy( 113 | provider="azure/gpt-4o-mini", 114 | api_token=os.getenv("AZURE_API_KEY"), 115 | schema=KnowledgeGraph.schema(), 116 | extraction_type="schema", 117 | instruction="Extract entities and relationships from the content to build a knowledge graph." 118 | ) 119 | async with AsyncWebCrawler() as crawler: 120 | result = await crawler.arun( 121 | url="https://example.com/some-article", 122 | extraction_strategy=extraction_strategy, 123 | bypass_cache=True 124 | ) 125 | print(result.extracted_content) 126 | 127 | ``` 128 | 129 | - **Explanation**: 130 | - In this setup, the LLM extracts entities and their relationships based on the schema and instruction. 131 | - The schema organizes results into a JSON-based knowledge graph format. 132 | 133 | #### **5\. Key Settings in LLM Extraction** 134 | 135 | - **Chunking Options**: 136 | - For long pages, set `chunk_token_threshold` to specify maximum token count per section. 137 | - Adjust `overlap_rate` to control the overlap between chunks, useful for contextual consistency. 
138 | - **Example**: 139 | 140 | 141 | 142 | ```hljs lua 143 | extraction_strategy = LLMExtractionStrategy( 144 | provider="openai/gpt-4", 145 | api_token=os.getenv("OPENAI_API_KEY"), 146 | chunk_token_threshold=3000, 147 | overlap_rate=0.2, # 20% overlap between chunks 148 | instruction="Extract key insights and relationships." 149 | ) 150 | 151 | ``` 152 | 153 | - This setup ensures that longer texts are divided into manageable chunks with slight overlap, enhancing the quality of extraction. 154 | 155 | #### **6\. Flexible Provider Options for LLM Extraction** 156 | 157 | - **Using Proprietary Models**: OpenAI, Azure, and HuggingFace provide robust language models, often suited for complex or detailed extractions. 158 | - **Using Open-Source Models**: Ollama and other open-source models can be deployed locally, suitable for offline or cost-effective extraction. 159 | - **Example Call**: 160 | 161 | 162 | 163 | ```hljs lua 164 | await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) 165 | await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) 166 | await extract_structured_data_using_llm("ollama/llama3.2") 167 | 168 | ``` 169 | 170 | 171 | #### **7\. Complete Example of LLM Extraction Setup** 172 | 173 | - Code to run both the OpenAI pricing and Knowledge Graph extractions, using various providers: 174 | 175 | 176 | 177 | ```hljs csharp 178 | async def main(): 179 | await extract_openai_pricing() 180 | await extract_knowledge_graph() 181 | 182 | if __name__ == "__main__": 183 | asyncio.run(main()) 184 | 185 | ``` 186 | 187 | 188 | #### **8\. Wrap Up & Next Steps** 189 | 190 | - Recap the power of LLM extraction for handling unstructured or complex data extraction tasks. 191 | - Tease the next video: **10.3 Cosine Similarity Strategy** for clustering similar content based on semantic similarity. 192 | 193 | * * * 194 | 195 | This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. 196 | 197 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/113 cosine strategy - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine 4 | 5 | ### Quick Intro 6 | 7 | Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. 8 | 9 | Here’s a structured outline for the **Cosine Similarity Strategy** video, covering key concepts, configuration, and a practical example. 10 | 11 | * * * 12 | 13 | ### **10.3 Cosine Similarity Strategy** 14 | 15 | #### **1\. Introduction to Cosine Similarity Strategy** 16 | 17 | - The Cosine Similarity Strategy clusters content by semantic similarity, offering an efficient alternative to LLM-based extraction, especially when speed is a priority. 18 | - Ideal for grouping similar sections of text, this strategy is well-suited for pages with content sections that may need to be classified or tagged, like news articles, product descriptions, or reviews. 19 | 20 | #### **2\. 
Key Configuration Options** 21 | 22 | - **semantic\_filter**: A keyword-based filter to focus on relevant content. 23 | - **word\_count\_threshold**: Minimum number of words per cluster, filtering out shorter, less meaningful clusters. 24 | - **max\_dist**: Maximum allowable distance between elements in clusters, impacting cluster tightness. 25 | - **linkage\_method**: Method for hierarchical clustering, such as `'ward'` (for well-separated clusters). 26 | - **top\_k**: Specifies the number of top categories for each cluster. 27 | - **model\_name**: Defines the model for embeddings, such as `sentence-transformers/all-MiniLM-L6-v2`. 28 | - **sim\_threshold**: Minimum similarity threshold for filtering, allowing control over cluster relevance. 29 | 30 | #### **3\. How Cosine Similarity Clustering Works** 31 | 32 | - **Step 1**: Embeddings are generated for each text section, transforming them into vectors that capture semantic meaning. 33 | - **Step 2**: Hierarchical clustering groups similar sections based on cosine similarity, forming clusters with related content. 34 | - **Step 3**: Clusters are filtered based on word count, removing those below the `word_count_threshold`. 35 | - **Step 4**: Each cluster is then categorized with tags, if enabled, providing context to each grouped content section. 36 | 37 | #### **4\. Example Use Case: Clustering Blog Article Sections** 38 | 39 | - **Goal**: Group related sections of a blog or news page to identify distinct topics or discussion areas. 40 | - **Example HTML Sections**: 41 | 42 | 43 | 44 | 45 | ```hljs bash 46 | "The economy is showing signs of recovery, with markets up this quarter.", 47 | "In the sports world, several major teams are preparing for the upcoming season.", 48 | "New advancements in AI technology are reshaping the tech landscape.", 49 | "Market analysts are optimistic about continued growth in tech stocks." 50 | 51 | ``` 52 | 53 | - **Code Setup**: 54 | 55 | 56 | 57 | 58 | ```hljs csharp 59 | async def extract_blog_sections(): 60 | extraction_strategy = CosineStrategy( 61 | word_count_threshold=15, 62 | max_dist=0.3, 63 | sim_threshold=0.2, 64 | model_name="sentence-transformers/all-MiniLM-L6-v2", 65 | top_k=2 66 | ) 67 | async with AsyncWebCrawler() as crawler: 68 | url = "https://example.com/blog-page" 69 | result = await crawler.arun( 70 | url=url, 71 | extraction_strategy=extraction_strategy, 72 | bypass_cache=True 73 | ) 74 | print(result.extracted_content) 75 | 76 | ``` 77 | 78 | - **Explanation**: 79 | - **word\_count\_threshold**: Ensures only clusters with meaningful content are included. 80 | - **sim\_threshold**: Filters out clusters with low similarity, focusing on closely related sections. 81 | - **top\_k**: Selects top tags, useful for identifying main topics. 82 | 83 | #### **5\. Applying Semantic Filtering with Cosine Similarity** 84 | 85 | - **Semantic Filter**: Filters sections based on relevance to a specific keyword, such as “technology” for tech articles. 86 | - **Example Code**: 87 | 88 | 89 | 90 | ```hljs makefile 91 | extraction_strategy = CosineStrategy( 92 | semantic_filter="technology", 93 | word_count_threshold=10, 94 | max_dist=0.25, 95 | model_name="sentence-transformers/all-MiniLM-L6-v2" 96 | ) 97 | 98 | ``` 99 | 100 | - **Explanation**: 101 | - **semantic\_filter**: Only sections with high similarity to the “technology” keyword will be included in the clustering, making it easy to focus on specific topics within a mixed-content page. 102 | 103 | #### **6\. 
Clustering Product Reviews by Similarity** 104 | 105 | - **Goal**: Organize product reviews by themes, such as “price,” “quality,” or “durability.” 106 | - **Example Reviews**: 107 | 108 | 109 | 110 | 111 | ```hljs css 112 | "The quality of this product is outstanding and well worth the price.", 113 | "I found the product to be durable but a bit overpriced.", 114 | "Great value for the money and long-lasting.", 115 | "The build quality is good, but I expected a lower price point." 116 | 117 | ``` 118 | 119 | - **Code Setup**: 120 | 121 | 122 | 123 | 124 | ```hljs csharp 125 | async def extract_product_reviews(): 126 | extraction_strategy = CosineStrategy( 127 | word_count_threshold=20, 128 | max_dist=0.35, 129 | sim_threshold=0.25, 130 | model_name="sentence-transformers/all-MiniLM-L6-v2" 131 | ) 132 | async with AsyncWebCrawler() as crawler: 133 | url = "https://example.com/product-reviews" 134 | result = await crawler.arun( 135 | url=url, 136 | extraction_strategy=extraction_strategy, 137 | bypass_cache=True 138 | ) 139 | print(result.extracted_content) 140 | 141 | ``` 142 | 143 | - **Explanation**: 144 | - This configuration clusters similar reviews, grouping feedback by common themes, helping businesses understand customer sentiments around particular product aspects. 145 | 146 | #### **7\. Performance Advantages of Cosine Strategy** 147 | 148 | - **Speed**: The Cosine Similarity Strategy is faster than LLM-based extraction, as it doesn’t rely on API calls to external LLMs. 149 | - **Local Processing**: The strategy runs locally with pre-trained sentence embeddings, ideal for high-throughput scenarios where cost and latency are concerns. 150 | - **Comparison**: With a well-optimized local model, this method can perform clustering on large datasets quickly, making it suitable for tasks requiring rapid, repeated analysis. 151 | 152 | #### **8\. Full Code Example for Clustering News Articles** 153 | 154 | - **Code**: 155 | 156 | 157 | 158 | ```hljs csharp 159 | async def main(): 160 | await extract_blog_sections() 161 | await extract_product_reviews() 162 | 163 | if __name__ == "__main__": 164 | asyncio.run(main()) 165 | 166 | ``` 167 | 168 | 169 | #### **9\. Wrap Up & Next Steps** 170 | 171 | - Recap the efficiency and effectiveness of Cosine Similarity for clustering related content quickly. 172 | - Close with a reminder of Crawl4AI’s flexibility across extraction strategies, and prompt users to experiment with different settings to optimize clustering for their specific content. 173 | 174 | * * * 175 | 176 | This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. 177 | 178 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/12 session crawling - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 12: Session-Based Crawling for Dynamic Websites 4 | 5 | ### Quick Intro 6 | 7 | Show session management for handling websites with multiple pages or actions (like “load more” buttons). Demo: Crawl a paginated content page, persisting session data across multiple requests. 8 | 9 | Here’s a detailed outline for the **Session-Based Crawling for Dynamic Websites** video, explaining why sessions are necessary, how to use them, and providing practical examples and a visual diagram to illustrate the concept. 10 | 11 | * * * 12 | 13 | ### **11\. 
Session-Based Crawling for Dynamic Websites** 14 | 15 | #### **1\. Introduction to Session-Based Crawling** 16 | 17 | - **What is Session-Based Crawling**: Session-based crawling maintains a continuous browsing session across multiple page states, allowing the crawler to interact with a page and retrieve content that loads dynamically or based on user interactions. 18 | - **Why It’s Needed**: 19 | - In static pages, all content is available directly from a single URL. 20 | - In dynamic websites, content often loads progressively or based on user actions (e.g., clicking “load more,” submitting forms, scrolling). 21 | - Session-based crawling helps simulate user actions, capturing content that is otherwise hidden until specific actions are taken. 22 | 23 | #### **2\. Conceptual Diagram for Session-Based Crawling** 24 | 25 | ```hljs less 26 | graph TD 27 | Start[Start Session] --> S1[Initial State (S1)] 28 | S1 -->|Crawl| Content1[Extract Content S1] 29 | S1 -->|Action: Click Load More| S2[State S2] 30 | S2 -->|Crawl| Content2[Extract Content S2] 31 | S2 -->|Action: Scroll Down| S3[State S3] 32 | S3 -->|Crawl| Content3[Extract Content S3] 33 | S3 -->|Action: Submit Form| S4[Final State] 34 | S4 -->|Crawl| Content4[Extract Content S4] 35 | Content4 --> End[End Session] 36 | 37 | ``` 38 | 39 | - **Explanation of Diagram**: 40 | - **Start**: Initializes the session and opens the starting URL. 41 | - **State Transitions**: Each action (e.g., clicking “load more,” scrolling) transitions to a new state, where additional content becomes available. 42 | - **Session Persistence**: Keeps the same browsing session active, preserving the state and allowing for a sequence of actions to unfold. 43 | - **End**: After reaching the final state, the session ends, and all accumulated content has been extracted. 44 | 45 | #### **3\. Key Components of Session-Based Crawling in Crawl4AI** 46 | 47 | - **Session ID**: A unique identifier to maintain the state across requests, allowing the crawler to “remember” previous actions. 48 | - **JavaScript Execution**: Executes JavaScript commands (e.g., clicks, scrolls) to simulate interactions. 49 | - **Wait Conditions**: Ensures the crawler waits for content to load in each state before moving on. 50 | - **Sequential State Transitions**: By defining actions and wait conditions between states, the crawler can navigate through the page as a user would. 51 | 52 | #### **4\. Basic Session Example: Multi-Step Content Loading** 53 | 54 | - **Goal**: Crawl an article feed that requires several “load more” clicks to display additional content. 55 | - **Code**: 56 | 57 | 58 | 59 | ```hljs python 60 | async def crawl_article_feed(): 61 | async with AsyncWebCrawler() as crawler: 62 | session_id = "feed_session" 63 | 64 | for page in range(3): 65 | result = await crawler.arun( 66 | url="https://example.com/articles", 67 | session_id=session_id, 68 | js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, 69 | wait_for="css:.article", 70 | css_selector=".article" # Target article elements 71 | ) 72 | print(f"Page {page + 1}: Extracted {len(result.extracted_content)} articles") 73 | 74 | ``` 75 | 76 | - **Explanation**: 77 | - **session\_id**: Ensures all requests share the same browsing state. 78 | - **js\_code**: Clicks the “load more” button after the initial page load, expanding content on each iteration. 79 | - **wait\_for**: Ensures articles have loaded after each click before extraction. 80 | 81 | #### **5\. 
Advanced Example: E-Commerce Product Search with Filter Selection** 82 | 83 | - **Goal**: Interact with filters on an e-commerce page to extract products based on selected criteria. 84 | - **Example Steps**: 85 | 1. **State 1**: Load the main product page. 86 | 2. **State 2**: Apply a filter (e.g., “On Sale”) by selecting a checkbox. 87 | 3. **State 3**: Scroll to load additional products and capture updated results. 88 | - **Code**: 89 | 90 | 91 | 92 | 93 | ```hljs python 94 | async def extract_filtered_products(): 95 | async with AsyncWebCrawler() as crawler: 96 | session_id = "product_session" 97 | 98 | # Step 1: Open product page 99 | result = await crawler.arun( 100 | url="https://example.com/products", 101 | session_id=session_id, 102 | wait_for="css:.product-item" 103 | ) 104 | 105 | # Step 2: Apply filter (e.g., "On Sale") 106 | result = await crawler.arun( 107 | url="https://example.com/products", 108 | session_id=session_id, 109 | js_code="document.querySelector('#sale-filter-checkbox').click();", 110 | wait_for="css:.product-item" 111 | ) 112 | 113 | # Step 3: Scroll to load additional products 114 | for _ in range(2): # Scroll down twice 115 | result = await crawler.arun( 116 | url="https://example.com/products", 117 | session_id=session_id, 118 | js_code="window.scrollTo(0, document.body.scrollHeight);", 119 | wait_for="css:.product-item" 120 | ) 121 | print(f"Loaded {len(result.extracted_content)} products after scroll") 122 | 123 | ``` 124 | 125 | - **Explanation**: 126 | - **State Persistence**: Each action (filter selection and scroll) builds on the previous session state. 127 | - **Multiple Interactions**: Combines clicking a filter with scrolling, demonstrating how the session preserves these actions. 128 | 129 | #### **6\. Key Benefits of Session-Based Crawling** 130 | 131 | - **Accessing Hidden Content**: Retrieves data that loads only after user actions. 132 | - **Simulating User Behavior**: Handles interactive elements such as “load more” buttons, dropdowns, and filters. 133 | - **Maintaining Continuity Across States**: Enables a sequential process, moving logically from one state to the next, capturing all desired content without reloading the initial state each time. 134 | 135 | #### **7\. Additional Configuration Tips** 136 | 137 | - **Manage Session End**: Always conclude the session after the final state to release resources. 138 | - **Optimize with Wait Conditions**: Use `wait_for` to ensure complete loading before each extraction. 139 | - **Handling Errors in Session-Based Crawling**: Include error handling for interactions that may fail, ensuring robustness across state transitions. 140 | 141 | #### **8\. Complete Code Example: Multi-Step Session Workflow** 142 | 143 | - **Example**: 144 | 145 | 146 | 147 | ```hljs csharp 148 | async def main(): 149 | await crawl_article_feed() 150 | await extract_filtered_products() 151 | 152 | if __name__ == "__main__": 153 | asyncio.run(main()) 154 | 155 | ``` 156 | 157 | 158 | #### **9\. Wrap Up & Next Steps** 159 | 160 | - Recap the usefulness of session-based crawling for dynamic content extraction. 161 | - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler** to cover advanced customization options for further control over the crawling process. 162 | 163 | * * * 164 | 165 | This outline covers session-based crawling from both a conceptual and practical perspective, helping users understand its importance, configure it effectively, and use it to handle complex dynamic content. 
166 | 167 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/13 text chunking - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 13: Chunking Strategies for Large Text Processing 4 | 5 | ### Quick Intro 6 | 7 | Explain Regex, NLP, and Fixed-Length chunking, and when to use each. Demo: Chunk a large article or document for processing by topics or sentences. 8 | 9 | Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, emphasizing how chunking works within extraction and why it’s crucial for effective data aggregation. 10 | 11 | Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, explaining each strategy, when to use it, and providing examples to illustrate. 12 | 13 | * * * 14 | 15 | ### **12\. Chunking Strategies for Large Text Processing** 16 | 17 | #### **1\. Introduction to Chunking in Crawl4AI** 18 | 19 | - **What is Chunking**: Chunking is the process of dividing large text into manageable sections or “chunks,” enabling efficient processing in extraction tasks. 20 | - **Why It’s Needed**: 21 | - When processing large text, feeding it directly into an extraction function (like `F(x)`) can overwhelm memory or token limits. 22 | - Chunking breaks down `x` (the text) into smaller pieces, which are processed sequentially or in parallel by the extraction function, with the final result being an aggregation of all chunks’ processed output. 23 | 24 | #### **2\. Key Chunking Strategies and Use Cases** 25 | 26 | - Crawl4AI offers various chunking strategies to suit different text structures, chunk sizes, and processing requirements. 27 | - **Choosing a Strategy**: Select based on the type of text (e.g., articles, transcripts) and extraction needs (e.g., simple splitting or context-sensitive processing). 28 | 29 | #### **3\. Strategy 1: Regex-Based Chunking** 30 | 31 | - **Description**: Uses regular expressions to split text based on specified patterns (e.g., paragraphs or section breaks). 32 | - **Use Case**: Ideal for dividing text by paragraphs or larger logical blocks where sections are clearly separated by line breaks or punctuation. 33 | - **Example**: 34 | - **Pattern**: `r'\n\n'` for double line breaks. 35 | 36 | 37 | 38 | ```hljs python 39 | chunker = RegexChunking(patterns=[r'\n\n']) 40 | text_chunks = chunker.chunk(long_text) 41 | print(text_chunks) # Output: List of paragraphs 42 | 43 | ``` 44 | - **Pros**: Flexible for pattern-based chunking. 45 | - **Cons**: Limited to text with consistent formatting. 46 | 47 | #### **4\. Strategy 2: NLP Sentence-Based Chunking** 48 | 49 | - **Description**: Uses NLP to split text by sentences, ensuring grammatically complete segments. 50 | - **Use Case**: Useful for extracting individual statements, such as in news articles, quotes, or legal text. 51 | - **Example**: 52 | 53 | 54 | 55 | ```hljs makefile 56 | chunker = NlpSentenceChunking() 57 | sentence_chunks = chunker.chunk(long_text) 58 | print(sentence_chunks) # Output: List of sentences 59 | 60 | ``` 61 | 62 | - **Pros**: Maintains sentence structure, ideal for tasks needing semantic completeness. 63 | - **Cons**: May create very small chunks, which could limit contextual extraction. 64 | 65 | #### **5\. 
Strategy 3: Topic-Based Segmentation Using TextTiling** 66 | 67 | - **Description**: Segments text into topics using TextTiling, identifying topic shifts and key segments. 68 | - **Use Case**: Ideal for long articles, reports, or essays where each section covers a different topic. 69 | - **Example**: 70 | 71 | 72 | 73 | ```hljs makefile 74 | chunker = TopicSegmentationChunking(num_keywords=3) 75 | topic_chunks = chunker.chunk_with_topics(long_text) 76 | print(topic_chunks) # Output: List of topic segments with keywords 77 | 78 | ``` 79 | 80 | - **Pros**: Groups related content, preserving topical coherence. 81 | - **Cons**: Depends on identifiable topic shifts, which may not be present in all texts. 82 | 83 | #### **6\. Strategy 4: Fixed-Length Word Chunking** 84 | 85 | - **Description**: Splits text into chunks based on a fixed number of words. 86 | - **Use Case**: Ideal for text where exact segment size is required, such as processing word-limited documents for LLMs. 87 | - **Example**: 88 | 89 | 90 | 91 | ```hljs makefile 92 | chunker = FixedLengthWordChunking(chunk_size=100) 93 | word_chunks = chunker.chunk(long_text) 94 | print(word_chunks) # Output: List of 100-word chunks 95 | 96 | ``` 97 | 98 | - **Pros**: Ensures uniform chunk sizes, suitable for token-based extraction limits. 99 | - **Cons**: May split sentences, affecting semantic coherence. 100 | 101 | #### **7\. Strategy 5: Sliding Window Chunking** 102 | 103 | - **Description**: Uses a fixed window size with a step, creating overlapping chunks to maintain context. 104 | - **Use Case**: Useful for maintaining context across sections, as with documents where context is needed for neighboring sections. 105 | - **Example**: 106 | 107 | 108 | 109 | ```hljs makefile 110 | chunker = SlidingWindowChunking(window_size=100, step=50) 111 | window_chunks = chunker.chunk(long_text) 112 | print(window_chunks) # Output: List of overlapping word chunks 113 | 114 | ``` 115 | 116 | - **Pros**: Retains context across adjacent chunks, ideal for complex semantic extraction. 117 | - **Cons**: Overlap increases data size, potentially impacting processing time. 118 | 119 | #### **8\. Strategy 6: Overlapping Window Chunking** 120 | 121 | - **Description**: Similar to sliding windows but with a defined overlap, allowing chunks to share content at the edges. 122 | - **Use Case**: Suitable for handling long texts with essential overlapping information, like research articles or medical records. 123 | - **Example**: 124 | 125 | 126 | 127 | ```hljs makefile 128 | chunker = OverlappingWindowChunking(window_size=1000, overlap=100) 129 | overlap_chunks = chunker.chunk(long_text) 130 | print(overlap_chunks) # Output: List of overlapping chunks with defined overlap 131 | 132 | ``` 133 | 134 | - **Pros**: Allows controlled overlap for consistent content coverage across chunks. 135 | - **Cons**: Redundant data in overlapping areas may increase computation. 136 | 137 | #### **9\. Practical Example: Using Chunking with an Extraction Strategy** 138 | 139 | - **Goal**: Combine chunking with an extraction strategy to process large text effectively. 
140 | - **Example Code**: 141 | 142 | 143 | 144 | 145 | ```hljs python 146 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 147 | 148 | async def extract_large_text(): 149 | # Initialize chunker and extraction strategy 150 | chunker = FixedLengthWordChunking(chunk_size=200) 151 | extraction_strategy = LLMExtractionStrategy(provider="openai/gpt-4", api_token="your_api_token") 152 | 153 | # Split text into chunks 154 | text_chunks = chunker.chunk(large_text) 155 | 156 | async with AsyncWebCrawler() as crawler: 157 | for chunk in text_chunks: 158 | result = await crawler.arun( 159 | url="https://example.com", 160 | extraction_strategy=extraction_strategy, 161 | content=chunk 162 | ) 163 | print(result.extracted_content) 164 | 165 | ``` 166 | 167 | - **Explanation**: 168 | - `chunker.chunk()`: Divides the `large_text` into smaller segments based on the chosen strategy. 169 | - `extraction_strategy`: Processes each chunk separately, and results are then aggregated to form the final output. 170 | 171 | #### **10\. Choosing the Right Chunking Strategy** 172 | 173 | - **Text Structure**: If text has clear sections (e.g., paragraphs, topics), use Regex or Topic Segmentation. 174 | - **Extraction Needs**: If context is crucial, consider Sliding or Overlapping Window Chunking. 175 | - **Processing Constraints**: For word-limited extractions (e.g., LLMs with token limits), Fixed-Length Word Chunking is often most effective. 176 | 177 | #### **11\. Wrap Up & Next Steps** 178 | 179 | - Recap the benefits of each chunking strategy and when to use them in extraction workflows. 180 | - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler**, focusing on customizing crawler behavior with hooks for a fine-tuned extraction process. 181 | 182 | * * * 183 | 184 | This outline provides a complete understanding of chunking strategies, explaining each method’s strengths and best-use scenarios to help users process large texts effectively in Crawl4AI. 185 | 186 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/14 custom workflows - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 14: Hooks and Custom Workflow with AsyncWebCrawler 4 | 5 | ### Quick Intro 6 | 7 | Cover hooks ( `on_browser_created`, `before_goto`, `after_goto`) to add custom workflows. Demo: Use hooks to add custom cookies or headers, log HTML, or trigger specific events on page load. 8 | 9 | Here’s a detailed outline for the **Hooks and Custom Workflow with AsyncWebCrawler** video, covering each hook’s purpose, usage, and example implementations. 10 | 11 | * * * 12 | 13 | ### **13\. Hooks and Custom Workflow with AsyncWebCrawler** 14 | 15 | #### **1\. Introduction to Hooks in Crawl4AI** 16 | 17 | - **What are Hooks**: Hooks are customizable entry points in the crawling process that allow users to inject custom actions or logic at specific stages. 18 | - **Why Use Hooks**: 19 | - They enable fine-grained control over the crawling workflow. 20 | - Useful for performing additional tasks (e.g., logging, modifying headers) dynamically during the crawl. 21 | - Hooks provide the flexibility to adapt the crawler to complex site structures or unique project needs. 22 | 23 | #### **2\. 
Overview of Available Hooks** 24 | 25 | - Crawl4AI offers seven key hooks to modify and control different stages in the crawling lifecycle: 26 | - `on_browser_created` 27 | - `on_user_agent_updated` 28 | - `on_execution_started` 29 | - `before_goto` 30 | - `after_goto` 31 | - `before_return_html` 32 | - `before_retrieve_html` 33 | 34 | #### **3\. Hook-by-Hook Explanation and Examples** 35 | 36 | * * * 37 | 38 | ##### **Hook 1: `on_browser_created`** 39 | 40 | - **Purpose**: Triggered right after the browser instance is created. 41 | - **Use Case**: 42 | - Initializing browser-specific settings or performing setup actions. 43 | - Configuring browser extensions or scripts before any page is opened. 44 | - **Example**: 45 | 46 | 47 | 48 | ```hljs python 49 | async def log_browser_creation(browser): 50 | print("Browser instance created:", browser) 51 | 52 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) 53 | 54 | ``` 55 | 56 | - **Explanation**: This hook logs the browser creation event, useful for tracking when a new browser instance starts. 57 | 58 | * * * 59 | 60 | ##### **Hook 2: `on_user_agent_updated`** 61 | 62 | - **Purpose**: Called whenever the user agent string is updated. 63 | - **Use Case**: 64 | - Modifying the user agent based on page requirements, e.g., changing to a mobile user agent for mobile-only pages. 65 | - **Example**: 66 | 67 | 68 | 69 | ```hljs scss 70 | def update_user_agent(user_agent): 71 | print(f"User Agent Updated: {user_agent}") 72 | 73 | crawler.crawler_strategy.set_hook('on_user_agent_updated', update_user_agent) 74 | crawler.update_user_agent("Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)") 75 | 76 | ``` 77 | 78 | - **Explanation**: This hook provides a callback every time the user agent changes, helpful for debugging or dynamically altering user agent settings based on conditions. 79 | 80 | * * * 81 | 82 | ##### **Hook 3: `on_execution_started`** 83 | 84 | - **Purpose**: Called right before the crawler begins any interaction (e.g., JavaScript execution, clicks). 85 | - **Use Case**: 86 | - Performing setup actions, such as inserting cookies or initiating custom scripts. 87 | - **Example**: 88 | 89 | 90 | 91 | ```hljs python 92 | async def log_execution_start(page): 93 | print("Execution started on page:", page.url) 94 | 95 | crawler.crawler_strategy.set_hook('on_execution_started', log_execution_start) 96 | 97 | ``` 98 | 99 | - **Explanation**: Logs the start of any major interaction on the page, ideal for cases where you want to monitor each interaction. 100 | 101 | * * * 102 | 103 | ##### **Hook 4: `before_goto`** 104 | 105 | - **Purpose**: Triggered before navigating to a new URL with `page.goto()`. 106 | - **Use Case**: 107 | - Modifying request headers or setting up conditions right before the page loads. 108 | - Adding headers or dynamically adjusting options for specific URLs. 109 | - **Example**: 110 | 111 | 112 | 113 | ```hljs python 114 | async def modify_headers_before_goto(page): 115 | await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"}) 116 | print("Custom headers set before navigation") 117 | 118 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) 119 | 120 | ``` 121 | 122 | - **Explanation**: This hook allows injecting headers or altering settings based on the page’s needs, particularly useful for pages with custom requirements. 
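- **Cookie variant** (a minimal sketch, assuming the same single-`page` hook signature used in the examples above and Playwright's standard `BrowserContext` API; the cookie name, value, and domain are hypothetical placeholders):

```hljs python
# Hypothetical example: attach a session cookie before navigation,
# matching the "add custom cookies" use case from this episode's intro.
async def add_auth_cookie_before_goto(page):
    await page.context.add_cookies([{
        "name": "session_id",           # hypothetical cookie name
        "value": "your-session-token",  # placeholder value
        "domain": "example.com",
        "path": "/",
    }])
    print("Auth cookie set before navigation")

crawler.crawler_strategy.set_hook('before_goto', add_auth_cookie_before_goto)
```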
123 | 124 | * * * 125 | 126 | ##### **Hook 5: `after_goto`** 127 | 128 | - **Purpose**: Executed immediately after a page has loaded (after `page.goto()`). 129 | - **Use Case**: 130 | - Checking the loaded page state, modifying the DOM, or performing post-navigation actions (e.g., scrolling). 131 | - **Example**: 132 | 133 | 134 | 135 | ```hljs python 136 | async def post_navigation_scroll(page): 137 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") 138 | print("Scrolled to the bottom after navigation") 139 | 140 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) 141 | 142 | ``` 143 | 144 | - **Explanation**: This hook scrolls to the bottom of the page after loading, which can help load dynamically added content like infinite scroll elements. 145 | 146 | * * * 147 | 148 | ##### **Hook 6: `before_return_html`** 149 | 150 | - **Purpose**: Called right before HTML content is retrieved and returned. 151 | - **Use Case**: 152 | - Removing overlays or cleaning up the page for a cleaner HTML extraction. 153 | - **Example**: 154 | 155 | 156 | 157 | ```hljs python 158 | async def remove_advertisements(page, html): 159 | await page.evaluate("document.querySelectorAll('.ad-banner').forEach(el => el.remove());") 160 | print("Advertisements removed before returning HTML") 161 | 162 | crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) 163 | 164 | ``` 165 | 166 | - **Explanation**: The hook removes ad banners from the HTML before it’s retrieved, ensuring a cleaner data extraction. 167 | 168 | * * * 169 | 170 | ##### **Hook 7: `before_retrieve_html`** 171 | 172 | - **Purpose**: Runs right before Crawl4AI initiates HTML retrieval. 173 | - **Use Case**: 174 | - Finalizing any page adjustments (e.g., setting timers, waiting for specific elements). 175 | - **Example**: 176 | 177 | 178 | 179 | ```hljs python 180 | async def wait_for_content_before_retrieve(page): 181 | await page.wait_for_selector('.main-content') 182 | print("Main content loaded, ready to retrieve HTML") 183 | 184 | crawler.crawler_strategy.set_hook('before_retrieve_html', wait_for_content_before_retrieve) 185 | 186 | ``` 187 | 188 | - **Explanation**: This hook waits for the main content to load before retrieving the HTML, ensuring that all essential content is captured. 189 | 190 | #### **4\. Setting Hooks in Crawl4AI** 191 | 192 | - **How to Set Hooks**: 193 | - Use `set_hook` to define a custom function for each hook. 194 | - Each hook function can be asynchronous (useful for actions like waiting or retrieving async data). 195 | - **Example Setup**: 196 | 197 | 198 | 199 | ```hljs bash 200 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) 201 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) 202 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) 203 | 204 | ``` 205 | 206 | 207 | #### **5\. Complete Example: Using Hooks for a Customized Crawl Workflow** 208 | 209 | - **Goal**: Log each key step, set custom headers before navigation, and clean up the page before retrieving HTML. 
210 | - **Example Code**: 211 | 212 | 213 | 214 | ```hljs python 215 | async def custom_crawl(): 216 | async with AsyncWebCrawler() as crawler: 217 | # Set hooks for custom workflow 218 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) 219 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) 220 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) 221 | crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) 222 | 223 | # Perform the crawl 224 | url = "https://example.com" 225 | result = await crawler.arun(url=url) 226 | print(result.html) # Display or process HTML 227 | 228 | ``` 229 | 230 | 231 | #### **6\. Benefits of Using Hooks in Custom Crawling Workflows** 232 | 233 | - **Enhanced Control**: Hooks offer precise control over each stage, allowing adjustments based on content and structure. 234 | - **Efficient Modifications**: Avoid reloading or restarting the session; hooks can alter actions dynamically. 235 | - **Context-Sensitive Actions**: Hooks enable custom logic tailored to specific pages or sections, maximizing extraction quality. 236 | 237 | #### **7\. Wrap Up & Next Steps** 238 | 239 | - Recap how hooks empower customized workflows in Crawl4AI, enabling flexibility at every stage. 240 | - Tease the next video: **Automating Post-Processing with Crawl4AI**, covering automated steps after data extraction. 241 | 242 | * * * 243 | 244 | This outline provides a thorough understanding of hooks, their practical applications, and examples for customizing the crawling workflow in Crawl4AI. 245 | 246 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/2 advanced features - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 2: Overview of Advanced Features 4 | 5 | ### Quick Intro 6 | 7 | A general overview of advanced features like hooks, CSS selectors, and JSON CSS extraction. 8 | 9 | Here's a condensed outline for an **Overview of Advanced Features** video covering Crawl4AI's powerful customization and extraction options: 10 | 11 | * * * 12 | 13 | ### **Overview of Advanced Features** 14 | 15 | 1) **Introduction to Advanced Features**: 16 | 17 | - Briefly introduce Crawl4AI’s advanced tools, which let users go beyond basic crawling to customize and fine-tune their scraping workflows. 18 | 19 | 2) **Taking Screenshots**: 20 | 21 | - Explain the screenshot capability for capturing page state and verifying content. 22 | - **Example**: 23 | 24 | 25 | 26 | ```hljs ini 27 | result = await crawler.arun(url="https://www.example.com", screenshot=True) 28 | 29 | ``` 30 | 31 | - Mention that screenshots are saved as a base64 string in `result`, allowing easy decoding and saving. 32 | 33 | 3) **Media and Link Extraction**: 34 | 35 | - Demonstrate how to pull all media (images, videos) and links (internal and external) from a page for deeper analysis or content gathering. 36 | - **Example**: 37 | 38 | 39 | 40 | ```hljs python 41 | result = await crawler.arun(url="https://www.example.com") 42 | print("Media:", result.media) 43 | print("Links:", result.links) 44 | 45 | ``` 46 | 47 | 48 | 4) **Custom User Agent**: 49 | 50 | - Show how to set a custom user agent to disguise the crawler or simulate specific devices/browsers. 
51 | - **Example**: 52 | 53 | 54 | 55 | ```hljs ini 56 | result = await crawler.arun(url="https://www.example.com", user_agent="Mozilla/5.0 (compatible; MyCrawler/1.0)") 57 | 58 | ``` 59 | 60 | 61 | 5) **Custom Hooks for Enhanced Control**: 62 | 63 | - Briefly cover how to use hooks, which allow custom actions like setting headers or handling login during the crawl. 64 | - **Example**: Setting a custom header with `before_get_url` hook. 65 | 66 | 67 | 68 | ```hljs python 69 | async def before_get_url(page): 70 | await page.set_extra_http_headers({"X-Test-Header": "test"}) 71 | 72 | ``` 73 | 74 | 75 | 6) **CSS Selectors for Targeted Extraction**: 76 | 77 | - Explain the use of CSS selectors to extract specific elements, ideal for structured data like articles or product details. 78 | - **Example**: 79 | 80 | 81 | 82 | ```hljs python 83 | result = await crawler.arun(url="https://www.example.com", css_selector="h2") 84 | print("H2 Tags:", result.extracted_content) 85 | 86 | ``` 87 | 88 | 89 | 7) **Crawling Inside Iframes**: 90 | 91 | - Mention how enabling `process_iframes=True` allows extracting content within iframes, useful for sites with embedded content or ads. 92 | - **Example**: 93 | 94 | 95 | 96 | ```hljs ini 97 | result = await crawler.arun(url="https://www.example.com", process_iframes=True) 98 | 99 | ``` 100 | 101 | 102 | 8) **Wrap-Up**: 103 | 104 | - Summarize these advanced features and how they allow users to customize every part of their web scraping experience. 105 | - Tease upcoming videos where each feature will be explored in detail. 106 | 107 | * * * 108 | 109 | This covers each advanced feature with a brief example, providing a useful overview to prepare viewers for the more in-depth videos. 110 | 111 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/3 browser setup - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 3: Browser Configurations & Headless Crawling 4 | 5 | ### Quick Intro 6 | 7 | Explain browser options ( `chromium`, `firefox`, `webkit`) and settings for headless mode, caching, and verbose logging. 8 | 9 | Here’s a streamlined outline for the **Browser Configurations & Headless Crawling** video: 10 | 11 | * * * 12 | 13 | ### **Browser Configurations & Headless Crawling** 14 | 15 | 1) **Overview of Browser Options**: 16 | 17 | - Crawl4AI supports three browser engines: 18 | - **Chromium** (default) - Highly compatible. 19 | - **Firefox** \- Great for specialized use cases. 20 | - **Webkit** \- Lightweight, ideal for basic needs. 21 | - **Example**: 22 | 23 | 24 | 25 | ```hljs ini 26 | # Using Chromium (default) 27 | crawler = AsyncWebCrawler(browser_type="chromium") 28 | 29 | # Using Firefox 30 | crawler = AsyncWebCrawler(browser_type="firefox") 31 | 32 | # Using WebKit 33 | crawler = AsyncWebCrawler(browser_type="webkit") 34 | 35 | ``` 36 | 37 | 38 | 2) **Headless Mode**: 39 | 40 | - Headless mode runs the browser without a visible GUI, making it faster and less resource-intensive. 
41 | - To enable or disable: 42 | 43 | 44 | 45 | ```hljs ini 46 | # Headless mode (default is True) 47 | crawler = AsyncWebCrawler(headless=True) 48 | 49 | # Disable headless mode for debugging 50 | crawler = AsyncWebCrawler(headless=False) 51 | 52 | ``` 53 | 54 | 55 | 3) **Verbose Logging**: 56 | \- Use `verbose=True` to get detailed logs for each action, useful for debugging: 57 | 58 | 59 | ```hljs ini 60 | crawler = AsyncWebCrawler(verbose=True) 61 | 62 | ``` 63 | 64 | 4) **Running a Basic Crawl with Configuration**: 65 | \- Example of a simple crawl with custom browser settings: 66 | 67 | 68 | ```hljs python 69 | async with AsyncWebCrawler(browser_type="firefox", headless=True, verbose=True) as crawler: 70 | result = await crawler.arun(url="https://www.example.com") 71 | print(result.markdown[:500]) # Show first 500 characters 72 | 73 | ``` 74 | 75 | \- This example uses Firefox in headless mode with logging enabled, demonstrating the flexibility of Crawl4AI’s setup. 76 | 77 | 5) **Recap & Next Steps**: 78 | \- Recap the power of selecting different browsers and running headless mode for speed and efficiency. 79 | \- Tease the next video: **Proxy & Security Settings** for navigating blocked or restricted content and protecting IP identity. 80 | 81 | * * * 82 | 83 | This breakdown covers browser configuration essentials in Crawl4AI, providing users with practical steps to optimize their scraping setup. 84 | 85 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/4 proxy settings - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 4: Advanced Proxy and Security Settings 4 | 5 | ### Quick Intro 6 | 7 | Showcase proxy configurations (HTTP, SOCKS5, authenticated proxies). Demo: Use rotating proxies and set custom headers to avoid IP blocking and enhance security. 8 | 9 | Here’s a focused outline for the **Proxy and Security Settings** video: 10 | 11 | * * * 12 | 13 | ### **Proxy & Security Settings** 14 | 15 | 1) **Why Use Proxies in Web Crawling**: 16 | 17 | - Proxies are essential for bypassing IP-based restrictions, improving anonymity, and managing rate limits. 18 | - Crawl4AI supports simple proxies, authenticated proxies, and proxy rotation for robust web scraping. 
19 | 20 | 2) **Basic Proxy Setup**: 21 | 22 | - **Using a Simple Proxy**: 23 | 24 | 25 | 26 | ```hljs ini 27 | # HTTP proxy 28 | crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080") 29 | 30 | # SOCKS proxy 31 | crawler = AsyncWebCrawler(proxy="socks5://proxy.example.com:1080") 32 | 33 | ``` 34 | 35 | 36 | 3) **Authenticated Proxies**: 37 | 38 | - Use `proxy_config` for proxies requiring a username and password: 39 | 40 | 41 | 42 | ```hljs makefile 43 | proxy_config = { 44 | "server": "http://proxy.example.com:8080", 45 | "username": "user", 46 | "password": "pass" 47 | } 48 | crawler = AsyncWebCrawler(proxy_config=proxy_config) 49 | 50 | ``` 51 | 52 | 53 | 4) **Rotating Proxies**: 54 | 55 | - Rotating proxies helps avoid IP bans by switching IP addresses for each request: 56 | 57 | 58 | 59 | ```hljs csharp 60 | async def get_next_proxy(): 61 | # Define proxy rotation logic here 62 | return {"server": "http://next.proxy.com:8080"} 63 | 64 | async with AsyncWebCrawler() as crawler: 65 | for url in urls: 66 | proxy = await get_next_proxy() 67 | crawler.update_proxy(proxy) 68 | result = await crawler.arun(url=url) 69 | 70 | ``` 71 | 72 | - This setup periodically switches the proxy for enhanced security and access. 73 | 74 | 5) **Custom Headers for Additional Security**: 75 | 76 | - Set custom headers to mask the crawler’s identity and avoid detection: 77 | 78 | 79 | 80 | ```hljs makefile 81 | headers = { 82 | "X-Forwarded-For": "203.0.113.195", 83 | "Accept-Language": "en-US,en;q=0.9", 84 | "Cache-Control": "no-cache", 85 | "Pragma": "no-cache" 86 | } 87 | crawler = AsyncWebCrawler(headers=headers) 88 | 89 | ``` 90 | 91 | 92 | 6) **Combining Proxies with Magic Mode for Anti-Bot Protection**: 93 | 94 | - For sites with aggressive bot detection, combine `proxy` settings with `magic=True`: 95 | 96 | 97 | 98 | ```hljs csharp 99 | async with AsyncWebCrawler(proxy="http://proxy.example.com:8080", headers={"Accept-Language": "en-US"}) as crawler: 100 | result = await crawler.arun( 101 | url="https://example.com", 102 | magic=True # Enables anti-detection features 103 | ) 104 | 105 | ``` 106 | 107 | - **Magic Mode** automatically enables user simulation, random timing, and browser property masking. 108 | 109 | 7) **Wrap Up & Next Steps**: 110 | 111 | - Summarize the importance of proxies and anti-detection in accessing restricted content and avoiding bans. 112 | - Tease the next video: **JavaScript Execution and Handling Dynamic Content** for working with interactive and dynamically loaded pages. 113 | 114 | * * * 115 | 116 | This outline provides a practical guide to setting up proxies and security configurations, empowering users to navigate restricted sites while staying undetected. 117 | 118 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/5 dynamic content - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 5: JavaScript Execution and Dynamic Content Handling 4 | 5 | ### Quick Intro 6 | 7 | Explain JavaScript code injection with examples (e.g., simulating scrolling, clicking ‘load more’). Demo: Extract content from a page that uses dynamic loading with lazy-loaded images. 
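For the lazy-loaded-images part of that demo, a minimal sketch might look like this (the URL is hypothetical, and the assumption is that the page defers images via `data-src` attributes; `js_code`, `wait_for`, and `delay_before_return_html` are the same parameters covered later in this outline and in the media-handling episode):

```hljs python
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com/gallery",  # hypothetical page with lazy-loaded images
        js_code="window.scrollTo(0, document.body.scrollHeight);",  # trigger lazy loading
        wait_for="css:img[data-src]",  # wait until deferred images appear in the DOM
        delay_before_return_html=2.0  # give the images extra time to resolve
    )
    for image in result.media["images"]:
        print(image["src"], image.get("alt"))

```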
8 | 9 | Here’s a focused outline for the **JavaScript Execution and Dynamic Content Handling** video: 10 | 11 | * * * 12 | 13 | ### **JavaScript Execution & Dynamic Content Handling** 14 | 15 | 1) **Why JavaScript Execution Matters**: 16 | 17 | - Many modern websites load content dynamically via JavaScript, requiring special handling to access all elements. 18 | - Crawl4AI can execute JavaScript on pages, enabling it to interact with elements like “load more” buttons, infinite scrolls, and content that appears only after certain actions. 19 | 20 | 2) **Basic JavaScript Execution**: 21 | 22 | - Use `js_code` to execute JavaScript commands on a page: 23 | 24 | 25 | 26 | ```hljs makefile 27 | # Scroll to bottom of the page 28 | result = await crawler.arun( 29 | url="https://example.com", 30 | js_code="window.scrollTo(0, document.body.scrollHeight);" 31 | ) 32 | 33 | ``` 34 | 35 | - This command scrolls to the bottom, triggering any lazy-loaded or dynamically added content. 36 | 37 | 3) **Multiple Commands & Simulating Clicks**: 38 | 39 | - Combine multiple JavaScript commands to interact with elements like “load more” buttons: 40 | 41 | 42 | 43 | ```hljs makefile 44 | js_commands = [\ 45 | "window.scrollTo(0, document.body.scrollHeight);",\ 46 | "document.querySelector('.load-more').click();"\ 47 | ] 48 | result = await crawler.arun( 49 | url="https://example.com", 50 | js_code=js_commands 51 | ) 52 | 53 | ``` 54 | 55 | - This script scrolls down and then clicks the “load more” button, useful for loading additional content blocks. 56 | 57 | 4) **Waiting for Dynamic Content**: 58 | 59 | - Use `wait_for` to ensure the page loads specific elements before proceeding: 60 | 61 | 62 | 63 | ```hljs makefile 64 | result = await crawler.arun( 65 | url="https://example.com", 66 | js_code="window.scrollTo(0, document.body.scrollHeight);", 67 | wait_for="css:.dynamic-content" # Wait for elements with class `.dynamic-content` 68 | ) 69 | 70 | ``` 71 | 72 | - This example waits until elements with `.dynamic-content` are loaded, helping to capture content that appears after JavaScript actions. 73 | 74 | 5) **Handling Complex Dynamic Content (e.g., Infinite Scroll)**: 75 | 76 | - Combine JavaScript execution with conditional waiting to handle infinite scrolls or paginated content: 77 | 78 | 79 | 80 | ```hljs csharp 81 | result = await crawler.arun( 82 | url="https://example.com", 83 | js_code=[\ 84 | "window.scrollTo(0, document.body.scrollHeight);",\ 85 | "const loadMore = document.querySelector('.load-more'); if (loadMore) loadMore.click();"\ 86 | ], 87 | wait_for="js:() => document.querySelectorAll('.item').length > 10" # Wait until 10 items are loaded 88 | ) 89 | 90 | ``` 91 | 92 | - This example scrolls and clicks "load more" repeatedly, waiting each time for a specified number of items to load. 
93 | 94 | 6) **Complete Example: Dynamic Content Handling with Extraction**: 95 | 96 | - Full example demonstrating a dynamic load and content extraction in one process: 97 | 98 | 99 | 100 | ```hljs csharp 101 | async with AsyncWebCrawler() as crawler: 102 | result = await crawler.arun( 103 | url="https://example.com", 104 | js_code=[\ 105 | "window.scrollTo(0, document.body.scrollHeight);",\ 106 | "document.querySelector('.load-more').click();"\ 107 | ], 108 | wait_for="css:.main-content", 109 | css_selector=".main-content" 110 | ) 111 | print(result.markdown[:500]) # Output the main content extracted 112 | 113 | ``` 114 | 115 | 116 | 7) **Wrap Up & Next Steps**: 117 | 118 | - Recap how JavaScript execution allows access to dynamic content, enabling powerful interactions. 119 | - Tease the next video: **Content Cleaning and Fit Markdown** to show how Crawl4AI can extract only the most relevant content from complex pages. 120 | 121 | * * * 122 | 123 | This outline explains how to handle dynamic content and JavaScript-based interactions effectively, enabling users to scrape and interact with complex, modern websites. 124 | 125 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/6 magic mode - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 6: Magic Mode and Anti-Bot Protection 4 | 5 | ### Quick Intro 6 | 7 | Highlight `Magic Mode` and anti-bot features like user simulation, navigator overrides, and timing randomization. Demo: Access a site with anti-bot protection and show how `Magic Mode` seamlessly handles it. 8 | 9 | Here’s a concise outline for the **Magic Mode and Anti-Bot Protection** video: 10 | 11 | * * * 12 | 13 | ### **Magic Mode & Anti-Bot Protection** 14 | 15 | 1) **Why Anti-Bot Protection is Important**: 16 | 17 | - Many websites use bot detection mechanisms to block automated scraping. Crawl4AI’s anti-detection features help avoid IP bans, CAPTCHAs, and access restrictions. 18 | - **Magic Mode** is a one-step solution to enable a range of anti-bot features without complex configuration. 19 | 20 | 2) **Enabling Magic Mode**: 21 | 22 | - Simply set `magic=True` to activate Crawl4AI’s full anti-bot suite: 23 | 24 | 25 | 26 | ```hljs python 27 | result = await crawler.arun( 28 | url="https://example.com", 29 | magic=True # Enables all anti-detection features 30 | ) 31 | 32 | ``` 33 | 34 | - This enables a blend of stealth techniques, including masking automation signals, randomizing timings, and simulating real user behavior. 35 | 36 | 3) **What Magic Mode Does Behind the Scenes**: 37 | 38 | - **User Simulation**: Mimics human actions like mouse movements and scrolling. 39 | - **Navigator Overrides**: Hides signals that indicate an automated browser. 40 | - **Timing Randomization**: Adds random delays to simulate natural interaction patterns. 41 | - **Cookie Handling**: Accepts and manages cookies dynamically to avoid triggers from cookie pop-ups. 
42 | 43 | 4) **Manual Anti-Bot Options (If Not Using Magic Mode)**: 44 | 45 | - For granular control, you can configure individual settings without Magic Mode: 46 | 47 | 48 | 49 | ```hljs python 50 | result = await crawler.arun( 51 | url="https://example.com", 52 | simulate_user=True, # Enables human-like behavior 53 | override_navigator=True # Hides automation fingerprints 54 | ) 55 | 56 | ``` 57 | 58 | - **Use Cases**: This approach allows more specific adjustments when certain anti-bot features are needed but others are not. 59 | 60 | 5) **Combining Proxies with Magic Mode**: 61 | 62 | - To avoid rate limits or IP blocks, combine Magic Mode with a proxy: 63 | 64 | 65 | 66 | ```hljs csharp 67 | async with AsyncWebCrawler( 68 | proxy="http://proxy.example.com:8080", 69 | headers={"Accept-Language": "en-US"} 70 | ) as crawler: 71 | result = await crawler.arun( 72 | url="https://example.com", 73 | magic=True # Full anti-detection 74 | ) 75 | 76 | ``` 77 | 78 | - This setup maximizes stealth by pairing anti-bot detection with IP obfuscation. 79 | 80 | 6) **Example of Anti-Bot Protection in Action**: 81 | 82 | - Full example with Magic Mode and proxies to scrape a protected page: 83 | 84 | 85 | 86 | ```hljs python 87 | async with AsyncWebCrawler() as crawler: 88 | result = await crawler.arun( 89 | url="https://example.com/protected-content", 90 | magic=True, 91 | proxy="http://proxy.example.com:8080", 92 | wait_for="css:.content-loaded" # Wait for the main content to load 93 | ) 94 | print(result.markdown[:500]) # Display first 500 characters of the content 95 | 96 | ``` 97 | 98 | - This example ensures seamless access to protected content by combining anti-detection and waiting for full content load. 99 | 100 | 7) **Wrap Up & Next Steps**: 101 | 102 | - Recap the power of Magic Mode and anti-bot features for handling restricted websites. 103 | - Tease the next video: **Content Cleaning and Fit Markdown** to show how to extract clean and focused content from a page. 104 | 105 | * * * 106 | 107 | This outline shows users how to easily avoid bot detection and access restricted content, demonstrating both the power and simplicity of Magic Mode in Crawl4AI. 108 | 109 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/7 content cleaning - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 7: Content Cleaning and Fit Markdown 4 | 5 | ### Quick Intro 6 | 7 | Explain content cleaning options, including `fit_markdown` to keep only the most relevant content. Demo: Extract and compare regular vs. fit markdown from a news site or blog. 8 | 9 | Here’s a streamlined outline for the **Content Cleaning and Fit Markdown** video: 10 | 11 | * * * 12 | 13 | ### **Content Cleaning & Fit Markdown** 14 | 15 | 1) **Overview of Content Cleaning in Crawl4AI**: 16 | 17 | - Explain that web pages often include extra elements like ads, navigation bars, footers, and popups. 18 | - Crawl4AI’s content cleaning features help extract only the main content, reducing noise and enhancing readability. 
19 | 20 | 2) **Basic Content Cleaning Options**: 21 | 22 | - **Removing Unwanted Elements**: Exclude specific HTML tags, like forms or navigation bars: 23 | 24 | 25 | 26 | ```hljs python 27 | result = await crawler.arun( 28 | url="https://example.com", 29 | word_count_threshold=10, # Filter out blocks with fewer than 10 words 30 | excluded_tags=['form', 'nav'], # Exclude specific tags 31 | remove_overlay_elements=True # Remove popups and modals 32 | ) 33 | 34 | ``` 35 | 36 | - This example extracts content while excluding forms, navigation, and modal overlays, ensuring clean results. 37 | 38 | 3) **Fit Markdown for Main Content Extraction**: 39 | 40 | - **What is Fit Markdown**: Uses advanced analysis to identify the most relevant content (ideal for articles, blogs, and documentation). 41 | - **How it Works**: Analyzes content density, removes boilerplate elements, and maintains formatting for a clear output. 42 | - **Example**: 43 | 44 | 45 | 46 | ```hljs makefile 47 | result = await crawler.arun(url="https://example.com") 48 | main_content = result.fit_markdown # Extracted main content 49 | print(main_content[:500]) # Display first 500 characters 50 | 51 | ``` 52 | 53 | - Fit Markdown is especially helpful for long-form content like news articles or blog posts. 54 | 55 | 4) **Comparing Fit Markdown with Regular Markdown**: 56 | 57 | - **Fit Markdown** returns the primary content without extraneous elements. 58 | - **Regular Markdown** includes all extracted text in markdown format. 59 | - Example to show the difference: 60 | 61 | 62 | 63 | ```hljs python 64 | all_content = result.markdown # Full markdown 65 | main_content = result.fit_markdown # Only the main content 66 | 67 | print(f"All Content Length: {len(all_content)}") 68 | print(f"Main Content Length: {len(main_content)}") 69 | 70 | ``` 71 | 72 | - This comparison shows the effectiveness of Fit Markdown in focusing on essential content. 73 | 74 | 5) **Media and Metadata Handling with Content Cleaning**: 75 | 76 | - **Media Extraction**: Crawl4AI captures images and videos with metadata like alt text, descriptions, and relevance scores: 77 | 78 | 79 | 80 | ```hljs python 81 | for image in result.media["images"]: 82 | print(f"Source: {image['src']}, Alt Text: {image['alt']}, Relevance Score: {image['score']}") 83 | 84 | ``` 85 | 86 | - **Use Case**: Useful for saving only relevant images or videos from an article or content-heavy page. 87 | 88 | 6) **Example of Clean Content Extraction in Action**: 89 | 90 | - Full example extracting cleaned content and Fit Markdown: 91 | 92 | 93 | 94 | ```hljs csharp 95 | async with AsyncWebCrawler() as crawler: 96 | result = await crawler.arun( 97 | url="https://example.com", 98 | word_count_threshold=10, 99 | excluded_tags=['nav', 'footer'], 100 | remove_overlay_elements=True 101 | ) 102 | print(result.fit_markdown[:500]) # Show main content 103 | 104 | ``` 105 | 106 | - This example demonstrates content cleaning with settings for filtering noise and focusing on the core text. 107 | 108 | 7) **Wrap Up & Next Steps**: 109 | 110 | - Summarize the power of Crawl4AI’s content cleaning features and Fit Markdown for capturing clean, relevant content. 111 | - Tease the next video: **Link Analysis and Smart Filtering** to focus on analyzing and filtering links within crawled pages. 112 | 113 | * * * 114 | 115 | This outline covers Crawl4AI’s content cleaning features and the unique benefits of Fit Markdown, showing users how to retrieve focused, high-quality content from web pages. 
116 | 117 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/8 media handling - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 8: Media Handling: Images, Videos, and Audio 4 | 5 | ### Quick Intro 6 | 7 | Showcase Crawl4AI’s media extraction capabilities, including lazy-loaded media and metadata. Demo: Crawl a multimedia page, extract images, and show metadata (alt text, context, relevance score). 8 | 9 | Here’s a clear and focused outline for the **Media Handling: Images, Videos, and Audio** video: 10 | 11 | * * * 12 | 13 | ### **Media Handling: Images, Videos, and Audio** 14 | 15 | 1) **Overview of Media Extraction in Crawl4AI**: 16 | 17 | - Crawl4AI can detect and extract different types of media (images, videos, and audio) along with useful metadata. 18 | - This functionality is essential for gathering visual content from multimedia-heavy pages like e-commerce sites, news articles, and social media feeds. 19 | 20 | 2) **Image Extraction and Metadata**: 21 | 22 | - Crawl4AI captures images with detailed metadata, including: 23 | - **Source URL**: The direct URL to the image. 24 | - **Alt Text**: Image description if available. 25 | - **Relevance Score**: A score (0–10) indicating how relevant the image is to the main content. 26 | - **Context**: Text surrounding the image on the page. 27 | - **Example**: 28 | 29 | 30 | 31 | ```hljs python 32 | result = await crawler.arun(url="https://example.com") 33 | 34 | for image in result.media["images"]: 35 | print(f"Source: {image['src']}") 36 | print(f"Alt Text: {image['alt']}") 37 | print(f"Relevance Score: {image['score']}") 38 | print(f"Context: {image['context']}") 39 | 40 | ``` 41 | 42 | - This example shows how to access each image’s metadata, making it easy to filter for the most relevant visuals. 43 | 44 | 3) **Handling Lazy-Loaded Images**: 45 | 46 | - Crawl4AI automatically supports lazy-loaded images, which are commonly used to optimize webpage loading. 47 | - **Example with Wait for Lazy-Loaded Content**: 48 | 49 | 50 | 51 | ```hljs makefile 52 | result = await crawler.arun( 53 | url="https://example.com", 54 | wait_for="css:img[data-src]", # Wait for lazy-loaded images 55 | delay_before_return_html=2.0 # Allow extra time for images to load 56 | ) 57 | 58 | ``` 59 | 60 | - This setup waits for lazy-loaded images to appear, ensuring they are fully captured. 61 | 62 | 4) **Video Extraction and Metadata**: 63 | 64 | - Crawl4AI captures video elements, including: 65 | - **Source URL**: The video’s direct URL. 66 | - **Type**: Format of the video (e.g., MP4). 67 | - **Thumbnail**: A poster or thumbnail image if available. 68 | - **Duration**: Video length, if metadata is provided. 69 | - **Example**: 70 | 71 | 72 | 73 | ```hljs python 74 | for video in result.media["videos"]: 75 | print(f"Video Source: {video['src']}") 76 | print(f"Type: {video['type']}") 77 | print(f"Thumbnail: {video.get('poster')}") 78 | print(f"Duration: {video.get('duration')}") 79 | 80 | ``` 81 | 82 | - This allows users to gather video content and relevant details for further processing or analysis. 83 | 84 | 5) **Audio Extraction and Metadata**: 85 | 86 | - Audio elements can also be extracted, with metadata like: 87 | - **Source URL**: The audio file’s direct URL. 88 | - **Type**: Format of the audio file (e.g., MP3). 89 | - **Duration**: Length of the audio, if available. 
90 | - **Example**: 91 | 92 | 93 | 94 | ```hljs python 95 | for audio in result.media["audios"]: 96 | print(f"Audio Source: {audio['src']}") 97 | print(f"Type: {audio['type']}") 98 | print(f"Duration: {audio.get('duration')}") 99 | 100 | ``` 101 | 102 | - Useful for sites with podcasts, sound bites, or other audio content. 103 | 104 | 6) **Filtering Media by Relevance**: 105 | 106 | - Use metadata like relevance score to filter only the most useful media content: 107 | 108 | 109 | 110 | ```hljs ini 111 | relevant_images = [img for img in result.media["images"] if img['score'] > 5] 112 | 113 | ``` 114 | 115 | - This is especially helpful for content-heavy pages where you only want media directly related to the main content. 116 | 117 | 7) **Example: Full Media Extraction with Content Filtering**: 118 | 119 | - Full example extracting images, videos, and audio along with filtering by relevance: 120 | 121 | 122 | 123 | ```hljs python 124 | async with AsyncWebCrawler() as crawler: 125 | result = await crawler.arun( 126 | url="https://example.com", 127 | word_count_threshold=10, # Filter content blocks for relevance 128 | exclude_external_images=True # Only keep internal images 129 | ) 130 | 131 | # Display media summaries 132 | print(f"Relevant Images: {len(relevant_images)}") 133 | print(f"Videos: {len(result.media['videos'])}") 134 | print(f"Audio Clips: {len(result.media['audios'])}") 135 | 136 | ``` 137 | 138 | - This example shows how to capture and filter various media types, focusing on what’s most relevant. 139 | 140 | 8) **Wrap Up & Next Steps**: 141 | 142 | - Recap the comprehensive media extraction capabilities, emphasizing how metadata helps users focus on relevant content. 143 | - Tease the next video: **Link Analysis and Smart Filtering** to explore how Crawl4AI handles internal, external, and social media links for more focused data gathering. 144 | 145 | * * * 146 | 147 | This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. 148 | 149 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/9 link analysis - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | ## Episode 9: Link Analysis and Smart Filtering 4 | 5 | ### Quick Intro 6 | 7 | Walk through internal and external link classification, social media link filtering, and custom domain exclusion. Demo: Analyze links on a website, focusing on internal navigation vs. external or ad links. 8 | 9 | Here’s a focused outline for the **Link Analysis and Smart Filtering** video: 10 | 11 | * * * 12 | 13 | ### **Link Analysis & Smart Filtering** 14 | 15 | 1) **Importance of Link Analysis in Web Crawling**: 16 | 17 | - Explain that web pages often contain numerous links, including internal links, external links, social media links, and ads. 18 | - Crawl4AI’s link analysis and filtering options help extract only relevant links, enabling more targeted and efficient crawls. 19 | 20 | 2) **Automatic Link Classification**: 21 | 22 | - Crawl4AI categorizes links automatically into internal, external, and social media links. 
23 | - **Example**: 24 | 25 | 26 | 27 | ```hljs makefile 28 | result = await crawler.arun(url="https://example.com") 29 | 30 | # Access internal and external links 31 | internal_links = result.links["internal"] 32 | external_links = result.links["external"] 33 | 34 | # Print first few links for each type 35 | print("Internal Links:", internal_links[:3]) 36 | print("External Links:", external_links[:3]) 37 | 38 | ``` 39 | 40 | 41 | 3) **Filtering Out Unwanted Links**: 42 | 43 | - **Exclude External Links**: Remove all links pointing to external sites. 44 | - **Exclude Social Media Links**: Filter out social media domains like Facebook or Twitter. 45 | - **Example**: 46 | 47 | 48 | 49 | ```hljs python 50 | result = await crawler.arun( 51 | url="https://example.com", 52 | exclude_external_links=True, # Remove external links 53 | exclude_social_media_links=True # Remove social media links 54 | ) 55 | 56 | ``` 57 | 58 | 59 | 4) **Custom Domain Filtering**: 60 | 61 | - **Exclude Specific Domains**: Filter links from particular domains, e.g., ad sites. 62 | - **Custom Social Media Domains**: Add additional social media domains if needed. 63 | - **Example**: 64 | 65 | 66 | 67 | ```hljs csharp 68 | result = await crawler.arun( 69 | url="https://example.com", 70 | exclude_domains=["ads.com", "trackers.com"], 71 | exclude_social_media_domains=["facebook.com", "linkedin.com"] 72 | ) 73 | 74 | ``` 75 | 76 | 77 | 5) **Accessing Link Context and Metadata**: 78 | 79 | - Crawl4AI provides additional metadata for each link, including its text, type (e.g., navigation or content), and surrounding context. 80 | - **Example**: 81 | 82 | 83 | 84 | ```hljs python 85 | for link in result.links["internal"]: 86 | print(f"Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") 87 | 88 | ``` 89 | 90 | - **Use Case**: Helps users understand the relevance of links based on where they are placed on the page (e.g., navigation vs. article content). 91 | 92 | 6) **Example of Comprehensive Link Filtering and Analysis**: 93 | 94 | - Full example combining link filtering, metadata access, and contextual information: 95 | 96 | 97 | 98 | ```hljs python 99 | async with AsyncWebCrawler() as crawler: 100 | result = await crawler.arun( 101 | url="https://example.com", 102 | exclude_external_links=True, 103 | exclude_social_media_links=True, 104 | exclude_domains=["ads.com"], 105 | css_selector=".main-content" # Focus only on main content area 106 | ) 107 | for link in result.links["internal"]: 108 | print(f"Internal Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") 109 | 110 | ``` 111 | 112 | - This example filters unnecessary links, keeping only internal and relevant links from the main content area. 113 | 114 | 7) **Wrap Up & Next Steps**: 115 | 116 | - Summarize the benefits of link filtering for efficient crawling and relevant content extraction. 117 | - Tease the next video: **Custom Headers, Identity Management, and User Simulation** to explain how to configure identity settings and simulate user behavior for stealthier crawls. 118 | 119 | * * * 120 | 121 | This outline provides a practical overview of Crawl4AI’s link analysis and filtering features, helping users target only essential links while eliminating distractions. 
122 | 123 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/asyncwebcrawlerarun - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Complete Parameter Guide for arun() 2 | 3 | The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality. 4 | 5 | ## Core Parameters 6 | 7 | ```hljs graphql 8 | await crawler.arun( 9 | url="https://example.com", # Required: URL to crawl 10 | verbose=True, # Enable detailed logging 11 | bypass_cache=False, # Skip cache for this request 12 | warmup=True # Whether to run warmup check 13 | ) 14 | 15 | ``` 16 | 17 | ## Content Processing Parameters 18 | 19 | ### Text Processing 20 | 21 | ```hljs python 22 | await crawler.arun( 23 | word_count_threshold=10, # Minimum words per content block 24 | image_description_min_word_threshold=5, # Minimum words for image descriptions 25 | only_text=False, # Extract only text content 26 | excluded_tags=['form', 'nav'], # HTML tags to exclude 27 | keep_data_attributes=False, # Preserve data-* attributes 28 | ) 29 | 30 | ``` 31 | 32 | ### Content Selection 33 | 34 | ```hljs python 35 | await crawler.arun( 36 | css_selector=".main-content", # CSS selector for content extraction 37 | remove_forms=True, # Remove all form elements 38 | remove_overlay_elements=True, # Remove popups/modals/overlays 39 | ) 40 | 41 | ``` 42 | 43 | ### Link Handling 44 | 45 | ```hljs python 46 | await crawler.arun( 47 | exclude_external_links=True, # Remove external links 48 | exclude_social_media_links=True, # Remove social media links 49 | exclude_external_images=True, # Remove external images 50 | exclude_domains=["ads.example.com"], # Specific domains to exclude 51 | social_media_domains=[ # Additional social media domains\ 52 | "facebook.com",\ 53 | "twitter.com",\ 54 | "instagram.com"\ 55 | ] 56 | ) 57 | 58 | ``` 59 | 60 | ## Browser Control Parameters 61 | 62 | ### Basic Browser Settings 63 | 64 | ```hljs python 65 | await crawler.arun( 66 | headless=True, # Run browser in headless mode 67 | browser_type="chromium", # Browser engine: "chromium", "firefox", "webkit" 68 | page_timeout=60000, # Page load timeout in milliseconds 69 | user_agent="custom-agent", # Custom user agent 70 | ) 71 | 72 | ``` 73 | 74 | ### Navigation and Waiting 75 | 76 | ```hljs python 77 | await crawler.arun( 78 | wait_for="css:.dynamic-content", # Wait for element/condition 79 | delay_before_return_html=2.0, # Wait before returning HTML (seconds) 80 | ) 81 | 82 | ``` 83 | 84 | ### JavaScript Execution 85 | 86 | ```hljs graphql 87 | await crawler.arun( 88 | js_code=[ # JavaScript to execute (string or list)\ 89 | "window.scrollTo(0, document.body.scrollHeight);",\ 90 | "document.querySelector('.load-more').click();"\ 91 | ], 92 | js_only=False, # Only execute JavaScript without reloading page 93 | ) 94 | 95 | ``` 96 | 97 | ### Anti-Bot Features 98 | 99 | ```hljs python 100 | await crawler.arun( 101 | magic=True, # Enable all anti-detection features 102 | simulate_user=True, # Simulate human behavior 103 | override_navigator=True # Override navigator properties 104 | ) 105 | 106 | ``` 107 | 108 | ### Session Management 109 | 110 | ```hljs python 111 | await crawler.arun( 112 | session_id="my_session", # Session identifier for persistent browsing 113 | ) 114 | 115 | ``` 116 | 117 | ### Screenshot Options 118 | 119 | ```hljs python 120 | await crawler.arun( 121 | screenshot=True, # Take page 
screenshot 122 | screenshot_wait_for=2.0, # Wait before screenshot (seconds) 123 | ) 124 | 125 | ``` 126 | 127 | ### Proxy Configuration 128 | 129 | ```hljs csharp 130 | await crawler.arun( 131 | proxy="http://proxy.example.com:8080", # Simple proxy URL 132 | proxy_config={ # Advanced proxy settings 133 | "server": "http://proxy.example.com:8080", 134 | "username": "user", 135 | "password": "pass" 136 | } 137 | ) 138 | 139 | ``` 140 | 141 | ## Content Extraction Parameters 142 | 143 | ### Extraction Strategy 144 | 145 | ```hljs graphql 146 | await crawler.arun( 147 | extraction_strategy=LLMExtractionStrategy( 148 | provider="ollama/llama2", 149 | schema=MySchema.schema(), 150 | instruction="Extract specific data" 151 | ) 152 | ) 153 | 154 | ``` 155 | 156 | ### Chunking Strategy 157 | 158 | ```hljs python 159 | await crawler.arun( 160 | chunking_strategy=RegexChunking( 161 | patterns=[r'\n\n', r'\.\s+'] 162 | ) 163 | ) 164 | 165 | ``` 166 | 167 | ### HTML to Text Options 168 | 169 | ```hljs python 170 | await crawler.arun( 171 | html2text={ 172 | "ignore_links": False, 173 | "ignore_images": False, 174 | "escape_dot": False, 175 | "body_width": 0, 176 | "protect_links": True, 177 | "unicode_snob": True 178 | } 179 | ) 180 | 181 | ``` 182 | 183 | ## Debug Options 184 | 185 | ```hljs python 186 | await crawler.arun( 187 | log_console=True, # Log browser console messages 188 | ) 189 | 190 | ``` 191 | 192 | ## Parameter Interactions and Notes 193 | 194 | 1. **Magic Mode Combinations** 195 | 196 | 197 | 198 | ```hljs python 199 | # Full anti-detection setup 200 | await crawler.arun( 201 | magic=True, 202 | headless=False, 203 | simulate_user=True, 204 | override_navigator=True 205 | ) 206 | 207 | ``` 208 | 209 | 2. **Dynamic Content Handling** 210 | 211 | 212 | 213 | ```hljs csharp 214 | # Handle lazy-loaded content 215 | await crawler.arun( 216 | js_code="window.scrollTo(0, document.body.scrollHeight);", 217 | wait_for="css:.lazy-content", 218 | delay_before_return_html=2.0 219 | ) 220 | 221 | ``` 222 | 223 | 3. **Content Extraction Pipeline** 224 | 225 | 226 | 227 | ```hljs python 228 | # Complete extraction setup 229 | await crawler.arun( 230 | css_selector=".main-content", 231 | word_count_threshold=20, 232 | extraction_strategy=my_strategy, 233 | chunking_strategy=my_chunking, 234 | process_iframes=True, 235 | remove_overlay_elements=True 236 | ) 237 | 238 | ``` 239 | 240 | 241 | ## Best Practices 242 | 243 | 1. **Performance Optimization** 244 | 245 | 246 | 247 | ```hljs python 248 | await crawler.arun( 249 | bypass_cache=False, # Use cache when possible 250 | word_count_threshold=10, # Filter out noise 251 | process_iframes=False # Skip iframes if not needed 252 | ) 253 | 254 | ``` 255 | 256 | 2. **Reliable Scraping** 257 | 258 | 259 | 260 | ```hljs python 261 | await crawler.arun( 262 | magic=True, # Enable anti-detection 263 | delay_before_return_html=1.0, # Wait for dynamic content 264 | page_timeout=60000 # Longer timeout for slow pages 265 | ) 266 | 267 | ``` 268 | 269 | 3. 
**Clean Content** 270 | 271 | 272 | 273 | ```hljs python 274 | await crawler.arun( 275 | remove_overlay_elements=True, # Remove popups 276 | excluded_tags=['nav', 'aside'],# Remove unnecessary elements 277 | keep_data_attributes=False # Remove data attributes 278 | ) 279 | 280 | ``` 281 | 282 | 283 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/browser configuration - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Browser Configuration 2 | 3 | Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior. 4 | 5 | ## Browser Types 6 | 7 | Choose from three browser engines: 8 | 9 | ```hljs csharp 10 | # Chromium (default) 11 | async with AsyncWebCrawler(browser_type="chromium") as crawler: 12 | result = await crawler.arun(url="https://example.com") 13 | 14 | # Firefox 15 | async with AsyncWebCrawler(browser_type="firefox") as crawler: 16 | result = await crawler.arun(url="https://example.com") 17 | 18 | # WebKit 19 | async with AsyncWebCrawler(browser_type="webkit") as crawler: 20 | result = await crawler.arun(url="https://example.com") 21 | 22 | ``` 23 | 24 | ## Basic Configuration 25 | 26 | Common browser settings: 27 | 28 | ```hljs python 29 | async with AsyncWebCrawler( 30 | headless=True, # Run in headless mode (no GUI) 31 | verbose=True, # Enable detailed logging 32 | sleep_on_close=False # No delay when closing browser 33 | ) as crawler: 34 | result = await crawler.arun(url="https://example.com") 35 | 36 | ``` 37 | 38 | ## Identity Management 39 | 40 | Control how your crawler appears to websites: 41 | 42 | ```hljs csharp 43 | # Custom user agent 44 | async with AsyncWebCrawler( 45 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 46 | ) as crawler: 47 | result = await crawler.arun(url="https://example.com") 48 | 49 | # Custom headers 50 | headers = { 51 | "Accept-Language": "en-US,en;q=0.9", 52 | "Cache-Control": "no-cache" 53 | } 54 | async with AsyncWebCrawler(headers=headers) as crawler: 55 | result = await crawler.arun(url="https://example.com") 56 | 57 | ``` 58 | 59 | ## Screenshot Capabilities 60 | 61 | Capture page screenshots with enhanced error handling: 62 | 63 | ```hljs python 64 | result = await crawler.arun( 65 | url="https://example.com", 66 | screenshot=True, # Enable screenshot 67 | screenshot_wait_for=2.0 # Wait 2 seconds before capture 68 | ) 69 | 70 | if result.screenshot: # Base64 encoded image 71 | import base64 72 | with open("screenshot.png", "wb") as f: 73 | f.write(base64.b64decode(result.screenshot)) 74 | 75 | ``` 76 | 77 | ## Timeouts and Waiting 78 | 79 | Control page loading behavior: 80 | 81 | ```hljs makefile 82 | result = await crawler.arun( 83 | url="https://example.com", 84 | page_timeout=60000, # Page load timeout (ms) 85 | delay_before_return_html=2.0, # Wait before content capture 86 | wait_for="css:.dynamic-content" # Wait for specific element 87 | ) 88 | 89 | ``` 90 | 91 | ## JavaScript Execution 92 | 93 | Execute custom JavaScript before crawling: 94 | 95 | ```hljs makefile 96 | # Single JavaScript command 97 | result = await crawler.arun( 98 | url="https://example.com", 99 | js_code="window.scrollTo(0, document.body.scrollHeight);" 100 | ) 101 | 102 | # Multiple commands 103 | js_commands = [\ 104 | "window.scrollTo(0, document.body.scrollHeight);",\ 105 | "document.querySelector('.load-more').click();"\ 106 | ] 107 | result = await crawler.arun( 108 | 
url="https://example.com", 109 | js_code=js_commands 110 | ) 111 | 112 | ``` 113 | 114 | ## Proxy Configuration 115 | 116 | Use proxies for enhanced access: 117 | 118 | ```hljs csharp 119 | # Simple proxy 120 | async with AsyncWebCrawler( 121 | proxy="http://proxy.example.com:8080" 122 | ) as crawler: 123 | result = await crawler.arun(url="https://example.com") 124 | 125 | # Proxy with authentication 126 | proxy_config = { 127 | "server": "http://proxy.example.com:8080", 128 | "username": "user", 129 | "password": "pass" 130 | } 131 | async with AsyncWebCrawler(proxy_config=proxy_config) as crawler: 132 | result = await crawler.arun(url="https://example.com") 133 | 134 | ``` 135 | 136 | ## Anti-Detection Features 137 | 138 | Enable stealth features to avoid bot detection: 139 | 140 | ```hljs python 141 | result = await crawler.arun( 142 | url="https://example.com", 143 | simulate_user=True, # Simulate human behavior 144 | override_navigator=True, # Mask automation signals 145 | magic=True # Enable all anti-detection features 146 | ) 147 | 148 | ``` 149 | 150 | ## Handling Dynamic Content 151 | 152 | Configure browser to handle dynamic content: 153 | 154 | ```hljs python 155 | # Wait for dynamic content 156 | result = await crawler.arun( 157 | url="https://example.com", 158 | wait_for="js:() => document.querySelector('.content').children.length > 10", 159 | process_iframes=True # Process iframe content 160 | ) 161 | 162 | # Handle lazy-loaded images 163 | result = await crawler.arun( 164 | url="https://example.com", 165 | js_code="window.scrollTo(0, document.body.scrollHeight);", 166 | delay_before_return_html=2.0 # Wait for images to load 167 | ) 168 | 169 | ``` 170 | 171 | ## Comprehensive Example 172 | 173 | Here's how to combine various browser configurations: 174 | 175 | ```hljs python 176 | async def crawl_with_advanced_config(url: str): 177 | async with AsyncWebCrawler( 178 | # Browser setup 179 | browser_type="chromium", 180 | headless=True, 181 | verbose=True, 182 | 183 | # Identity 184 | user_agent="Custom User Agent", 185 | headers={"Accept-Language": "en-US"}, 186 | 187 | # Proxy setup 188 | proxy="http://proxy.example.com:8080" 189 | ) as crawler: 190 | result = await crawler.arun( 191 | url=url, 192 | # Content handling 193 | process_iframes=True, 194 | screenshot=True, 195 | 196 | # Timing 197 | page_timeout=60000, 198 | delay_before_return_html=2.0, 199 | 200 | # Anti-detection 201 | magic=True, 202 | simulate_user=True, 203 | 204 | # Dynamic content 205 | js_code=[\ 206 | "window.scrollTo(0, document.body.scrollHeight);",\ 207 | "document.querySelector('.load-more')?.click();"\ 208 | ], 209 | wait_for="css:.dynamic-content" 210 | ) 211 | 212 | return { 213 | "content": result.markdown, 214 | "screenshot": result.screenshot, 215 | "success": result.success 216 | } 217 | 218 | ``` 219 | 220 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/chunking - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | ## Chunking Strategies 📚 2 | 3 | Crawl4AI provides several powerful chunking strategies to divide text into manageable parts for further processing. Each strategy has unique characteristics and is suitable for different scenarios. Let's explore them one by one. 4 | 5 | ### RegexChunking 6 | 7 | `RegexChunking` splits text using regular expressions. This is ideal for creating chunks based on specific patterns like paragraphs or sentences. 
8 | 9 | #### When to Use 10 | 11 | - Great for structured text with consistent delimiters. 12 | - Suitable for documents where specific patterns (e.g., double newlines, periods) indicate logical chunks. 13 | 14 | #### Parameters 15 | 16 | - `patterns` (list, optional): Regular expressions used to split the text. Default is to split by double newlines ( `['\n\n']`). 17 | 18 | #### Example 19 | 20 | ```hljs python 21 | from crawl4ai.chunking_strategy import RegexChunking 22 | 23 | # Define patterns for splitting text 24 | patterns = [r'\n\n', r'\. '] 25 | chunker = RegexChunking(patterns=patterns) 26 | 27 | # Sample text 28 | text = "This is a sample text. It will be split into chunks.\n\nThis is another paragraph." 29 | 30 | # Chunk the text 31 | chunks = chunker.chunk(text) 32 | print(chunks) 33 | 34 | ``` 35 | 36 | ### NlpSentenceChunking 37 | 38 | `NlpSentenceChunking` uses NLP models to split text into sentences, ensuring accurate sentence boundaries. 39 | 40 | #### When to Use 41 | 42 | - Ideal for texts where sentence boundaries are crucial. 43 | - Useful for creating chunks that preserve grammatical structures. 44 | 45 | #### Parameters 46 | 47 | - None. 48 | 49 | #### Example 50 | 51 | ```hljs makefile 52 | from crawl4ai.chunking_strategy import NlpSentenceChunking 53 | 54 | chunker = NlpSentenceChunking() 55 | 56 | # Sample text 57 | text = "This is a sample text. It will be split into sentences. Here's another sentence." 58 | 59 | # Chunk the text 60 | chunks = chunker.chunk(text) 61 | print(chunks) 62 | 63 | ``` 64 | 65 | ### TopicSegmentationChunking 66 | 67 | `TopicSegmentationChunking` employs the TextTiling algorithm to segment text into topic-based chunks. This method identifies thematic boundaries. 68 | 69 | #### When to Use 70 | 71 | - Perfect for long documents with distinct topics. 72 | - Useful when preserving topic continuity is more important than maintaining text order. 73 | 74 | #### Parameters 75 | 76 | - `num_keywords` (int, optional): Number of keywords for each topic segment. Default is `3`. 77 | 78 | #### Example 79 | 80 | ```hljs makefile 81 | from crawl4ai.chunking_strategy import TopicSegmentationChunking 82 | 83 | chunker = TopicSegmentationChunking(num_keywords=3) 84 | 85 | # Sample text 86 | text = "This document contains several topics. Topic one discusses AI. Topic two covers machine learning." 87 | 88 | # Chunk the text 89 | chunks = chunker.chunk(text) 90 | print(chunks) 91 | 92 | ``` 93 | 94 | ### FixedLengthWordChunking 95 | 96 | `FixedLengthWordChunking` splits text into chunks based on a fixed number of words. This ensures each chunk has approximately the same length. 97 | 98 | #### When to Use 99 | 100 | - Suitable for processing large texts where uniform chunk size is important. 101 | - Useful when the number of words per chunk needs to be controlled. 102 | 103 | #### Parameters 104 | 105 | - `chunk_size` (int, optional): Number of words per chunk. Default is `100`. 106 | 107 | #### Example 108 | 109 | ```hljs makefile 110 | from crawl4ai.chunking_strategy import FixedLengthWordChunking 111 | 112 | chunker = FixedLengthWordChunking(chunk_size=10) 113 | 114 | # Sample text 115 | text = "This is a sample text. It will be split into chunks of fixed length." 116 | 117 | # Chunk the text 118 | chunks = chunker.chunk(text) 119 | print(chunks) 120 | 121 | ``` 122 | 123 | ### SlidingWindowChunking 124 | 125 | `SlidingWindowChunking` uses a sliding window approach to create overlapping chunks. 
Each chunk has a fixed length, and the window slides by a specified step size. 126 | 127 | #### When to Use 128 | 129 | - Ideal for creating overlapping chunks to preserve context. 130 | - Useful for tasks where context from adjacent chunks is needed. 131 | 132 | #### Parameters 133 | 134 | - `window_size` (int, optional): Number of words in each chunk. Default is `100`. 135 | - `step` (int, optional): Number of words to slide the window. Default is `50`. 136 | 137 | #### Example 138 | 139 | ```hljs vbnet 140 | from crawl4ai.chunking_strategy import SlidingWindowChunking 141 | 142 | chunker = SlidingWindowChunking(window_size=10, step=5) 143 | 144 | # Sample text 145 | text = "This is a sample text. It will be split using a sliding window approach to preserve context." 146 | 147 | # Chunk the text 148 | chunks = chunker.chunk(text) 149 | print(chunks) 150 | 151 | ``` 152 | 153 | With these chunking strategies, you can choose the best method to divide your text based on your specific needs. Whether you need precise sentence boundaries, topic-based segmentation, or uniform chunk sizes, Crawl4AI has you covered. Happy chunking! 📝✨ 154 | 155 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/content processing - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Content Processing 2 | 3 | Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction. 4 | 5 | ## Content Cleaning 6 | 7 | ### Understanding Clean Content 8 | 9 | When crawling web pages, you often encounter a lot of noise - advertisements, navigation menus, footers, popups, and other irrelevant content. Crawl4AI automatically cleans this noise using several approaches: 10 | 11 | 1. **Basic Cleaning**: Removes unwanted HTML elements and attributes 12 | 2. **Content Relevance**: Identifies and preserves meaningful content blocks 13 | 3. **Layout Analysis**: Understands page structure to identify main content areas 14 | 15 | ```hljs python 16 | result = await crawler.arun( 17 | url="https://example.com", 18 | word_count_threshold=10, # Remove blocks with fewer words 19 | excluded_tags=['form', 'nav'], # Remove specific HTML tags 20 | remove_overlay_elements=True # Remove popups/modals 21 | ) 22 | 23 | # Get clean content 24 | print(result.cleaned_html) # Cleaned HTML 25 | print(result.markdown) # Clean markdown version 26 | 27 | ``` 28 | 29 | ### Fit Markdown: Smart Content Extraction 30 | 31 | One of Crawl4AI's most powerful features is `fit_markdown`. This feature uses advanced heuristics to identify and extract the main content from a webpage while excluding irrelevant elements. 
32 | 
33 | #### How Fit Markdown Works
34 | 
35 | - Analyzes content density and distribution
36 | - Identifies content patterns and structures
37 | - Removes boilerplate content (headers, footers, sidebars)
38 | - Preserves the most relevant content blocks
39 | - Maintains content hierarchy and formatting
40 | 
41 | #### Perfect For:
42 | 
43 | - Blog posts and articles
44 | - News content
45 | - Documentation pages
46 | - Any page with a clear main content area
47 | 
48 | #### Not Recommended For:
49 | 
50 | - E-commerce product listings
51 | - Search results pages
52 | - Social media feeds
53 | - Pages with multiple equal-weight content sections
54 | 
55 | ```hljs python
56 | result = await crawler.arun(url="https://example.com")
57 | 
58 | # Get the most relevant content
59 | main_content = result.fit_markdown
60 | 
61 | # Compare with regular markdown
62 | all_content = result.markdown
63 | 
64 | print(f"Fit Markdown Length: {len(main_content)}")
65 | print(f"Regular Markdown Length: {len(all_content)}")
66 | 
67 | ```
68 | 
69 | #### Example Use Case
70 | 
71 | ```hljs python
72 | async def extract_article_content(url: str) -> str:
73 |     """Extract main article content from a blog or news site."""
74 |     async with AsyncWebCrawler() as crawler:
75 |         result = await crawler.arun(url=url)
76 | 
77 |         # fit_markdown will focus on the article content,
78 |         # excluding navigation, ads, and other distractions
79 |         return result.fit_markdown
80 | 
81 | ```
82 | 
83 | ## Media Processing
84 | 
85 | Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
86 | 
87 | ### Image Processing
88 | 
89 | The library handles various image scenarios, including:
90 | \- Regular images
91 | \- Lazy-loaded images
92 | \- Background images
93 | \- Responsive images
94 | \- Image metadata and context
95 | 
96 | ```hljs python
97 | result = await crawler.arun(url="https://example.com")
98 | 
99 | for image in result.media["images"]:
100 |     # Each image includes rich metadata
101 |     print(f"Source: {image['src']}")
102 |     print(f"Alt text: {image['alt']}")
103 |     print(f"Description: {image['desc']}")
104 |     print(f"Context: {image['context']}") # Surrounding text
105 |     print(f"Relevance score: {image['score']}") # 0-10 score
106 | 
107 | ```
108 | 
109 | ### Handling Lazy-Loaded Content
110 | 
111 | Crawl4AI handles lazy loading for media elements automatically.
You can also customize the wait time for lazy-loaded content: 112 | 113 | ```hljs makefile 114 | result = await crawler.arun( 115 | url="https://example.com", 116 | wait_for="css:img[data-src]", # Wait for lazy images 117 | delay_before_return_html=2.0 # Additional wait time 118 | ) 119 | 120 | ``` 121 | 122 | ### Video and Audio Content 123 | 124 | The library extracts video and audio elements with their metadata: 125 | 126 | ```hljs python 127 | # Process videos 128 | for video in result.media["videos"]: 129 | print(f"Video source: {video['src']}") 130 | print(f"Type: {video['type']}") 131 | print(f"Duration: {video.get('duration')}") 132 | print(f"Thumbnail: {video.get('poster')}") 133 | 134 | # Process audio 135 | for audio in result.media["audios"]: 136 | print(f"Audio source: {audio['src']}") 137 | print(f"Type: {audio['type']}") 138 | print(f"Duration: {audio.get('duration')}") 139 | 140 | ``` 141 | 142 | ## Link Analysis 143 | 144 | Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns. 145 | 146 | ### Link Classification 147 | 148 | The library automatically categorizes links into: 149 | \- Internal links (same domain) 150 | \- External links (different domains) 151 | \- Social media links 152 | \- Navigation links 153 | \- Content links 154 | 155 | ```hljs python 156 | result = await crawler.arun(url="https://example.com") 157 | 158 | # Analyze internal links 159 | for link in result.links["internal"]: 160 | print(f"Internal: {link['href']}") 161 | print(f"Link text: {link['text']}") 162 | print(f"Context: {link['context']}") # Surrounding text 163 | print(f"Type: {link['type']}") # nav, content, etc. 164 | 165 | # Analyze external links 166 | for link in result.links["external"]: 167 | print(f"External: {link['href']}") 168 | print(f"Domain: {link['domain']}") 169 | print(f"Type: {link['type']}") 170 | 171 | ``` 172 | 173 | ### Smart Link Filtering 174 | 175 | Control which links are included in the results: 176 | 177 | ```hljs python 178 | result = await crawler.arun( 179 | url="https://example.com", 180 | exclude_external_links=True, # Remove external links 181 | exclude_social_media_links=True, # Remove social media links 182 | exclude_social_media_domains=[ # Custom social media domains\ 183 | "facebook.com", "twitter.com", "instagram.com"\ 184 | ], 185 | exclude_domains=["ads.example.com"] # Exclude specific domains 186 | ) 187 | 188 | ``` 189 | 190 | ## Metadata Extraction 191 | 192 | Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content: 193 | 194 | ```hljs python 195 | result = await crawler.arun(url="https://example.com") 196 | 197 | metadata = result.metadata 198 | print(f"Title: {metadata['title']}") 199 | print(f"Description: {metadata['description']}") 200 | print(f"Keywords: {metadata['keywords']}") 201 | print(f"Author: {metadata['author']}") 202 | print(f"Published Date: {metadata['published_date']}") 203 | print(f"Modified Date: {metadata['modified_date']}") 204 | print(f"Language: {metadata['language']}") 205 | 206 | ``` 207 | 208 | ## Best Practices 209 | 210 | 1. **Use Fit Markdown for Articles** 211 | 212 | 213 | 214 | ```hljs ini 215 | # Perfect for blog posts, news articles, documentation 216 | content = result.fit_markdown 217 | 218 | ``` 219 | 220 | 2. 
**Handle Media Appropriately** 221 | 222 | 223 | 224 | ```hljs ini 225 | # Filter by relevance score 226 | relevant_images = [\ 227 | img for img in result.media["images"]\ 228 | if img['score'] > 5\ 229 | ] 230 | 231 | ``` 232 | 233 | 3. **Combine Link Analysis with Content** 234 | 235 | 236 | 237 | ```hljs bash 238 | # Get content links with context 239 | content_links = [\ 240 | link for link in result.links["internal"]\ 241 | if link['type'] == 'content'\ 242 | ] 243 | 244 | ``` 245 | 246 | 4. **Clean Content with Purpose** 247 | 248 | 249 | 250 | ```hljs python 251 | # Customize cleaning based on your needs 252 | result = await crawler.arun( 253 | url=url, 254 | word_count_threshold=20, # Adjust based on content type 255 | keep_data_attributes=False, # Remove data attributes 256 | process_iframes=True # Include iframe content 257 | ) 258 | 259 | ``` 260 | 261 | 262 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/content selection - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Content Selection 2 | 3 | Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need. 4 | 5 | ## CSS Selectors 6 | 7 | The simplest way to extract specific content: 8 | 9 | ```hljs makefile 10 | # Extract specific content using CSS selector 11 | result = await crawler.arun( 12 | url="https://example.com", 13 | css_selector=".main-article" # Target main article content 14 | ) 15 | 16 | # Multiple selectors 17 | result = await crawler.arun( 18 | url="https://example.com", 19 | css_selector="article h1, article .content" # Target heading and content 20 | ) 21 | 22 | ``` 23 | 24 | ## Content Filtering 25 | 26 | Control what content is included or excluded: 27 | 28 | ```hljs python 29 | result = await crawler.arun( 30 | url="https://example.com", 31 | # Content thresholds 32 | word_count_threshold=10, # Minimum words per block 33 | 34 | # Tag exclusions 35 | excluded_tags=['form', 'header', 'footer', 'nav'], 36 | 37 | # Link filtering 38 | exclude_external_links=True, # Remove external links 39 | exclude_social_media_links=True, # Remove social media links 40 | 41 | # Media filtering 42 | exclude_external_images=True # Remove external images 43 | ) 44 | 45 | ``` 46 | 47 | ## Iframe Content 48 | 49 | Process content inside iframes: 50 | 51 | ```hljs python 52 | result = await crawler.arun( 53 | url="https://example.com", 54 | process_iframes=True, # Extract iframe content 55 | remove_overlay_elements=True # Remove popups/modals that might block iframes 56 | ) 57 | 58 | ``` 59 | 60 | ## Structured Content Selection 61 | 62 | ### Using LLMs for Smart Selection 63 | 64 | Use LLMs to intelligently extract specific types of content: 65 | 66 | ```hljs python 67 | from pydantic import BaseModel 68 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 69 | 70 | class ArticleContent(BaseModel): 71 | title: str 72 | main_points: List[str] 73 | conclusion: str 74 | 75 | strategy = LLMExtractionStrategy( 76 | provider="ollama/nemotron", # Works with any supported LLM 77 | schema=ArticleContent.schema(), 78 | instruction="Extract the main article title, key points, and conclusion" 79 | ) 80 | 81 | result = await crawler.arun( 82 | url="https://example.com", 83 | extraction_strategy=strategy 84 | ) 85 | article = json.loads(result.extracted_content) 86 | 87 | ``` 88 | 89 | ### Pattern-Based Selection 90 | 91 | For repeated 
content patterns (like product listings, news feeds): 92 | 93 | ```hljs makefile 94 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy 95 | 96 | schema = { 97 | "name": "News Articles", 98 | "baseSelector": "article.news-item", # Repeated element 99 | "fields": [\ 100 | {"name": "headline", "selector": "h2", "type": "text"},\ 101 | {"name": "summary", "selector": ".summary", "type": "text"},\ 102 | {"name": "category", "selector": ".category", "type": "text"},\ 103 | {\ 104 | "name": "metadata",\ 105 | "type": "nested",\ 106 | "fields": [\ 107 | {"name": "author", "selector": ".author", "type": "text"},\ 108 | {"name": "date", "selector": ".date", "type": "text"}\ 109 | ]\ 110 | }\ 111 | ] 112 | } 113 | 114 | strategy = JsonCssExtractionStrategy(schema) 115 | result = await crawler.arun( 116 | url="https://example.com", 117 | extraction_strategy=strategy 118 | ) 119 | articles = json.loads(result.extracted_content) 120 | 121 | ``` 122 | 123 | ## Domain-Based Filtering 124 | 125 | Control content based on domains: 126 | 127 | ```hljs python 128 | result = await crawler.arun( 129 | url="https://example.com", 130 | exclude_domains=["ads.com", "tracker.com"], 131 | exclude_social_media_domains=["facebook.com", "twitter.com"], # Custom social media domains to exclude 132 | exclude_social_media_links=True 133 | ) 134 | 135 | ``` 136 | 137 | ## Media Selection 138 | 139 | Select specific types of media: 140 | 141 | ```hljs python 142 | result = await crawler.arun(url="https://example.com") 143 | 144 | # Access different media types 145 | images = result.media["images"] # List of image details 146 | videos = result.media["videos"] # List of video details 147 | audios = result.media["audios"] # List of audio details 148 | 149 | # Image with metadata 150 | for image in images: 151 | print(f"URL: {image['src']}") 152 | print(f"Alt text: {image['alt']}") 153 | print(f"Description: {image['desc']}") 154 | print(f"Relevance score: {image['score']}") 155 | 156 | ``` 157 | 158 | ## Comprehensive Example 159 | 160 | Here's how to combine different selection methods: 161 | 162 | ```hljs python 163 | async def extract_article_content(url: str): 164 | # Define structured extraction 165 | article_schema = { 166 | "name": "Article", 167 | "baseSelector": "article.main", 168 | "fields": [\ 169 | {"name": "title", "selector": "h1", "type": "text"},\ 170 | {"name": "content", "selector": ".content", "type": "text"}\ 171 | ] 172 | } 173 | 174 | # Define LLM extraction 175 | class ArticleAnalysis(BaseModel): 176 | key_points: List[str] 177 | sentiment: str 178 | category: str 179 | 180 | async with AsyncWebCrawler() as crawler: 181 | # Get structured content 182 | pattern_result = await crawler.arun( 183 | url=url, 184 | extraction_strategy=JsonCssExtractionStrategy(article_schema), 185 | word_count_threshold=10, 186 | excluded_tags=['nav', 'footer'], 187 | exclude_external_links=True 188 | ) 189 | 190 | # Get semantic analysis 191 | analysis_result = await crawler.arun( 192 | url=url, 193 | extraction_strategy=LLMExtractionStrategy( 194 | provider="ollama/nemotron", 195 | schema=ArticleAnalysis.schema(), 196 | instruction="Analyze the article content" 197 | ) 198 | ) 199 | 200 | # Combine results 201 | return { 202 | "article": json.loads(pattern_result.extracted_content), 203 | "analysis": json.loads(analysis_result.extracted_content), 204 | "media": pattern_result.media 205 | } 206 | 207 | ``` 208 | 209 | * * * -------------------------------------------------------------------------------- 
/docs/crawl4ai/cosine strategy - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Cosine Strategy 2 | 3 | The Cosine Strategy in Crawl4AI uses similarity-based clustering to identify and extract relevant content sections from web pages. This strategy is particularly useful when you need to find and extract content based on semantic similarity rather than structural patterns. 4 | 5 | ## How It Works 6 | 7 | The Cosine Strategy: 8 | 1\. Breaks down page content into meaningful chunks 9 | 2\. Converts text into vector representations 10 | 3\. Calculates similarity between chunks 11 | 4\. Clusters similar content together 12 | 5\. Ranks and filters content based on relevance 13 | 14 | ## Basic Usage 15 | 16 | ```hljs csharp 17 | from crawl4ai.extraction_strategy import CosineStrategy 18 | 19 | strategy = CosineStrategy( 20 | semantic_filter="product reviews", # Target content type 21 | word_count_threshold=10, # Minimum words per cluster 22 | sim_threshold=0.3 # Similarity threshold 23 | ) 24 | 25 | async with AsyncWebCrawler() as crawler: 26 | result = await crawler.arun( 27 | url="https://example.com/reviews", 28 | extraction_strategy=strategy 29 | ) 30 | 31 | content = result.extracted_content 32 | 33 | ``` 34 | 35 | ## Configuration Options 36 | 37 | ### Core Parameters 38 | 39 | ```hljs python 40 | CosineStrategy( 41 | # Content Filtering 42 | semantic_filter: str = None, # Keywords/topic for content filtering 43 | word_count_threshold: int = 10, # Minimum words per cluster 44 | sim_threshold: float = 0.3, # Similarity threshold (0.0 to 1.0) 45 | 46 | # Clustering Parameters 47 | max_dist: float = 0.2, # Maximum distance for clustering 48 | linkage_method: str = 'ward', # Clustering linkage method 49 | top_k: int = 3, # Number of top categories to extract 50 | 51 | # Model Configuration 52 | model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model 53 | 54 | verbose: bool = False # Enable logging 55 | ) 56 | 57 | ``` 58 | 59 | ### Parameter Details 60 | 61 | 01. **semantic\_filter** 62 | 02. Sets the target topic or content type 63 | 03. Use keywords relevant to your desired content 64 | 04. Example: "technical specifications", "user reviews", "pricing information" 65 | 66 | 05. **sim\_threshold** 67 | 68 | 06. Controls how similar content must be to be grouped together 69 | 07. Higher values (e.g., 0.8) mean stricter matching 70 | 08. Lower values (e.g., 0.3) allow more variation 71 | 72 | 73 | 74 | 75 | ```hljs ini 76 | # Strict matching 77 | strategy = CosineStrategy(sim_threshold=0.8) 78 | 79 | # Loose matching 80 | strategy = CosineStrategy(sim_threshold=0.3) 81 | 82 | ``` 83 | 84 | 09. **word\_count\_threshold** 85 | 86 | 10. Filters out short content blocks 87 | 11. Helps eliminate noise and irrelevant content 88 | 89 | 90 | 91 | 92 | ```hljs ini 93 | # Only consider substantial paragraphs 94 | strategy = CosineStrategy(word_count_threshold=50) 95 | 96 | ``` 97 | 98 | 12. **top\_k** 99 | 100 | 13. Number of top content clusters to return 101 | 14. Higher values return more diverse content 102 | 103 | 104 | 105 | ```hljs ini 106 | # Get top 5 most relevant content clusters 107 | strategy = CosineStrategy(top_k=5) 108 | 109 | ``` 110 | 111 | 112 | ## Use Cases 113 | 114 | ### 1\. 
Article Content Extraction 115 | 116 | ```hljs makefile 117 | strategy = CosineStrategy( 118 | semantic_filter="main article content", 119 | word_count_threshold=100, # Longer blocks for articles 120 | top_k=1 # Usually want single main content 121 | ) 122 | 123 | result = await crawler.arun( 124 | url="https://example.com/blog/post", 125 | extraction_strategy=strategy 126 | ) 127 | 128 | ``` 129 | 130 | ### 2\. Product Review Analysis 131 | 132 | ```hljs makefile 133 | strategy = CosineStrategy( 134 | semantic_filter="customer reviews and ratings", 135 | word_count_threshold=20, # Reviews can be shorter 136 | top_k=10, # Get multiple reviews 137 | sim_threshold=0.4 # Allow variety in review content 138 | ) 139 | 140 | ``` 141 | 142 | ### 3\. Technical Documentation 143 | 144 | ```hljs makefile 145 | strategy = CosineStrategy( 146 | semantic_filter="technical specifications documentation", 147 | word_count_threshold=30, 148 | sim_threshold=0.6, # Stricter matching for technical content 149 | max_dist=0.3 # Allow related technical sections 150 | ) 151 | 152 | ``` 153 | 154 | ## Advanced Features 155 | 156 | ### Custom Clustering 157 | 158 | ```hljs bash 159 | strategy = CosineStrategy( 160 | linkage_method='complete', # Alternative clustering method 161 | max_dist=0.4, # Larger clusters 162 | model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' # Multilingual support 163 | ) 164 | 165 | ``` 166 | 167 | ### Content Filtering Pipeline 168 | 169 | ```hljs python 170 | strategy = CosineStrategy( 171 | semantic_filter="pricing plans features", 172 | word_count_threshold=15, 173 | sim_threshold=0.5, 174 | top_k=3 175 | ) 176 | 177 | async def extract_pricing_features(url: str): 178 | async with AsyncWebCrawler() as crawler: 179 | result = await crawler.arun( 180 | url=url, 181 | extraction_strategy=strategy 182 | ) 183 | 184 | if result.success: 185 | content = json.loads(result.extracted_content) 186 | return { 187 | 'pricing_features': content, 188 | 'clusters': len(content), 189 | 'similarity_scores': [item['score'] for item in content] 190 | } 191 | 192 | ``` 193 | 194 | ## Best Practices 195 | 196 | 01. **Adjust Thresholds Iteratively** 197 | 02. Start with default values 198 | 03. Adjust based on results 199 | 04. Monitor clustering quality 200 | 201 | 05. **Choose Appropriate Word Count Thresholds** 202 | 203 | 06. Higher for articles (100+) 204 | 07. Lower for reviews/comments (20+) 205 | 08. Medium for product descriptions (50+) 206 | 207 | 09. **Optimize Performance** 208 | 209 | 210 | 211 | ```hljs graphql 212 | strategy = CosineStrategy( 213 | word_count_threshold=10, # Filter early 214 | top_k=5, # Limit results 215 | verbose=True # Monitor performance 216 | ) 217 | 218 | ``` 219 | 220 | 10. 
**Handle Different Content Types** 221 | 222 | 223 | 224 | ```hljs makefile 225 | # For mixed content pages 226 | strategy = CosineStrategy( 227 | semantic_filter="product features", 228 | sim_threshold=0.4, # More flexible matching 229 | max_dist=0.3, # Larger clusters 230 | top_k=3 # Multiple relevant sections 231 | ) 232 | 233 | ``` 234 | 235 | 236 | ## Error Handling 237 | 238 | ```hljs python 239 | try: 240 | result = await crawler.arun( 241 | url="https://example.com", 242 | extraction_strategy=strategy 243 | ) 244 | 245 | if result.success: 246 | content = json.loads(result.extracted_content) 247 | if not content: 248 | print("No relevant content found") 249 | else: 250 | print(f"Extraction failed: {result.error_message}") 251 | 252 | except Exception as e: 253 | print(f"Error during extraction: {str(e)}") 254 | 255 | ``` 256 | 257 | The Cosine Strategy is particularly effective when: 258 | \- Content structure is inconsistent 259 | \- You need semantic understanding 260 | \- You want to find similar content blocks 261 | \- Structure-based extraction (CSS/XPath) isn't reliable 262 | 263 | It works well with other strategies and can be used as a pre-processing step for LLM-based extraction. 264 | 265 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/crawlresult - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # CrawlResult 2 | 3 | The `CrawlResult` class represents the result of a web crawling operation. It provides access to various forms of extracted content and metadata from the crawled webpage. 4 | 5 | ## Class Definition 6 | 7 | ```hljs python 8 | class CrawlResult(BaseModel): 9 | """Result of a web crawling operation.""" 10 | 11 | # Basic Information 12 | url: str # Crawled URL 13 | success: bool # Whether crawl succeeded 14 | status_code: Optional[int] = None # HTTP status code 15 | error_message: Optional[str] = None # Error message if failed 16 | 17 | # Content 18 | html: str # Raw HTML content 19 | cleaned_html: Optional[str] = None # Cleaned HTML 20 | fit_html: Optional[str] = None # Most relevant HTML content 21 | markdown: Optional[str] = None # HTML converted to markdown 22 | fit_markdown: Optional[str] = None # Most relevant markdown content 23 | 24 | # Extracted Data 25 | extracted_content: Optional[str] = None # Content from extraction strategy 26 | media: Dict[str, List[Dict]] = {} # Extracted media information 27 | links: Dict[str, List[Dict]] = {} # Extracted links 28 | metadata: Optional[dict] = None # Page metadata 29 | 30 | # Additional Data 31 | screenshot: Optional[str] = None # Base64 encoded screenshot 32 | session_id: Optional[str] = None # Session identifier 33 | response_headers: Optional[dict] = None # HTTP response headers 34 | 35 | ``` 36 | 37 | ## Properties and Their Data Structures 38 | 39 | ### Basic Information 40 | 41 | ```hljs python 42 | # Access basic information 43 | result = await crawler.arun(url="https://example.com") 44 | 45 | print(result.url) # "https://example.com" 46 | print(result.success) # True/False 47 | print(result.status_code) # 200, 404, etc. 48 | print(result.error_message) # Error details if failed 49 | 50 | ``` 51 | 52 | ### Content Properties 53 | 54 | #### HTML Content 55 | 56 | ```hljs ini 57 | # Raw HTML 58 | html_content = result.html 59 | 60 | # Cleaned HTML (removed ads, popups, etc.) 
61 | clean_content = result.cleaned_html 62 | 63 | # Most relevant HTML content 64 | main_content = result.fit_html 65 | 66 | ``` 67 | 68 | #### Markdown Content 69 | 70 | ```hljs ini 71 | # Full markdown version 72 | markdown_content = result.markdown 73 | 74 | # Most relevant markdown content 75 | main_content = result.fit_markdown 76 | 77 | ``` 78 | 79 | ### Media Content 80 | 81 | The media dictionary contains organized media elements: 82 | 83 | ```hljs python 84 | # Structure 85 | media = { 86 | "images": [\ 87 | {\ 88 | "src": str, # Image URL\ 89 | "alt": str, # Alt text\ 90 | "desc": str, # Contextual description\ 91 | "score": float, # Relevance score (0-10)\ 92 | "type": str, # "image"\ 93 | "width": int, # Image width (if available)\ 94 | "height": int, # Image height (if available)\ 95 | "context": str, # Surrounding text\ 96 | "lazy": bool # Whether image was lazy-loaded\ 97 | }\ 98 | ], 99 | "videos": [\ 100 | {\ 101 | "src": str, # Video URL\ 102 | "type": str, # "video"\ 103 | "title": str, # Video title\ 104 | "poster": str, # Thumbnail URL\ 105 | "duration": str, # Video duration\ 106 | "description": str # Video description\ 107 | }\ 108 | ], 109 | "audios": [\ 110 | {\ 111 | "src": str, # Audio URL\ 112 | "type": str, # "audio"\ 113 | "title": str, # Audio title\ 114 | "duration": str, # Audio duration\ 115 | "description": str # Audio description\ 116 | }\ 117 | ] 118 | } 119 | 120 | # Example usage 121 | for image in result.media["images"]: 122 | if image["score"] > 5: # High-relevance images 123 | print(f"High-quality image: {image['src']}") 124 | print(f"Context: {image['context']}") 125 | 126 | ``` 127 | 128 | ### Link Analysis 129 | 130 | The links dictionary organizes discovered links: 131 | 132 | ```hljs python 133 | # Structure 134 | links = { 135 | "internal": [\ 136 | {\ 137 | "href": str, # URL\ 138 | "text": str, # Link text\ 139 | "title": str, # Title attribute\ 140 | "type": str, # Link type (nav, content, etc.)\ 141 | "context": str, # Surrounding text\ 142 | "score": float # Relevance score\ 143 | }\ 144 | ], 145 | "external": [\ 146 | {\ 147 | "href": str, # External URL\ 148 | "text": str, # Link text\ 149 | "title": str, # Title attribute\ 150 | "domain": str, # Domain name\ 151 | "type": str, # Link type\ 152 | "context": str # Surrounding text\ 153 | }\ 154 | ] 155 | } 156 | 157 | # Example usage 158 | for link in result.links["internal"]: 159 | print(f"Internal link: {link['href']}") 160 | print(f"Context: {link['context']}") 161 | 162 | ``` 163 | 164 | ### Metadata 165 | 166 | The metadata dictionary contains page information: 167 | 168 | ```hljs python 169 | # Structure 170 | metadata = { 171 | "title": str, # Page title 172 | "description": str, # Meta description 173 | "keywords": List[str], # Meta keywords 174 | "author": str, # Author information 175 | "published_date": str, # Publication date 176 | "modified_date": str, # Last modified date 177 | "language": str, # Page language 178 | "canonical_url": str, # Canonical URL 179 | "og_data": Dict, # Open Graph data 180 | "twitter_data": Dict # Twitter card data 181 | } 182 | 183 | # Example usage 184 | if result.metadata: 185 | print(f"Title: {result.metadata['title']}") 186 | print(f"Author: {result.metadata.get('author', 'Unknown')}") 187 | 188 | ``` 189 | 190 | ### Extracted Content 191 | 192 | Content from extraction strategies: 193 | 194 | ```hljs bash 195 | # For LLM or CSS extraction strategies 196 | if result.extracted_content: 197 | structured_data = 
json.loads(result.extracted_content) 198 | print(structured_data) 199 | 200 | ``` 201 | 202 | ### Screenshot 203 | 204 | Base64 encoded screenshot: 205 | 206 | ```hljs python 207 | # Save screenshot if available 208 | if result.screenshot: 209 | import base64 210 | 211 | # Decode and save 212 | with open("screenshot.png", "wb") as f: 213 | f.write(base64.b64decode(result.screenshot)) 214 | 215 | ``` 216 | 217 | ## Usage Examples 218 | 219 | ### Basic Content Access 220 | 221 | ```hljs python 222 | async with AsyncWebCrawler() as crawler: 223 | result = await crawler.arun(url="https://example.com") 224 | 225 | if result.success: 226 | # Get clean content 227 | print(result.fit_markdown) 228 | 229 | # Process images 230 | for image in result.media["images"]: 231 | if image["score"] > 7: 232 | print(f"High-quality image: {image['src']}") 233 | 234 | ``` 235 | 236 | ### Complete Data Processing 237 | 238 | ```hljs python 239 | async def process_webpage(url: str) -> Dict: 240 | async with AsyncWebCrawler() as crawler: 241 | result = await crawler.arun(url=url) 242 | 243 | if not result.success: 244 | raise Exception(f"Crawl failed: {result.error_message}") 245 | 246 | return { 247 | "content": result.fit_markdown, 248 | "images": [\ 249 | img for img in result.media["images"]\ 250 | if img["score"] > 5\ 251 | ], 252 | "internal_links": [\ 253 | link["href"] for link in result.links["internal"]\ 254 | ], 255 | "metadata": result.metadata, 256 | "status": result.status_code 257 | } 258 | 259 | ``` 260 | 261 | ### Error Handling 262 | 263 | ```hljs python 264 | async def safe_crawl(url: str) -> Dict: 265 | async with AsyncWebCrawler() as crawler: 266 | try: 267 | result = await crawler.arun(url=url) 268 | 269 | if not result.success: 270 | return { 271 | "success": False, 272 | "error": result.error_message, 273 | "status": result.status_code 274 | } 275 | 276 | return { 277 | "success": True, 278 | "content": result.fit_markdown, 279 | "status": result.status_code 280 | } 281 | 282 | except Exception as e: 283 | return { 284 | "success": False, 285 | "error": str(e), 286 | "status": None 287 | } 288 | 289 | ``` 290 | 291 | ## Best Practices 292 | 293 | 1. **Always Check Success** 294 | 295 | 296 | 297 | ```hljs python 298 | if not result.success: 299 | print(f"Error: {result.error_message}") 300 | return 301 | 302 | ``` 303 | 304 | 2. **Use fit\_markdown for Articles** 305 | 306 | 307 | 308 | ```hljs ini 309 | # Better for article content 310 | content = result.fit_markdown if result.fit_markdown else result.markdown 311 | 312 | ``` 313 | 314 | 3. **Filter Media by Score** 315 | 316 | 317 | 318 | ```hljs ini 319 | relevant_images = [\ 320 | img for img in result.media["images"]\ 321 | if img["score"] > 5\ 322 | ] 323 | 324 | ``` 325 | 326 | 4. **Handle Missing Data** 327 | 328 | 329 | 330 | ```hljs ini 331 | metadata = result.metadata or {} 332 | title = metadata.get('title', 'Unknown Title') 333 | 334 | ``` 335 | 336 | 337 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/home - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI 2 | 3 | Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. 
4 | 5 | ## Introduction 6 | 7 | Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution with full asynchronous support. 8 | 9 | ## Quick Start 10 | 11 | Here's a quick example to show you how easy it is to use Crawl4AI with its asynchronous capabilities: 12 | 13 | ```hljs python 14 | import asyncio 15 | from crawl4ai import AsyncWebCrawler 16 | 17 | async def main(): 18 | # Create an instance of AsyncWebCrawler 19 | async with AsyncWebCrawler(verbose=True) as crawler: 20 | # Run the crawler on a URL 21 | result = await crawler.arun(url="https://www.nbcnews.com/business") 22 | 23 | # Print the extracted content 24 | print(result.markdown) 25 | 26 | # Run the async main function 27 | asyncio.run(main()) 28 | 29 | ``` 30 | 31 | ## Key Features ✨ 32 | 33 | - 🆓 Completely free and open-source 34 | - 🚀 Blazing fast performance, outperforming many paid services 35 | - 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) 36 | - 📄 Fit markdown generation for extracting main article content. 37 | - 🌐 Multi-browser support (Chromium, Firefox, WebKit) 38 | - 🌍 Supports crawling multiple URLs simultaneously 39 | - 🎨 Extracts and returns all media tags (Images, Audio, and Video) 40 | - 🔗 Extracts all external and internal links 41 | - 📚 Extracts metadata from the page 42 | - 🔄 Custom hooks for authentication, headers, and page modifications 43 | - 🕵️ User-agent customization 44 | - 🖼️ Takes screenshots of pages with enhanced error handling 45 | - 📜 Executes multiple custom JavaScripts before crawling 46 | - 📊 Generates structured output without LLM using JsonCssExtractionStrategy 47 | - 📚 Various chunking strategies: topic-based, regex, sentence, and more 48 | - 🧠 Advanced extraction strategies: cosine clustering, LLM, and more 49 | - 🎯 CSS selector support for precise data extraction 50 | - 📝 Passes instructions/keywords to refine extraction 51 | - 🔒 Proxy support with authentication for enhanced access 52 | - 🔄 Session management for complex multi-page crawling 53 | - 🌐 Asynchronous architecture for improved performance 54 | - 🖼️ Improved image processing with lazy-loading detection 55 | - 🕰️ Enhanced handling of delayed content loading 56 | - 🔑 Custom headers support for LLM interactions 57 | - 🖼️ iframe content extraction for comprehensive analysis 58 | - ⏱️ Flexible timeout and delayed content retrieval options 59 | 60 | ## Documentation Structure 61 | 62 | Our documentation is organized into several sections: 63 | 64 | ### Basic Usage 65 | 66 | - [Installation](basic/installation/) 67 | - [Quick Start](basic/quickstart/) 68 | - [Simple Crawling](basic/simple-crawling/) 69 | - [Browser Configuration](basic/browser-config/) 70 | - [Content Selection](basic/content-selection/) 71 | - [Output Formats](basic/output-formats/) 72 | - [Page Interaction](basic/page-interaction/) 73 | 74 | ### Advanced Features 75 | 76 | - [Magic Mode](advanced/magic-mode/) 77 | - [Session Management](advanced/session-management/) 78 | - [Hooks & Authentication](advanced/hooks-auth/) 79 | - [Proxy & Security](advanced/proxy-security/) 80 | - [Content Processing](advanced/content-processing/) 81 | 82 | ### Extraction & Processing 83 | 84 | - [Extraction Strategies Overview](extraction/overview/) 85 | - [LLM Integration](extraction/llm/) 86 | - [CSS-Based Extraction](extraction/css/) 87 | - [Cosine 
Strategy](extraction/cosine/) 88 | - [Chunking Strategies](extraction/chunking/) 89 | 90 | ### API Reference 91 | 92 | - [AsyncWebCrawler](api/async-webcrawler/) 93 | - [CrawlResult](api/crawl-result/) 94 | - [Extraction Strategies](api/strategies/) 95 | - [arun() Method Parameters](api/arun/) 96 | 97 | ### Examples 98 | 99 | - Coming soon! 100 | 101 | ## Getting Started 102 | 103 | 1. Install Crawl4AI: 104 | 105 | 106 | 107 | 108 | ```hljs undefined 109 | pip install crawl4ai 110 | 111 | ``` 112 | 113 | 2. Check out our [Quick Start Guide](basic/quickstart/) to begin crawling web pages. 114 | 115 | 3. Explore our [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) to see Crawl4AI in action. 116 | 117 | 118 | ## Support 119 | 120 | For questions, suggestions, or issues: 121 | \- GitHub Issues: [Report a Bug](https://github.com/unclecode/crawl4ai/issues) 122 | \- Twitter: [@unclecode](https://twitter.com/unclecode) 123 | \- Website: [crawl4ai.com](https://crawl4ai.com) 124 | 125 | Happy Crawling! 🕸️🚀 126 | 127 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/hooks auth - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Hooks & Auth for AsyncWebCrawler 2 | 3 | Crawl4AI's AsyncWebCrawler allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the asynchronous crawling process. 4 | 5 | ## Example: Using Crawler Hooks with AsyncWebCrawler 6 | 7 | Let's see how we can customize the AsyncWebCrawler using hooks! In this example, we'll: 8 | 9 | 1. Configure the browser when it's created. 10 | 2. Add custom headers before navigating to the URL. 11 | 3. Log the current URL after navigation. 12 | 4. Perform actions after JavaScript execution. 13 | 5. Log the length of the HTML before returning it. 
14 | 15 | ### Hook Definitions 16 | 17 | ```hljs python 18 | import asyncio 19 | from crawl4ai import AsyncWebCrawler 20 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy 21 | from playwright.async_api import Page, Browser 22 | 23 | async def on_browser_created(browser: Browser): 24 | print("[HOOK] on_browser_created") 25 | # Example customization: set browser viewport size 26 | context = await browser.new_context(viewport={'width': 1920, 'height': 1080}) 27 | page = await context.new_page() 28 | 29 | # Example customization: logging in to a hypothetical website 30 | await page.goto('https://example.com/login') 31 | await page.fill('input[name="username"]', 'testuser') 32 | await page.fill('input[name="password"]', 'password123') 33 | await page.click('button[type="submit"]') 34 | await page.wait_for_selector('#welcome') 35 | 36 | # Add a custom cookie 37 | await context.add_cookies([{'name': 'test_cookie', 'value': 'cookie_value', 'url': 'https://example.com'}]) 38 | 39 | await page.close() 40 | await context.close() 41 | 42 | async def before_goto(page: Page): 43 | print("[HOOK] before_goto") 44 | # Example customization: add custom headers 45 | await page.set_extra_http_headers({'X-Test-Header': 'test'}) 46 | 47 | async def after_goto(page: Page): 48 | print("[HOOK] after_goto") 49 | # Example customization: log the URL 50 | print(f"Current URL: {page.url}") 51 | 52 | async def on_execution_started(page: Page): 53 | print("[HOOK] on_execution_started") 54 | # Example customization: perform actions after JS execution 55 | await page.evaluate("console.log('Custom JS executed')") 56 | 57 | async def before_return_html(page: Page, html: str): 58 | print("[HOOK] before_return_html") 59 | # Example customization: log the HTML length 60 | print(f"HTML length: {len(html)}") 61 | return page 62 | 63 | ``` 64 | 65 | ### Using the Hooks with the AsyncWebCrawler 66 | 67 | ```hljs css 68 | import asyncio 69 | from crawl4ai import AsyncWebCrawler 70 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy 71 | 72 | async def main(): 73 | print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") 74 | 75 | crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True) 76 | crawler_strategy.set_hook('on_browser_created', on_browser_created) 77 | crawler_strategy.set_hook('before_goto', before_goto) 78 | crawler_strategy.set_hook('after_goto', after_goto) 79 | crawler_strategy.set_hook('on_execution_started', on_execution_started) 80 | crawler_strategy.set_hook('before_return_html', before_return_html) 81 | 82 | async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler: 83 | result = await crawler.arun( 84 | url="https://example.com", 85 | js_code="window.scrollTo(0, document.body.scrollHeight);", 86 | wait_for="footer" 87 | ) 88 | 89 | print("📦 Crawler Hooks result:") 90 | print(result) 91 | 92 | asyncio.run(main()) 93 | 94 | ``` 95 | 96 | ### Explanation 97 | 98 | - `on_browser_created`: This hook is called when the Playwright browser is created. It sets up the browser context, logs in to a website, and adds a custom cookie. 99 | - `before_goto`: This hook is called right before Playwright navigates to the URL. It adds custom HTTP headers. 100 | - `after_goto`: This hook is called after Playwright navigates to the URL. It logs the current URL. 101 | - `on_execution_started`: This hook is called after any custom JavaScript is executed. It performs additional JavaScript actions. 
102 | - `before_return_html`: This hook is called before returning the HTML content. It logs the length of the HTML content. 103 | 104 | ### Additional Ideas 105 | 106 | - **Handling authentication**: Use the `on_browser_created` hook to handle login processes or set authentication tokens. 107 | - **Dynamic header modification**: Modify headers based on the target URL or other conditions in the `before_goto` hook. 108 | - **Content verification**: Use the `after_goto` hook to verify that the expected content is present on the page. 109 | - **Custom JavaScript injection**: Inject and execute custom JavaScript using the `on_execution_started` hook. 110 | - **Content preprocessing**: Modify or analyze the HTML content in the `before_return_html` hook before it's returned. 111 | 112 | By using these hooks, you can customize the behavior of the AsyncWebCrawler to suit your specific needs, including handling authentication, modifying requests, and preprocessing content. 113 | 114 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/installation - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Installation 💻 2 | 3 | Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server. 4 | 5 | ## Option 1: Python Package Installation (Recommended) 6 | 7 | Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs: 8 | 9 | ### Basic Installation 10 | 11 | For basic web crawling and scraping tasks: 12 | 13 | ```hljs bash 14 | pip install crawl4ai 15 | playwright install # Install Playwright dependencies 16 | 17 | ``` 18 | 19 | ### Installation with PyTorch 20 | 21 | For advanced text clustering (includes CosineSimilarity cluster strategy): 22 | 23 | ```hljs css 24 | pip install crawl4ai[torch] 25 | 26 | ``` 27 | 28 | ### Installation with Transformers 29 | 30 | For text summarization and Hugging Face models: 31 | 32 | ```hljs css 33 | pip install crawl4ai[transformer] 34 | 35 | ``` 36 | 37 | ### Full Installation 38 | 39 | For all features: 40 | 41 | ```hljs css 42 | pip install crawl4ai[all] 43 | 44 | ``` 45 | 46 | ### Development Installation 47 | 48 | For contributors who plan to modify the source code: 49 | 50 | ```hljs bash 51 | git clone https://github.com/unclecode/crawl4ai.git 52 | cd crawl4ai 53 | pip install -e ".[all]" 54 | playwright install # Install Playwright dependencies 55 | 56 | ``` 57 | 58 | 💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models: 59 | 60 | ```hljs undefined 61 | crawl4ai-download-models 62 | 63 | ``` 64 | 65 | This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation. 66 | 67 | ## Option 2: Using Docker (Coming Soon) 68 | 69 | Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. 70 | 71 | ## Option 3: Local Server Installation 72 | 73 | For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete. 
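In the meantime, if you want to experiment with a local HTTP wrapper around the crawler yourself, a minimal sketch could look like the one below. It assumes FastAPI and uvicorn are installed separately (`pip install fastapi uvicorn`); they are not Crawl4AI dependencies, and the module name, endpoint name, and response shape here are purely illustrative:

```hljs python
# local_server.py (illustrative name) - an unofficial sketch of a local
# crawling endpoint built on FastAPI; not part of Crawl4AI itself.
from fastapi import FastAPI
from crawl4ai import AsyncWebCrawler

app = FastAPI()

@app.get("/crawl")
async def crawl(url: str):
    # Crawl the requested URL and return markdown plus basic status info
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=url)
    return {
        "success": result.success,
        "status_code": result.status_code,
        "markdown": result.markdown if result.success else None,
        "error": result.error_message,
    }

```

Run it with `uvicorn local_server:app --port 8000` and query `http://localhost:8000/crawl?url=https://example.com`.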
74 | 75 | ## Verifying Your Installation 76 | 77 | After installation, you can verify that Crawl4AI is working correctly by running a simple Python script: 78 | 79 | ```hljs python 80 | import asyncio 81 | from crawl4ai import AsyncWebCrawler 82 | 83 | async def main(): 84 | async with AsyncWebCrawler(verbose=True) as crawler: 85 | result = await crawler.arun(url="https://www.example.com") 86 | print(result.markdown[:500]) # Print first 500 characters 87 | 88 | if __name__ == "__main__": 89 | asyncio.run(main()) 90 | 91 | ``` 92 | 93 | This script should successfully crawl the example website and print the first 500 characters of the extracted content. 94 | 95 | ## Getting Help 96 | 97 | If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues). 98 | 99 | Happy crawling! 🕷️🤖 100 | 101 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/json-css extractor basic - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # JSON CSS Extraction Strategy with AsyncWebCrawler 2 | 3 | The `JsonCssExtractionStrategy` is a powerful feature of Crawl4AI that allows you to extract structured data from web pages using CSS selectors. This method is particularly useful when you need to extract specific data points from a consistent HTML structure, such as tables or repeated elements. Here's how to use it with the AsyncWebCrawler. 4 | 5 | ## Overview 6 | 7 | The `JsonCssExtractionStrategy` works by defining a schema that specifies: 8 | 1\. A base CSS selector for the repeating elements 9 | 2\. Fields to extract from each element, each with its own CSS selector 10 | 11 | This strategy is fast and efficient, as it doesn't rely on external services like LLMs for extraction. 12 | 13 | ## Example: Extracting Cryptocurrency Prices from Coinbase 14 | 15 | Let's look at an example that extracts cryptocurrency prices from the Coinbase explore page. 
16 | 17 | ```hljs python 18 | import json 19 | import asyncio 20 | from crawl4ai import AsyncWebCrawler 21 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy 22 | 23 | async def extract_structured_data_using_css_extractor(): 24 | print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") 25 | 26 | # Define the extraction schema 27 | schema = { 28 | "name": "Coinbase Crypto Prices", 29 | "baseSelector": ".cds-tableRow-t45thuk", 30 | "fields": [\ 31 | {\ 32 | "name": "crypto",\ 33 | "selector": "td:nth-child(1) h2",\ 34 | "type": "text",\ 35 | },\ 36 | {\ 37 | "name": "symbol",\ 38 | "selector": "td:nth-child(1) p",\ 39 | "type": "text",\ 40 | },\ 41 | {\ 42 | "name": "price",\ 43 | "selector": "td:nth-child(2)",\ 44 | "type": "text",\ 45 | }\ 46 | ], 47 | } 48 | 49 | # Create the extraction strategy 50 | extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) 51 | 52 | # Use the AsyncWebCrawler with the extraction strategy 53 | async with AsyncWebCrawler(verbose=True) as crawler: 54 | result = await crawler.arun( 55 | url="https://www.coinbase.com/explore", 56 | extraction_strategy=extraction_strategy, 57 | bypass_cache=True, 58 | ) 59 | 60 | assert result.success, "Failed to crawl the page" 61 | 62 | # Parse the extracted content 63 | crypto_prices = json.loads(result.extracted_content) 64 | print(f"Successfully extracted {len(crypto_prices)} cryptocurrency prices") 65 | print(json.dumps(crypto_prices[0], indent=2)) 66 | 67 | return crypto_prices 68 | 69 | # Run the async function 70 | asyncio.run(extract_structured_data_using_css_extractor()) 71 | 72 | ``` 73 | 74 | ## Explanation of the Schema 75 | 76 | The schema defines how to extract the data: 77 | 78 | - `name`: A descriptive name for the extraction task. 79 | - `baseSelector`: The CSS selector for the repeating elements (in this case, table rows). 80 | - `fields`: An array of fields to extract from each element: 81 | - `name`: The name to give the extracted data. 82 | - `selector`: The CSS selector to find the specific data within the base element. 83 | - `type`: The type of data to extract (usually "text" for textual content). 84 | 85 | ## Advantages of JsonCssExtractionStrategy 86 | 87 | 1. **Speed**: CSS selectors are fast to execute, making this method efficient for large datasets. 88 | 2. **Precision**: You can target exactly the elements you need. 89 | 3. **Structured Output**: The result is already structured as JSON, ready for further processing. 90 | 4. **No External Dependencies**: Unlike LLM-based strategies, this doesn't require any API calls to external services. 91 | 92 | ## Tips for Using JsonCssExtractionStrategy 93 | 94 | 1. **Inspect the Page**: Use browser developer tools to identify the correct CSS selectors. 95 | 2. **Test Selectors**: Verify your selectors in the browser console before using them in the script. 96 | 3. **Handle Dynamic Content**: If the page uses JavaScript to load content, you may need to combine this with JS execution (see the Advanced Usage section). 97 | 4. **Error Handling**: Always check the `result.success` flag and handle potential failures. 
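To make tip 4 concrete, here is a minimal sketch of defensive handling around the Coinbase example above. It reuses the same `schema` and imports (`json`, `asyncio`, `AsyncWebCrawler`, `JsonCssExtractionStrategy`), and the helper name is illustrative:

```hljs python
async def extract_with_checks():
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.coinbase.com/explore",
            extraction_strategy=extraction_strategy,
            bypass_cache=True,
        )

        # Crawl-level failure: the page never loaded, so no selectors ran
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
            return []

        rows = json.loads(result.extracted_content) if result.extracted_content else []
        if not rows:
            # Usually means the CSS selectors no longer match the page markup
            print("Crawl succeeded, but no elements matched the schema selectors")
        return rows

asyncio.run(extract_with_checks())

```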
98 | 99 | ## Advanced Usage: Combining with JavaScript Execution 100 | 101 | For pages that load data dynamically, you can combine the `JsonCssExtractionStrategy` with JavaScript execution: 102 | 103 | ```hljs python 104 | async def extract_dynamic_structured_data(): 105 | schema = { 106 | "name": "Dynamic Crypto Prices", 107 | "baseSelector": ".crypto-row", 108 | "fields": [\ 109 | {"name": "name", "selector": ".crypto-name", "type": "text"},\ 110 | {"name": "price", "selector": ".crypto-price", "type": "text"},\ 111 | ] 112 | } 113 | 114 | js_code = """ 115 | window.scrollTo(0, document.body.scrollHeight); 116 | await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds 117 | """ 118 | 119 | extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) 120 | 121 | async with AsyncWebCrawler(verbose=True) as crawler: 122 | result = await crawler.arun( 123 | url="https://example.com/crypto-prices", 124 | extraction_strategy=extraction_strategy, 125 | js_code=js_code, 126 | wait_for=".crypto-row:nth-child(20)", # Wait for 20 rows to load 127 | bypass_cache=True, 128 | ) 129 | 130 | crypto_data = json.loads(result.extracted_content) 131 | print(f"Extracted {len(crypto_data)} cryptocurrency entries") 132 | 133 | asyncio.run(extract_dynamic_structured_data()) 134 | 135 | ``` 136 | 137 | This advanced example demonstrates how to: 138 | 1\. Execute JavaScript to trigger dynamic content loading. 139 | 2\. Wait for a specific condition (20 rows loaded) before extraction. 140 | 3\. Extract data from the dynamically loaded content. 141 | 142 | By mastering the `JsonCssExtractionStrategy`, you can efficiently extract structured data from a wide variety of web pages, making it a valuable tool in your web scraping toolkit. 143 | 144 | For more details on schema definitions and advanced extraction strategies, check out the [Advanced JsonCssExtraction](../css-advanced/). 145 | 146 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/llm strategy - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # LLM Extraction with AsyncWebCrawler 2 | 3 | Crawl4AI's AsyncWebCrawler allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. Below are two examples demonstrating how to use `LLMExtractionStrategy` for different purposes with the AsyncWebCrawler. 4 | 5 | ## Example 1: Extract Structured Data 6 | 7 | In this example, we use the `LLMExtractionStrategy` to extract structured data (model names and their fees) from the OpenAI pricing page. 
8 | 9 | ```hljs python 10 | import os 11 | import json 12 | import asyncio 13 | from crawl4ai import AsyncWebCrawler 14 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 15 | from pydantic import BaseModel, Field 16 | 17 | class OpenAIModelFee(BaseModel): 18 | model_name: str = Field(..., description="Name of the OpenAI model.") 19 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") 20 | output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") 21 | 22 | async def extract_openai_fees(): 23 | url = 'https://openai.com/api/pricing/' 24 | 25 | async with AsyncWebCrawler(verbose=True) as crawler: 26 | result = await crawler.arun( 27 | url=url, 28 | word_count_threshold=1, 29 | extraction_strategy=LLMExtractionStrategy( 30 | provider="openai/gpt-4o", # Or use ollama like provider="ollama/nemotron" 31 | api_token=os.getenv('OPENAI_API_KEY'), 32 | schema=OpenAIModelFee.model_json_schema(), 33 | extraction_type="schema", 34 | instruction="From the crawled content, extract all mentioned model names along with their " 35 | "fees for input and output tokens. Make sure not to miss anything in the entire content. " 36 | 'One extracted model JSON format should look like this: ' 37 | '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' 38 | ), 39 | bypass_cache=True, 40 | ) 41 | 42 | model_fees = json.loads(result.extracted_content) 43 | print(f"Number of models extracted: {len(model_fees)}") 44 | 45 | with open(".data/openai_fees.json", "w", encoding="utf-8") as f: 46 | json.dump(model_fees, f, indent=2) 47 | 48 | asyncio.run(extract_openai_fees()) 49 | 50 | ``` 51 | 52 | ## Example 2: Extract Relevant Content 53 | 54 | In this example, we instruct the LLM to extract only content related to technology from the NBC News business page. 
55 | 56 | ```hljs python 57 | import os 58 | import json 59 | import asyncio 60 | from crawl4ai import AsyncWebCrawler 61 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 62 | 63 | async def extract_tech_content(): 64 | async with AsyncWebCrawler(verbose=True) as crawler: 65 | result = await crawler.arun( 66 | url="https://www.nbcnews.com/business", 67 | extraction_strategy=LLMExtractionStrategy( 68 | provider="openai/gpt-4o", 69 | api_token=os.getenv('OPENAI_API_KEY'), 70 | instruction="Extract only content related to technology" 71 | ), 72 | bypass_cache=True, 73 | ) 74 | 75 | tech_content = json.loads(result.extracted_content) 76 | print(f"Number of tech-related items extracted: {len(tech_content)}") 77 | 78 | with open(".data/tech_content.json", "w", encoding="utf-8") as f: 79 | json.dump(tech_content, f, indent=2) 80 | 81 | asyncio.run(extract_tech_content()) 82 | 83 | ``` 84 | 85 | ## Advanced Usage: Combining JS Execution with LLM Extraction 86 | 87 | This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content: 88 | 89 | ```hljs python 90 | async def extract_dynamic_content(): 91 | js_code = """ 92 | const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); 93 | if (loadMoreButton) { 94 | loadMoreButton.click(); 95 | await new Promise(resolve => setTimeout(resolve, 2000)); 96 | } 97 | """ 98 | 99 | wait_for = """ 100 | () => { 101 | const articles = document.querySelectorAll('article.tease-card'); 102 | return articles.length > 10; 103 | } 104 | """ 105 | 106 | async with AsyncWebCrawler(verbose=True) as crawler: 107 | result = await crawler.arun( 108 | url="https://www.nbcnews.com/business", 109 | js_code=js_code, 110 | wait_for=wait_for, 111 | css_selector="article.tease-card", 112 | extraction_strategy=LLMExtractionStrategy( 113 | provider="openai/gpt-4o", 114 | api_token=os.getenv('OPENAI_API_KEY'), 115 | instruction="Summarize each article, focusing on technology-related content" 116 | ), 117 | bypass_cache=True, 118 | ) 119 | 120 | summaries = json.loads(result.extracted_content) 121 | print(f"Number of summarized articles: {len(summaries)}") 122 | 123 | with open(".data/tech_summaries.json", "w", encoding="utf-8") as f: 124 | json.dump(summaries, f, indent=2) 125 | 126 | asyncio.run(extract_dynamic_content()) 127 | 128 | ``` 129 | 130 | ## Customizing LLM Provider 131 | 132 | Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token: 133 | 134 | ```hljs makefile 135 | extraction_strategy=LLMExtractionStrategy( 136 | provider="your_llm_provider/model_name", 137 | api_token="your_api_token", 138 | instruction="Your extraction instruction" 139 | ) 140 | 141 | ``` 142 | 143 | This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs. 144 | 145 | ## Error Handling and Retries 146 | 147 | When working with external LLM APIs, it's important to handle potential errors and implement retry logic. 
Here's an example of how you might do this: 148 | 149 | ```hljs python 150 | import asyncio 151 | from tenacity import retry, stop_after_attempt, wait_exponential 152 | 153 | class LLMExtractionError(Exception): 154 | pass 155 | 156 | @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) 157 | async def extract_with_retry(crawler, url, extraction_strategy): 158 | try: 159 | result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True) 160 | return json.loads(result.extracted_content) 161 | except Exception as e: 162 | raise LLMExtractionError(f"Failed to extract content: {str(e)}") 163 | 164 | async def main(): 165 | async with AsyncWebCrawler(verbose=True) as crawler: 166 | try: 167 | content = await extract_with_retry( 168 | crawler, 169 | "https://www.example.com", 170 | LLMExtractionStrategy( 171 | provider="openai/gpt-4o", 172 | api_token=os.getenv('OPENAI_API_KEY'), 173 | instruction="Extract and summarize main points" 174 | ) 175 | ) 176 | print("Extracted content:", content) 177 | except LLMExtractionError as e: 178 | print(f"Extraction failed after retries: {e}") 179 | 180 | asyncio.run(main()) 181 | 182 | ``` 183 | 184 | This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which can help handle temporary failures or rate limiting from the LLM API. 185 | 186 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/magic mode - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Magic Mode & Anti-Bot Protection 2 | 3 | Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution. 4 | 5 | ## Magic Mode 6 | 7 | The easiest way to bypass anti-bot protections: 8 | 9 | ```hljs csharp 10 | async with AsyncWebCrawler() as crawler: 11 | result = await crawler.arun( 12 | url="https://example.com", 13 | magic=True # Enables all anti-detection features 14 | ) 15 | 16 | ``` 17 | 18 | Magic Mode automatically: 19 | \- Masks browser automation signals 20 | \- Simulates human-like behavior 21 | \- Overrides navigator properties 22 | \- Handles cookie consent popups 23 | \- Manages browser fingerprinting 24 | \- Randomizes timing patterns 25 | 26 | ## Manual Anti-Bot Options 27 | 28 | While Magic Mode is recommended, you can also configure individual anti-detection features: 29 | 30 | ```hljs python 31 | result = await crawler.arun( 32 | url="https://example.com", 33 | simulate_user=True, # Simulate human behavior 34 | override_navigator=True # Mask automation signals 35 | ) 36 | 37 | ``` 38 | 39 | Note: When `magic=True` is used, you don't need to set these individual options. 
40 | 41 | ## Example: Handling Protected Sites 42 | 43 | ```hljs python 44 | async def crawl_protected_site(url: str): 45 | async with AsyncWebCrawler(headless=True) as crawler: 46 | result = await crawler.arun( 47 | url=url, 48 | magic=True, 49 | remove_overlay_elements=True, # Remove popups/modals 50 | page_timeout=60000 # Increased timeout for protection checks 51 | ) 52 | 53 | return result.markdown if result.success else None 54 | 55 | ``` 56 | 57 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/output formats - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Output Formats 2 | 3 | Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction. 4 | 5 | ## Basic Formats 6 | 7 | ```hljs ini 8 | result = await crawler.arun(url="https://example.com") 9 | 10 | # Access different formats 11 | raw_html = result.html # Original HTML 12 | clean_html = result.cleaned_html # Sanitized HTML 13 | markdown = result.markdown # Standard markdown 14 | fit_md = result.fit_markdown # Most relevant content in markdown 15 | 16 | ``` 17 | 18 | ## Raw HTML 19 | 20 | Original, unmodified HTML from the webpage. Useful when you need to: 21 | \- Preserve the exact page structure 22 | \- Process HTML with your own tools 23 | \- Debug page issues 24 | 25 | ```hljs python 26 | result = await crawler.arun(url="https://example.com") 27 | print(result.html) # Complete HTML including headers, scripts, etc. 28 | 29 | ``` 30 | 31 | ## Cleaned HTML 32 | 33 | Sanitized HTML with unnecessary elements removed. Automatically: 34 | \- Removes scripts and styles 35 | \- Cleans up formatting 36 | \- Preserves semantic structure 37 | 38 | ```hljs python 39 | result = await crawler.arun( 40 | url="https://example.com", 41 | excluded_tags=['form', 'header', 'footer'], # Additional tags to remove 42 | keep_data_attributes=False # Remove data-* attributes 43 | ) 44 | print(result.cleaned_html) 45 | 46 | ``` 47 | 48 | ## Standard Markdown 49 | 50 | HTML converted to clean markdown format. Great for: 51 | \- Content analysis 52 | \- Documentation 53 | \- Readability 54 | 55 | ```hljs python 56 | result = await crawler.arun( 57 | url="https://example.com", 58 | include_links_on_markdown=True # Include links in markdown 59 | ) 60 | print(result.markdown) 61 | 62 | ``` 63 | 64 | ## Fit Markdown 65 | 66 | Most relevant content extracted and converted to markdown. Ideal for: 67 | \- Article extraction 68 | \- Main content focus 69 | \- Removing boilerplate 70 | 71 | ```hljs python 72 | result = await crawler.arun(url="https://example.com") 73 | print(result.fit_markdown) # Only the main content 74 | 75 | ``` 76 | 77 | ## Structured Data Extraction 78 | 79 | Crawl4AI offers two powerful approaches for structured data extraction: 80 | 81 | ### 1\. LLM-Based Extraction 82 | 83 | Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy: 84 | 85 | ```hljs python 86 | from pydantic import BaseModel 87 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 88 | 89 | class KnowledgeGraph(BaseModel): 90 | entities: List[dict] 91 | relationships: List[dict] 92 | 93 | strategy = LLMExtractionStrategy( 94 | provider="ollama/nemotron", # or "huggingface/...", "ollama/..." 
95 | api_token="your-token", # not needed for Ollama 96 | schema=KnowledgeGraph.schema(), 97 | instruction="Extract entities and relationships from the content" 98 | ) 99 | 100 | result = await crawler.arun( 101 | url="https://example.com", 102 | extraction_strategy=strategy 103 | ) 104 | knowledge_graph = json.loads(result.extracted_content) 105 | 106 | ``` 107 | 108 | ### 2\. Pattern-Based Extraction 109 | 110 | For pages with repetitive patterns (e.g., product listings, article feeds), use JsonCssExtractionStrategy: 111 | 112 | ```hljs makefile 113 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy 114 | 115 | schema = { 116 | "name": "Product Listing", 117 | "baseSelector": ".product-card", # Repeated element 118 | "fields": [\ 119 | {"name": "title", "selector": "h2", "type": "text"},\ 120 | {"name": "price", "selector": ".price", "type": "text"},\ 121 | {"name": "description", "selector": ".desc", "type": "text"}\ 122 | ] 123 | } 124 | 125 | strategy = JsonCssExtractionStrategy(schema) 126 | result = await crawler.arun( 127 | url="https://example.com", 128 | extraction_strategy=strategy 129 | ) 130 | products = json.loads(result.extracted_content) 131 | 132 | ``` 133 | 134 | ## Content Customization 135 | 136 | ### HTML to Text Options 137 | 138 | Configure markdown conversion: 139 | 140 | ```hljs python 141 | result = await crawler.arun( 142 | url="https://example.com", 143 | html2text={ 144 | "escape_dot": False, 145 | "body_width": 0, 146 | "protect_links": True, 147 | "unicode_snob": True 148 | } 149 | ) 150 | 151 | ``` 152 | 153 | ### Content Filters 154 | 155 | Control what content is included: 156 | 157 | ```hljs python 158 | result = await crawler.arun( 159 | url="https://example.com", 160 | word_count_threshold=10, # Minimum words per block 161 | exclude_external_links=True, # Remove external links 162 | exclude_external_images=True, # Remove external images 163 | excluded_tags=['form', 'nav'] # Remove specific HTML tags 164 | ) 165 | 166 | ``` 167 | 168 | ## Comprehensive Example 169 | 170 | Here's how to use multiple output formats together: 171 | 172 | ```hljs python 173 | async def crawl_content(url: str): 174 | async with AsyncWebCrawler() as crawler: 175 | # Extract main content with fit markdown 176 | result = await crawler.arun( 177 | url=url, 178 | word_count_threshold=10, 179 | exclude_external_links=True 180 | ) 181 | 182 | # Get structured data using LLM 183 | llm_result = await crawler.arun( 184 | url=url, 185 | extraction_strategy=LLMExtractionStrategy( 186 | provider="ollama/nemotron", 187 | schema=YourSchema.schema(), 188 | instruction="Extract key information" 189 | ) 190 | ) 191 | 192 | # Get repeated patterns (if any) 193 | pattern_result = await crawler.arun( 194 | url=url, 195 | extraction_strategy=JsonCssExtractionStrategy(your_schema) 196 | ) 197 | 198 | return { 199 | "main_content": result.fit_markdown, 200 | "structured_data": json.loads(llm_result.extracted_content), 201 | "pattern_data": json.loads(pattern_result.extracted_content), 202 | "media": result.media 203 | } 204 | 205 | ``` 206 | 207 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/overview - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Extraction Strategies Overview 2 | 3 | Crawl4AI provides powerful extraction strategies to help you get structured data from web pages. 
Each strategy is designed for specific use cases and offers different approaches to data extraction. 4 | 5 | ## Available Strategies 6 | 7 | ### [LLM-Based Extraction](../llm/) 8 | 9 | `LLMExtractionStrategy` uses Language Models to extract structured data from web content. This approach is highly flexible and can understand content semantically. 10 | 11 | ```hljs python 12 | from pydantic import BaseModel 13 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 14 | 15 | class Product(BaseModel): 16 | name: str 17 | price: float 18 | description: str 19 | 20 | strategy = LLMExtractionStrategy( 21 | provider="ollama/llama2", 22 | schema=Product.schema(), 23 | instruction="Extract product details from the page" 24 | ) 25 | 26 | result = await crawler.arun( 27 | url="https://example.com/product", 28 | extraction_strategy=strategy 29 | ) 30 | 31 | ``` 32 | 33 | **Best for:** 34 | \- Complex data structures 35 | \- Content requiring interpretation 36 | \- Flexible content formats 37 | \- Natural language processing 38 | 39 | ### [CSS-Based Extraction](../css/) 40 | 41 | `JsonCssExtractionStrategy` extracts data using CSS selectors. This is fast, reliable, and perfect for consistently structured pages. 42 | 43 | ```hljs javascript 44 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy 45 | 46 | schema = { 47 | "name": "Product Listing", 48 | "baseSelector": ".product-card", 49 | "fields": [\ 50 | {"name": "title", "selector": "h2", "type": "text"},\ 51 | {"name": "price", "selector": ".price", "type": "text"},\ 52 | {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}\ 53 | ] 54 | } 55 | 56 | strategy = JsonCssExtractionStrategy(schema) 57 | 58 | result = await crawler.arun( 59 | url="https://example.com/products", 60 | extraction_strategy=strategy 61 | ) 62 | 63 | ``` 64 | 65 | **Best for:** 66 | \- E-commerce product listings 67 | \- News article collections 68 | \- Structured content pages 69 | \- High-performance needs 70 | 71 | ### [Cosine Strategy](../cosine/) 72 | 73 | `CosineStrategy` uses similarity-based clustering to identify and extract relevant content sections. 74 | 75 | ```hljs python 76 | from crawl4ai.extraction_strategy import CosineStrategy 77 | 78 | strategy = CosineStrategy( 79 | semantic_filter="product reviews", # Content focus 80 | word_count_threshold=10, # Minimum words per cluster 81 | sim_threshold=0.3, # Similarity threshold 82 | max_dist=0.2, # Maximum cluster distance 83 | top_k=3 # Number of top clusters to extract 84 | ) 85 | 86 | result = await crawler.arun( 87 | url="https://example.com/reviews", 88 | extraction_strategy=strategy 89 | ) 90 | 91 | ``` 92 | 93 | **Best for:** 94 | \- Content similarity analysis 95 | \- Topic clustering 96 | \- Relevant content extraction 97 | \- Pattern recognition in text 98 | 99 | ## Strategy Selection Guide 100 | 101 | Choose your strategy based on these factors: 102 | 103 | 01. **Content Structure** 104 | 02. Well-structured HTML → Use CSS Strategy 105 | 03. Natural language text → Use LLM Strategy 106 | 04. Mixed/Complex content → Use Cosine Strategy 107 | 108 | 05. **Performance Requirements** 109 | 110 | 06. Fastest: CSS Strategy 111 | 07. Moderate: Cosine Strategy 112 | 08. Variable: LLM Strategy (depends on provider) 113 | 114 | 09. **Accuracy Needs** 115 | 116 | 10. Highest structure accuracy: CSS Strategy 117 | 11. Best semantic understanding: LLM Strategy 118 | 12. 
Best content relevance: Cosine Strategy 119 | 120 | ## Combining Strategies 121 | 122 | You can combine strategies for more powerful extraction: 123 | 124 | ```hljs rust 125 | # First use CSS strategy for initial structure 126 | css_result = await crawler.arun( 127 | url="https://example.com", 128 | extraction_strategy=css_strategy 129 | ) 130 | 131 | # Then use LLM for semantic analysis 132 | llm_result = await crawler.arun( 133 | url="https://example.com", 134 | extraction_strategy=llm_strategy 135 | ) 136 | 137 | ``` 138 | 139 | ## Common Use Cases 140 | 141 | 1. **E-commerce Scraping** 142 | 143 | 144 | 145 | ```hljs graphql 146 | # CSS Strategy for product listings 147 | schema = { 148 | "name": "Products", 149 | "baseSelector": ".product", 150 | "fields": [\ 151 | {"name": "name", "selector": ".title", "type": "text"},\ 152 | {"name": "price", "selector": ".price", "type": "text"}\ 153 | ] 154 | } 155 | 156 | ``` 157 | 158 | 2. **News Article Extraction** 159 | 160 | 161 | 162 | ```hljs python 163 | # LLM Strategy for article content 164 | class Article(BaseModel): 165 | title: str 166 | content: str 167 | author: str 168 | date: str 169 | 170 | strategy = LLMExtractionStrategy( 171 | provider="ollama/llama2", 172 | schema=Article.schema() 173 | ) 174 | 175 | ``` 176 | 177 | 3. **Content Analysis** 178 | 179 | 180 | 181 | ```hljs makefile 182 | # Cosine Strategy for topic analysis 183 | strategy = CosineStrategy( 184 | semantic_filter="technology trends", 185 | top_k=5 186 | ) 187 | 188 | ``` 189 | 190 | 191 | ## Best Practices 192 | 193 | 1. **Choose the Right Strategy** 194 | 2. Start with CSS for structured data 195 | 3. Use LLM for complex interpretation 196 | 4. Try Cosine for content relevance 197 | 198 | 5. **Optimize Performance** 199 | 200 | 6. Cache LLM results 201 | 7. Keep CSS selectors specific 202 | 8. Tune similarity thresholds 203 | 204 | 9. **Handle Errors** 205 | 206 | 207 | 208 | ```hljs python 209 | result = await crawler.arun( 210 | url="https://example.com", 211 | extraction_strategy=strategy 212 | ) 213 | 214 | if not result.success: 215 | print(f"Extraction failed: {result.error_message}") 216 | else: 217 | data = json.loads(result.extracted_content) 218 | 219 | ``` 220 | 221 | 222 | Each strategy has its strengths and optimal use cases. Explore the detailed documentation for each strategy to learn more about their specific features and configurations. 223 | 224 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/page interaction - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Page Interaction 2 | 3 | Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events. 
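As a quick orientation before the detailed sections below, here is a minimal sketch of the typical flow: execute a snippet of JavaScript, wait for the content it triggers, then read the result. The CSS selector is illustrative and would need to match the target page.

```hljs python
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);",  # trigger lazy-loaded content
    wait_for="css:.dynamic-content",  # block until the new content appears
)
print(result.markdown)

```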
4 | 5 | ## JavaScript Execution 6 | 7 | ### Basic Execution 8 | 9 | ```hljs makefile 10 | # Single JavaScript command 11 | result = await crawler.arun( 12 | url="https://example.com", 13 | js_code="window.scrollTo(0, document.body.scrollHeight);" 14 | ) 15 | 16 | # Multiple commands 17 | js_commands = [\ 18 | "window.scrollTo(0, document.body.scrollHeight);",\ 19 | "document.querySelector('.load-more').click();",\ 20 | "document.querySelector('#consent-button').click();"\ 21 | ] 22 | result = await crawler.arun( 23 | url="https://example.com", 24 | js_code=js_commands 25 | ) 26 | 27 | ``` 28 | 29 | ## Wait Conditions 30 | 31 | ### CSS-Based Waiting 32 | 33 | Wait for elements to appear: 34 | 35 | ```hljs vbnet 36 | result = await crawler.arun( 37 | url="https://example.com", 38 | wait_for="css:.dynamic-content" # Wait for element with class 'dynamic-content' 39 | ) 40 | 41 | ``` 42 | 43 | ### JavaScript-Based Waiting 44 | 45 | Wait for custom conditions: 46 | 47 | ```hljs python 48 | # Wait for number of elements 49 | wait_condition = """() => { 50 | return document.querySelectorAll('.item').length > 10; 51 | }""" 52 | 53 | result = await crawler.arun( 54 | url="https://example.com", 55 | wait_for=f"js:{wait_condition}" 56 | ) 57 | 58 | # Wait for dynamic content to load 59 | wait_for_content = """() => { 60 | const content = document.querySelector('.content'); 61 | return content && content.innerText.length > 100; 62 | }""" 63 | 64 | result = await crawler.arun( 65 | url="https://example.com", 66 | wait_for=f"js:{wait_for_content}" 67 | ) 68 | 69 | ``` 70 | 71 | ## Handling Dynamic Content 72 | 73 | ### Load More Content 74 | 75 | Handle infinite scroll or load more buttons: 76 | 77 | ```hljs makefile 78 | # Scroll and wait pattern 79 | result = await crawler.arun( 80 | url="https://example.com", 81 | js_code=[\ 82 | # Scroll to bottom\ 83 | "window.scrollTo(0, document.body.scrollHeight);",\ 84 | # Click load more if exists\ 85 | "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"\ 86 | ], 87 | # Wait for new content 88 | wait_for="js:() => document.querySelectorAll('.item').length > previousCount" 89 | ) 90 | 91 | ``` 92 | 93 | ### Form Interaction 94 | 95 | Handle forms and inputs: 96 | 97 | ```hljs python 98 | js_form_interaction = """ 99 | // Fill form fields 100 | document.querySelector('#search').value = 'search term'; 101 | // Submit form 102 | document.querySelector('form').submit(); 103 | """ 104 | 105 | result = await crawler.arun( 106 | url="https://example.com", 107 | js_code=js_form_interaction, 108 | wait_for="css:.results" # Wait for results to load 109 | ) 110 | 111 | ``` 112 | 113 | ## Timing Control 114 | 115 | ### Delays and Timeouts 116 | 117 | Control timing of interactions: 118 | 119 | ```hljs makefile 120 | result = await crawler.arun( 121 | url="https://example.com", 122 | page_timeout=60000, # Page load timeout (ms) 123 | delay_before_return_html=2.0, # Wait before capturing content 124 | ) 125 | 126 | ``` 127 | 128 | ## Complex Interactions Example 129 | 130 | Here's an example of handling a dynamic page with multiple interactions: 131 | 132 | ```hljs python 133 | async def crawl_dynamic_content(): 134 | async with AsyncWebCrawler() as crawler: 135 | # Initial page load 136 | result = await crawler.arun( 137 | url="https://example.com", 138 | # Handle cookie consent 139 | js_code="document.querySelector('.cookie-accept')?.click();", 140 | wait_for="css:.main-content" 141 | ) 142 | 143 | # Load more content 144 | session_id = 
"dynamic_session" # Keep session for multiple interactions 145 | 146 | for page in range(3): # Load 3 pages of content 147 | result = await crawler.arun( 148 | url="https://example.com", 149 | session_id=session_id, 150 | js_code=[\ 151 | # Scroll to bottom\ 152 | "window.scrollTo(0, document.body.scrollHeight);",\ 153 | # Store current item count\ 154 | "window.previousCount = document.querySelectorAll('.item').length;",\ 155 | # Click load more\ 156 | "document.querySelector('.load-more')?.click();"\ 157 | ], 158 | # Wait for new items 159 | wait_for="""() => { 160 | const currentCount = document.querySelectorAll('.item').length; 161 | return currentCount > window.previousCount; 162 | }""", 163 | # Only execute JS without reloading page 164 | js_only=True if page > 0 else False 165 | ) 166 | 167 | # Process content after each load 168 | print(f"Page {page + 1} items:", len(result.cleaned_html)) 169 | 170 | # Clean up session 171 | await crawler.crawler_strategy.kill_session(session_id) 172 | 173 | ``` 174 | 175 | ## Using with Extraction Strategies 176 | 177 | Combine page interaction with structured extraction: 178 | 179 | ```hljs python 180 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy 181 | 182 | # Pattern-based extraction after interaction 183 | schema = { 184 | "name": "Dynamic Items", 185 | "baseSelector": ".item", 186 | "fields": [\ 187 | {"name": "title", "selector": "h2", "type": "text"},\ 188 | {"name": "description", "selector": ".desc", "type": "text"}\ 189 | ] 190 | } 191 | 192 | result = await crawler.arun( 193 | url="https://example.com", 194 | js_code="window.scrollTo(0, document.body.scrollHeight);", 195 | wait_for="css:.item:nth-child(10)", # Wait for 10 items 196 | extraction_strategy=JsonCssExtractionStrategy(schema) 197 | ) 198 | 199 | # Or use LLM to analyze dynamic content 200 | class ContentAnalysis(BaseModel): 201 | topics: List[str] 202 | summary: str 203 | 204 | result = await crawler.arun( 205 | url="https://example.com", 206 | js_code="document.querySelector('.show-more').click();", 207 | wait_for="css:.full-content", 208 | extraction_strategy=LLMExtractionStrategy( 209 | provider="ollama/nemotron", 210 | schema=ContentAnalysis.schema(), 211 | instruction="Analyze the full content" 212 | ) 213 | ) 214 | 215 | ``` 216 | 217 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/parameters table - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Parameter Reference Table 2 | 3 | | File Name | Parameter Name | Code Usage | Strategy/Class | Description | 4 | | --- | --- | --- | --- | --- | 5 | | async\_crawler\_strategy.py | user\_agent | `kwargs.get("user_agent")` | AsyncPlaywrightCrawlerStrategy | User agent string for browser identification | 6 | | async\_crawler\_strategy.py | proxy | `kwargs.get("proxy")` | AsyncPlaywrightCrawlerStrategy | Proxy server configuration for network requests | 7 | | async\_crawler\_strategy.py | proxy\_config | `kwargs.get("proxy_config")` | AsyncPlaywrightCrawlerStrategy | Detailed proxy configuration including auth | 8 | | async\_crawler\_strategy.py | headless | `kwargs.get("headless", True)` | AsyncPlaywrightCrawlerStrategy | Whether to run browser in headless mode | 9 | | async\_crawler\_strategy.py | browser\_type | `kwargs.get("browser_type", "chromium")` | AsyncPlaywrightCrawlerStrategy | Type of browser to use (chromium/firefox/webkit) | 10 | | 
async\_crawler\_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests | 11 | | async\_crawler\_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output | 12 | | async\_crawler\_strategy.py | sleep\_on\_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser | 13 | | async\_crawler\_strategy.py | use\_managed\_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance | 14 | | async\_crawler\_strategy.py | user\_data\_dir | `kwargs.get("user_data_dir", None)` | AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data | 15 | | async\_crawler\_strategy.py | session\_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session | 16 | | async\_crawler\_strategy.py | override\_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties | 17 | | async\_crawler\_strategy.py | simulate\_user | `kwargs.get("simulate_user", False)` | AsyncPlaywrightCrawlerStrategy | Simulate human-like behavior | 18 | | async\_crawler\_strategy.py | magic | `kwargs.get("magic", False)` | AsyncPlaywrightCrawlerStrategy | Enable advanced anti-detection features | 19 | | async\_crawler\_strategy.py | log\_console | `kwargs.get("log_console", False)` | AsyncPlaywrightCrawlerStrategy | Log browser console messages | 20 | | async\_crawler\_strategy.py | js\_only | `kwargs.get("js_only", False)` | AsyncPlaywrightCrawlerStrategy | Only execute JavaScript without page load | 21 | | async\_crawler\_strategy.py | page\_timeout | `kwargs.get("page_timeout", 60000)` | AsyncPlaywrightCrawlerStrategy | Timeout for page load in milliseconds | 22 | | async\_crawler\_strategy.py | ignore\_body\_visibility | `kwargs.get("ignore_body_visibility", True)` | AsyncPlaywrightCrawlerStrategy | Process page even if body is hidden | 23 | | async\_crawler\_strategy.py | js\_code | `kwargs.get("js_code", kwargs.get("js", self.js_code))` | AsyncPlaywrightCrawlerStrategy | Custom JavaScript code to execute | 24 | | async\_crawler\_strategy.py | wait\_for | `kwargs.get("wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait for specific element/condition | 25 | | async\_crawler\_strategy.py | process\_iframes | `kwargs.get("process_iframes", False)` | AsyncPlaywrightCrawlerStrategy | Extract content from iframes | 26 | | async\_crawler\_strategy.py | delay\_before\_return\_html | `kwargs.get("delay_before_return_html")` | AsyncPlaywrightCrawlerStrategy | Additional delay before returning HTML | 27 | | async\_crawler\_strategy.py | remove\_overlay\_elements | `kwargs.get("remove_overlay_elements", False)` | AsyncPlaywrightCrawlerStrategy | Remove pop-ups and overlay elements | 28 | | async\_crawler\_strategy.py | screenshot | `kwargs.get("screenshot")` | AsyncPlaywrightCrawlerStrategy | Take page screenshot | 29 | | async\_crawler\_strategy.py | screenshot\_wait\_for | `kwargs.get("screenshot_wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait before taking screenshot | 30 | | async\_crawler\_strategy.py | semaphore\_count | `kwargs.get("semaphore_count", 5)` | AsyncPlaywrightCrawlerStrategy | Concurrent request limit | 31 | | async\_webcrawler.py | verbose | `kwargs.get("verbose", False)` | AsyncWebCrawler | Enable detailed logging | 32 | | async\_webcrawler.py | warmup | 
`kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | 33 | | async\_webcrawler.py | session\_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | 34 | | async\_webcrawler.py | only\_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | 35 | | async\_webcrawler.py | bypass\_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | 36 | 37 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/proxy security - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Proxy & Security 2 | 3 | Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction. 4 | 5 | ## Basic Proxy Setup 6 | 7 | Simple proxy configuration: 8 | 9 | ```hljs csharp 10 | # Using proxy URL 11 | async with AsyncWebCrawler( 12 | proxy="http://proxy.example.com:8080" 13 | ) as crawler: 14 | result = await crawler.arun(url="https://example.com") 15 | 16 | # Using SOCKS proxy 17 | async with AsyncWebCrawler( 18 | proxy="socks5://proxy.example.com:1080" 19 | ) as crawler: 20 | result = await crawler.arun(url="https://example.com") 21 | 22 | ``` 23 | 24 | ## Authenticated Proxy 25 | 26 | Use proxy with authentication: 27 | 28 | ```hljs csharp 29 | proxy_config = { 30 | "server": "http://proxy.example.com:8080", 31 | "username": "user", 32 | "password": "pass" 33 | } 34 | 35 | async with AsyncWebCrawler(proxy_config=proxy_config) as crawler: 36 | result = await crawler.arun(url="https://example.com") 37 | 38 | ``` 39 | 40 | ## Rotating Proxies 41 | 42 | Example using a proxy rotation service: 43 | 44 | ```hljs python 45 | async def get_next_proxy(): 46 | # Your proxy rotation logic here 47 | return {"server": "http://next.proxy.com:8080"} 48 | 49 | async with AsyncWebCrawler() as crawler: 50 | # Update proxy for each request 51 | for url in urls: 52 | proxy = await get_next_proxy() 53 | crawler.update_proxy(proxy) 54 | result = await crawler.arun(url=url) 55 | 56 | ``` 57 | 58 | ## Custom Headers 59 | 60 | Add security-related headers: 61 | 62 | ```hljs csharp 63 | headers = { 64 | "X-Forwarded-For": "203.0.113.195", 65 | "Accept-Language": "en-US,en;q=0.9", 66 | "Cache-Control": "no-cache", 67 | "Pragma": "no-cache" 68 | } 69 | 70 | async with AsyncWebCrawler(headers=headers) as crawler: 71 | result = await crawler.arun(url="https://example.com") 72 | 73 | ``` 74 | 75 | ## Combining with Magic Mode 76 | 77 | For maximum protection, combine proxy with Magic Mode: 78 | 79 | ```hljs python 80 | async with AsyncWebCrawler( 81 | proxy="http://proxy.example.com:8080", 82 | headers={"Accept-Language": "en-US"} 83 | ) as crawler: 84 | result = await crawler.arun( 85 | url="https://example.com", 86 | magic=True # Enable all anti-detection features 87 | ) 88 | 89 | ``` 90 | 91 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/session management - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Session Management 2 | 3 | Session management in Crawl4AI allows you to maintain state across multiple requests and handle complex multi-page crawling tasks, particularly useful for dynamic websites. 
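The sections below walk through each piece; as a compact sketch of the overall lifecycle (URLs and the session name are placeholders), the pattern is: choose a `session_id`, reuse it across `arun()` calls so browser state carries over, and always release the session, even when a crawl fails.

```hljs python
async with AsyncWebCrawler() as crawler:
    session_id = "checkout_flow_session"
    try:
        # Cookies and page state persist across calls sharing this session_id
        await crawler.arun(url="https://example.com/login", session_id=session_id)
        result = await crawler.arun(url="https://example.com/account", session_id=session_id)
    finally:
        # Release the underlying browser session when finished
        await crawler.crawler_strategy.kill_session(session_id)

```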
4 | 5 | ## Basic Session Usage 6 | 7 | Use `session_id` to maintain state between requests: 8 | 9 | ```hljs csharp 10 | async with AsyncWebCrawler() as crawler: 11 | session_id = "my_session" 12 | 13 | # First request 14 | result1 = await crawler.arun( 15 | url="https://example.com/page1", 16 | session_id=session_id 17 | ) 18 | 19 | # Subsequent request using same session 20 | result2 = await crawler.arun( 21 | url="https://example.com/page2", 22 | session_id=session_id 23 | ) 24 | 25 | # Clean up when done 26 | await crawler.crawler_strategy.kill_session(session_id) 27 | 28 | ``` 29 | 30 | ## Dynamic Content with Sessions 31 | 32 | Here's a real-world example of crawling GitHub commits across multiple pages: 33 | 34 | ```hljs python 35 | async def crawl_dynamic_content(): 36 | async with AsyncWebCrawler(verbose=True) as crawler: 37 | url = "https://github.com/microsoft/TypeScript/commits/main" 38 | session_id = "typescript_commits_session" 39 | all_commits = [] 40 | 41 | # Define navigation JavaScript 42 | js_next_page = """ 43 | const button = document.querySelector('a[data-testid="pagination-next-button"]'); 44 | if (button) button.click(); 45 | """ 46 | 47 | # Define wait condition 48 | wait_for = """() => { 49 | const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); 50 | if (commits.length === 0) return false; 51 | const firstCommit = commits[0].textContent.trim(); 52 | return firstCommit !== window.firstCommit; 53 | }""" 54 | 55 | # Define extraction schema 56 | schema = { 57 | "name": "Commit Extractor", 58 | "baseSelector": "li.Box-sc-g0xbh4-0", 59 | "fields": [\ 60 | {\ 61 | "name": "title",\ 62 | "selector": "h4.markdown-title",\ 63 | "type": "text",\ 64 | "transform": "strip",\ 65 | },\ 66 | ], 67 | } 68 | extraction_strategy = JsonCssExtractionStrategy(schema) 69 | 70 | # Crawl multiple pages 71 | for page in range(3): 72 | result = await crawler.arun( 73 | url=url, 74 | session_id=session_id, 75 | extraction_strategy=extraction_strategy, 76 | js_code=js_next_page if page > 0 else None, 77 | wait_for=wait_for if page > 0 else None, 78 | js_only=page > 0, 79 | bypass_cache=True 80 | ) 81 | 82 | if result.success: 83 | commits = json.loads(result.extracted_content) 84 | all_commits.extend(commits) 85 | print(f"Page {page + 1}: Found {len(commits)} commits") 86 | 87 | # Clean up session 88 | await crawler.crawler_strategy.kill_session(session_id) 89 | return all_commits 90 | 91 | ``` 92 | 93 | ## Session Best Practices 94 | 95 | 1. **Session Naming**: 96 | 97 | 98 | 99 | 100 | ```hljs ini 101 | # Use descriptive session IDs 102 | session_id = "login_flow_session" 103 | session_id = "product_catalog_session" 104 | 105 | ``` 106 | 107 | 2. **Resource Management**: 108 | 109 | 110 | 111 | 112 | ```hljs python 113 | try: 114 | # Your crawling code 115 | pass 116 | finally: 117 | # Always clean up sessions 118 | await crawler.crawler_strategy.kill_session(session_id) 119 | 120 | ``` 121 | 122 | 3. **State Management**: 123 | 124 | 125 | 126 | 127 | ```hljs makefile 128 | # First page: login 129 | result = await crawler.arun( 130 | url="https://example.com/login", 131 | session_id=session_id, 132 | js_code="document.querySelector('form').submit();" 133 | ) 134 | 135 | # Second page: verify login success 136 | result = await crawler.arun( 137 | url="https://example.com/dashboard", 138 | session_id=session_id, 139 | wait_for="css:.user-profile" # Wait for authenticated content 140 | ) 141 | 142 | ``` 143 | 144 | 145 | ## Common Use Cases 146 | 147 | 1. 
**Authentication Flows** 148 | 2. **Pagination Handling** 149 | 3. **Form Submissions** 150 | 4. **Multi-step Processes** 151 | 5. **Dynamic Content Navigation** 152 | 153 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/simple crawling - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Simple Crawling 2 | 3 | This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response. 4 | 5 | ## Basic Usage 6 | 7 | Here's the simplest way to crawl a webpage: 8 | 9 | ```hljs python 10 | import asyncio 11 | from crawl4ai import AsyncWebCrawler 12 | 13 | async def main(): 14 | async with AsyncWebCrawler() as crawler: 15 | result = await crawler.arun(url="https://example.com") 16 | print(result.markdown) # Print clean markdown content 17 | 18 | if __name__ == "__main__": 19 | asyncio.run(main()) 20 | 21 | ``` 22 | 23 | ## Understanding the Response 24 | 25 | The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../../api/crawl-result/) for complete details): 26 | 27 | ```hljs python 28 | result = await crawler.arun(url="https://example.com") 29 | 30 | # Different content formats 31 | print(result.html) # Raw HTML 32 | print(result.cleaned_html) # Cleaned HTML 33 | print(result.markdown) # Markdown version 34 | print(result.fit_markdown) # Most relevant content in markdown 35 | 36 | # Check success status 37 | print(result.success) # True if crawl succeeded 38 | print(result.status_code) # HTTP status code (e.g., 200, 404) 39 | 40 | # Access extracted media and links 41 | print(result.media) # Dictionary of found media (images, videos, audio) 42 | print(result.links) # Dictionary of internal and external links 43 | 44 | ``` 45 | 46 | ## Adding Basic Options 47 | 48 | Customize your crawl with these common options: 49 | 50 | ```hljs python 51 | result = await crawler.arun( 52 | url="https://example.com", 53 | word_count_threshold=10, # Minimum words per content block 54 | exclude_external_links=True, # Remove external links 55 | remove_overlay_elements=True, # Remove popups/modals 56 | process_iframes=True # Process iframe content 57 | ) 58 | 59 | ``` 60 | 61 | ## Handling Errors 62 | 63 | Always check if the crawl was successful: 64 | 65 | ```hljs python 66 | result = await crawler.arun(url="https://example.com") 67 | if not result.success: 68 | print(f"Crawl failed: {result.error_message}") 69 | print(f"Status code: {result.status_code}") 70 | 71 | ``` 72 | 73 | ## Logging and Debugging 74 | 75 | Enable verbose mode for detailed logging: 76 | 77 | ```hljs csharp 78 | async with AsyncWebCrawler(verbose=True) as crawler: 79 | result = await crawler.arun(url="https://example.com") 80 | 81 | ``` 82 | 83 | ## Complete Example 84 | 85 | Here's a more comprehensive example showing common usage patterns: 86 | 87 | ```hljs python 88 | import asyncio 89 | from crawl4ai import AsyncWebCrawler 90 | 91 | async def main(): 92 | async with AsyncWebCrawler(verbose=True) as crawler: 93 | result = await crawler.arun( 94 | url="https://example.com", 95 | # Content filtering 96 | word_count_threshold=10, 97 | excluded_tags=['form', 'header'], 98 | exclude_external_links=True, 99 | 100 | # Content processing 101 | process_iframes=True, 102 | remove_overlay_elements=True, 103 | 104 | # Cache control 105 | bypass_cache=False # Use cache if available 106 
| ) 107 | 108 | if result.success: 109 | # Print clean content 110 | print("Content:", result.markdown[:500]) # First 500 chars 111 | 112 | # Process images 113 | for image in result.media["images"]: 114 | print(f"Found image: {image['src']}") 115 | 116 | # Process links 117 | for link in result.links["internal"]: 118 | print(f"Internal link: {link['href']}") 119 | 120 | else: 121 | print(f"Crawl failed: {result.error_message}") 122 | 123 | if __name__ == "__main__": 124 | asyncio.run(main()) 125 | 126 | ``` 127 | 128 | * * * -------------------------------------------------------------------------------- /docs/crawl4ai/strategies - crawl4ai documentation.md: -------------------------------------------------------------------------------- 1 | # Extraction & Chunking Strategies API 2 | 3 | This documentation covers the API reference for extraction and chunking strategies in Crawl4AI. 4 | 5 | ## Extraction Strategies 6 | 7 | All extraction strategies inherit from the base `ExtractionStrategy` class and implement two key methods: 8 | \- `extract(url: str, html: str) -> List[Dict[str, Any]]` 9 | \- `run(url: str, sections: List[str]) -> List[Dict[str, Any]]` 10 | 11 | ### LLMExtractionStrategy 12 | 13 | Used for extracting structured data using Language Models. 14 | 15 | ```hljs python 16 | LLMExtractionStrategy( 17 | # Required Parameters 18 | provider: str = DEFAULT_PROVIDER, # LLM provider (e.g., "ollama/llama2") 19 | api_token: Optional[str] = None, # API token 20 | 21 | # Extraction Configuration 22 | instruction: str = None, # Custom extraction instruction 23 | schema: Dict = None, # Pydantic model schema for structured data 24 | extraction_type: str = "block", # "block" or "schema" 25 | 26 | # Chunking Parameters 27 | chunk_token_threshold: int = 4000, # Maximum tokens per chunk 28 | overlap_rate: float = 0.1, # Overlap between chunks 29 | word_token_rate: float = 0.75, # Word to token conversion rate 30 | apply_chunking: bool = True, # Enable/disable chunking 31 | 32 | # API Configuration 33 | base_url: str = None, # Base URL for API 34 | extra_args: Dict = {}, # Additional provider arguments 35 | verbose: bool = False # Enable verbose logging 36 | ) 37 | 38 | ``` 39 | 40 | ### CosineStrategy 41 | 42 | Used for content similarity-based extraction and clustering. 43 | 44 | ```hljs python 45 | CosineStrategy( 46 | # Content Filtering 47 | semantic_filter: str = None, # Topic/keyword filter 48 | word_count_threshold: int = 10, # Minimum words per cluster 49 | sim_threshold: float = 0.3, # Similarity threshold 50 | 51 | # Clustering Parameters 52 | max_dist: float = 0.2, # Maximum cluster distance 53 | linkage_method: str = 'ward', # Clustering method 54 | top_k: int = 3, # Top clusters to return 55 | 56 | # Model Configuration 57 | model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model 58 | 59 | verbose: bool = False # Enable verbose logging 60 | ) 61 | 62 | ``` 63 | 64 | ### JsonCssExtractionStrategy 65 | 66 | Used for CSS selector-based structured data extraction. 
67 | 68 | ```hljs python 69 | JsonCssExtractionStrategy( 70 | schema: Dict[str, Any], # Extraction schema 71 | verbose: bool = False # Enable verbose logging 72 | ) 73 | 74 | # Schema Structure 75 | schema = { 76 | "name": str, # Schema name 77 | "baseSelector": str, # Base CSS selector 78 | "fields": [ # List of fields to extract\ 79 | {\ 80 | "name": str, # Field name\ 81 | "selector": str, # CSS selector\ 82 | "type": str, # Field type: "text", "attribute", "html", "regex"\ 83 | "attribute": str, # For type="attribute"\ 84 | "pattern": str, # For type="regex"\ 85 | "transform": str, # Optional: "lowercase", "uppercase", "strip"\ 86 | "default": Any # Default value if extraction fails\ 87 | }\ 88 | ] 89 | } 90 | 91 | ``` 92 | 93 | ## Chunking Strategies 94 | 95 | All chunking strategies inherit from `ChunkingStrategy` and implement the `chunk(text: str) -> list` method. 96 | 97 | ### RegexChunking 98 | 99 | Splits text based on regex patterns. 100 | 101 | ```hljs python 102 | RegexChunking( 103 | patterns: List[str] = None # Regex patterns for splitting 104 | # Default: [r'\n\n'] 105 | ) 106 | 107 | ``` 108 | 109 | ### SlidingWindowChunking 110 | 111 | Creates overlapping chunks with a sliding window approach. 112 | 113 | ```hljs perl 114 | SlidingWindowChunking( 115 | window_size: int = 100, # Window size in words 116 | step: int = 50 # Step size between windows 117 | ) 118 | 119 | ``` 120 | 121 | ### OverlappingWindowChunking 122 | 123 | Creates chunks with specified overlap. 124 | 125 | ```hljs yaml 126 | OverlappingWindowChunking( 127 | window_size: int = 1000, # Chunk size in words 128 | overlap: int = 100 # Overlap size in words 129 | ) 130 | 131 | ``` 132 | 133 | ## Usage Examples 134 | 135 | ### LLM Extraction 136 | 137 | ```hljs python 138 | from pydantic import BaseModel 139 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 140 | 141 | # Define schema 142 | class Article(BaseModel): 143 | title: str 144 | content: str 145 | author: str 146 | 147 | # Create strategy 148 | strategy = LLMExtractionStrategy( 149 | provider="ollama/llama2", 150 | schema=Article.schema(), 151 | instruction="Extract article details" 152 | ) 153 | 154 | # Use with crawler 155 | result = await crawler.arun( 156 | url="https://example.com/article", 157 | extraction_strategy=strategy 158 | ) 159 | 160 | # Access extracted data 161 | data = json.loads(result.extracted_content) 162 | 163 | ``` 164 | 165 | ### CSS Extraction 166 | 167 | ```hljs makefile 168 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy 169 | 170 | # Define schema 171 | schema = { 172 | "name": "Product List", 173 | "baseSelector": ".product-card", 174 | "fields": [\ 175 | {\ 176 | "name": "title",\ 177 | "selector": "h2.title",\ 178 | "type": "text"\ 179 | },\ 180 | {\ 181 | "name": "price",\ 182 | "selector": ".price",\ 183 | "type": "text",\ 184 | "transform": "strip"\ 185 | },\ 186 | {\ 187 | "name": "image",\ 188 | "selector": "img",\ 189 | "type": "attribute",\ 190 | "attribute": "src"\ 191 | }\ 192 | ] 193 | } 194 | 195 | # Create and use strategy 196 | strategy = JsonCssExtractionStrategy(schema) 197 | result = await crawler.arun( 198 | url="https://example.com/products", 199 | extraction_strategy=strategy 200 | ) 201 | 202 | ``` 203 | 204 | ### Content Chunking 205 | 206 | ```hljs makefile 207 | from crawl4ai.chunking_strategy import OverlappingWindowChunking 208 | 209 | # Create chunking strategy 210 | chunker = OverlappingWindowChunking( 211 | window_size=500, # 500 words per chunk 212 | 
overlap=50 # 50 words overlap 213 | ) 214 | 215 | # Use with extraction strategy 216 | strategy = LLMExtractionStrategy( 217 | provider="ollama/llama2", 218 | chunking_strategy=chunker 219 | ) 220 | 221 | result = await crawler.arun( 222 | url="https://example.com/long-article", 223 | extraction_strategy=strategy 224 | ) 225 | 226 | ``` 227 | 228 | ## Best Practices 229 | 230 | 1. **Choose the Right Strategy** 231 | 2. Use `LLMExtractionStrategy` for complex, unstructured content 232 | 3. Use `JsonCssExtractionStrategy` for well-structured HTML 233 | 4. Use `CosineStrategy` for content similarity and clustering 234 | 235 | 5. **Optimize Chunking** 236 | 237 | 238 | 239 | ```hljs makefile 240 | # For long documents 241 | strategy = LLMExtractionStrategy( 242 | chunk_token_threshold=2000, # Smaller chunks 243 | overlap_rate=0.1 # 10% overlap 244 | ) 245 | 246 | ``` 247 | 248 | 6. **Handle Errors** 249 | 250 | 251 | 252 | ```hljs python 253 | try: 254 | result = await crawler.arun( 255 | url="https://example.com", 256 | extraction_strategy=strategy 257 | ) 258 | if result.success: 259 | content = json.loads(result.extracted_content) 260 | except Exception as e: 261 | print(f"Extraction failed: {e}") 262 | 263 | ``` 264 | 265 | 7. **Monitor Performance** 266 | 267 | 268 | 269 | ```hljs graphql 270 | strategy = CosineStrategy( 271 | verbose=True, # Enable logging 272 | word_count_threshold=20, # Filter short content 273 | top_k=5 # Limit results 274 | ) 275 | 276 | ``` 277 | 278 | 279 | * * * -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import shutil 4 | import re 5 | from fastapi import FastAPI, HTTPException 6 | from fastapi.staticfiles import StaticFiles 7 | from fastapi.responses import FileResponse 8 | from pydantic import BaseModel 9 | from typing import Optional, Dict 10 | from crawl4ai import AsyncWebCrawler 11 | import uuid 12 | import aiofiles 13 | from urllib.parse import urlparse, unquote 14 | 15 | app = FastAPI() 16 | 17 | # Serve static files 18 | app.mount("/static", StaticFiles(directory="static"), name="static") 19 | 20 | # Store crawl jobs status 21 | crawl_jobs: Dict[str, dict] = {} 22 | 23 | class CrawlRequest(BaseModel): 24 | url: str 25 | limit: int = 10 26 | 27 | class CrawlResponse(BaseModel): 28 | job_id: str 29 | status: str 30 | progress: int = 0 31 | total_pages: int = 0 32 | current_url: Optional[str] = None 33 | 34 | def clean_path(url: str, base_url: str) -> str: 35 | """Extract and clean the path from URL relative to base URL""" 36 | # URL decode both URLs to handle any encoded characters 37 | url = unquote(url) 38 | base_url = unquote(base_url) 39 | 40 | # Remove base URL to get the relative path 41 | path = url.replace(base_url, '') 42 | 43 | # If path starts with /, remove it 44 | path = path.lstrip('/') 45 | 46 | # Handle fragment identifiers (#) 47 | if '#' in path: 48 | path = path.split('#')[1] # Take the fragment part 49 | else: 50 | # Remove query parameters if no fragment 51 | path = path.split('?')[0] 52 | 53 | # If path is empty after cleaning, return empty string 54 | if not path: 55 | return '' 56 | 57 | # Clean special characters and convert spaces 58 | clean = re.sub(r'[^\w\s-]', '', path) 59 | clean = re.sub(r'\s+', '_', clean.strip()) 60 | return clean.lower() 61 | 62 | async def process_url(url: str, output_dir: str, crawler: AsyncWebCrawler, job_id: str): 63 | """Process a single 
URL and save markdown""" 64 | try: 65 | result = await crawler.arun( 66 | url=url, 67 | remove_overlay_elements=True, 68 | bypass_cache=True 69 | ) 70 | 71 | if result.success: 72 | # Get title from metadata 73 | metadata = result.metadata 74 | title = metadata['title'] 75 | # Clean title for filename 76 | clean_title = re.sub(r'[^\w\s-]', '', title) 77 | clean_title = re.sub(r'\s+', '_', clean_title.strip()) 78 | 79 | # Get and clean URL path 80 | path_suffix = clean_path(url, crawl_jobs[job_id]["base_url"]) 81 | 82 | # Combine title and path for unique filename 83 | filename = f"{clean_title.lower()}" 84 | if path_suffix: 85 | filename += f"_{path_suffix}" 86 | filename += ".md" 87 | 88 | # Save markdown 89 | filepath = os.path.join(output_dir, filename) 90 | async with aiofiles.open(filepath, 'w') as f: 91 | await f.write(result.markdown) 92 | 93 | # Return internal links 94 | return result.links.get("internal", []) 95 | except Exception as e: 96 | print(f"Error processing {url}: {str(e)}") 97 | return [] 98 | 99 | async def crawl_website(job_id: str, url: str, limit: int): 100 | """Recursively crawl website and update job status""" 101 | try: 102 | # Create output directory 103 | output_dir = f"output/output_{job_id}" 104 | os.makedirs(output_dir, exist_ok=True) 105 | 106 | # Store the base URL for this job 107 | crawl_jobs[job_id]["base_url"] = url 108 | 109 | # Initialize crawler 110 | async with AsyncWebCrawler(verbose=True) as crawler: 111 | processed_urls = set() 112 | urls_to_process = {url} 113 | 114 | while urls_to_process and len(processed_urls) < limit: 115 | current_url = urls_to_process.pop() 116 | 117 | if current_url in processed_urls: 118 | continue 119 | 120 | # Update job status 121 | crawl_jobs[job_id].update({ 122 | "status": "processing", 123 | "progress": len(processed_urls), 124 | "current_url": current_url 125 | }) 126 | 127 | # Process URL and get internal links 128 | internal_links = await process_url(current_url, output_dir, crawler, job_id) 129 | processed_urls.add(current_url) 130 | 131 | # Add new internal links that contain the base URL 132 | for link in internal_links: 133 | if isinstance(link, dict): 134 | link_url = link.get("href", "") 135 | else: 136 | link_url = link 137 | 138 | if link_url and link_url.startswith(url) and link_url not in processed_urls: 139 | urls_to_process.add(link_url) 140 | 141 | # Create zip file 142 | shutil.make_archive(output_dir, 'zip', output_dir) 143 | 144 | # Update final status 145 | crawl_jobs[job_id].update({ 146 | "status": "completed", 147 | "progress": len(processed_urls), 148 | "total_pages": len(processed_urls) 149 | }) 150 | 151 | # Cleanup output directory 152 | shutil.rmtree(output_dir) 153 | 154 | except Exception as e: 155 | crawl_jobs[job_id]["status"] = "failed" 156 | print(f"Crawl failed: {str(e)}") 157 | 158 | @app.post("/api/crawl", response_model=CrawlResponse) 159 | async def start_crawl(request: CrawlRequest): 160 | job_id = str(uuid.uuid4()) 161 | crawl_jobs[job_id] = { 162 | "status": "starting", 163 | "progress": 0, 164 | "total_pages": 0, 165 | "base_url": request.url # Store the base URL 166 | } 167 | 168 | # Start crawl in background 169 | asyncio.create_task(crawl_website(job_id, request.url, request.limit)) 170 | 171 | return CrawlResponse( 172 | job_id=job_id, 173 | status="starting", 174 | progress=0 175 | ) 176 | 177 | @app.get("/api/status/{job_id}", response_model=CrawlResponse) 178 | async def get_status(job_id: str): 179 | if job_id not in crawl_jobs: 180 | raise 
HTTPException(status_code=404, detail="Job not found") 181 | 182 | job = crawl_jobs[job_id] 183 | return CrawlResponse( 184 | job_id=job_id, 185 | status=job["status"], 186 | progress=job["progress"], 187 | total_pages=job["total_pages"], 188 | current_url=job.get("current_url") 189 | ) 190 | 191 | @app.get("/api/download/{job_id}") 192 | async def download_results(job_id: str): 193 | if job_id not in crawl_jobs: 194 | raise HTTPException(status_code=404, detail="Job not found") 195 | 196 | job = crawl_jobs[job_id] 197 | if job["status"] != "completed": 198 | raise HTTPException(status_code=400, detail="Job not completed") 199 | 200 | zip_path = f"output/output_{job_id}.zip" 201 | if not os.path.exists(zip_path): 202 | raise HTTPException(status_code=404, detail="Results not found") 203 | 204 | return FileResponse( 205 | zip_path, 206 | media_type="application/zip", 207 | filename="crawl_results.zip" 208 | ) 209 | 210 | # Serve index.html 211 | @app.get("/") 212 | async def read_root(): 213 | return FileResponse("static/index.html") 214 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "crawl4ai-frontend" 3 | version = "0.1.0" 4 | description = "A frontend for crawl4ai" 5 | authors = ["f4ww4z"] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | Crawl4AI = "^0.3.746" 11 | fastapi = {extras = ["standard"], version = "^0.115.5"} 12 | aiofiles = "^24.1.0" 13 | 14 | [tool.poetry.dev-dependencies] 15 | autopep8 = "^2.3.1" 16 | djlint = "^1.36.3" 17 | 18 | [tool.djlint] 19 | convert_errors_to_warnings = true 20 | indent = 2 21 | 22 | [tool.djlint.js] 23 | warn_on_js_errors = true 24 | 25 | [build-system] 26 | requires = ["poetry-core>=1.0.0"] 27 | build-backend = "poetry.core.masonry.api" 28 | --------------------------------------------------------------------------------