├── .gitignore
├── .vscode
│   └── launch.json
├── Dockerfile
├── LICENSE
├── README.md
├── docker-compose.yml
├── docs
│   └── crawl4ai
│       ├── 1 getting started - crawl4ai documentation.md
│       ├── 10 user simulation - crawl4ai documentation.md
│       ├── 111 json css - crawl4ai documentation.md
│       ├── 112 llm strategy - crawl4ai documentation.md
│       ├── 113 cosine strategy - crawl4ai documentation.md
│       ├── 12 session crawling - crawl4ai documentation.md
│       ├── 13 text chunking - crawl4ai documentation.md
│       ├── 14 custom workflows - crawl4ai documentation.md
│       ├── 2 advanced features - crawl4ai documentation.md
│       ├── 3 browser setup - crawl4ai documentation.md
│       ├── 4 proxy settings - crawl4ai documentation.md
│       ├── 5 dynamic content - crawl4ai documentation.md
│       ├── 6 magic mode - crawl4ai documentation.md
│       ├── 7 content cleaning - crawl4ai documentation.md
│       ├── 8 media handling - crawl4ai documentation.md
│       ├── 9 link analysis - crawl4ai documentation.md
│       ├── asyncwebcrawler - crawl4ai documentation.md
│       ├── asyncwebcrawlerarun - crawl4ai documentation.md
│       ├── browser configuration - crawl4ai documentation.md
│       ├── chunking - crawl4ai documentation.md
│       ├── content processing - crawl4ai documentation.md
│       ├── content selection - crawl4ai documentation.md
│       ├── cosine strategy - crawl4ai documentation.md
│       ├── crawlresult - crawl4ai documentation.md
│       ├── docker deployment - crawl4ai documentation.md
│       ├── home - crawl4ai documentation.md
│       ├── hooks auth - crawl4ai documentation.md
│       ├── installation - crawl4ai documentation.md
│       ├── json-css extractor advanced - crawl4ai documentation.md
│       ├── json-css extractor basic - crawl4ai documentation.md
│       ├── llm strategy - crawl4ai documentation.md
│       ├── magic mode - crawl4ai documentation.md
│       ├── output formats - crawl4ai documentation.md
│       ├── overview - crawl4ai documentation.md
│       ├── page interaction - crawl4ai documentation.md
│       ├── parameters table - crawl4ai documentation.md
│       ├── proxy security - crawl4ai documentation.md
│       ├── quick start - crawl4ai documentation.md
│       ├── session management - crawl4ai documentation.md
│       ├── session management advanced - crawl4ai documentation.md
│       ├── simple crawling - crawl4ai documentation.md
│       └── strategies - crawl4ai documentation.md
├── main.py
├── poetry.lock
├── pyproject.toml
└── static
    └── index.html
/.gitignore:
--------------------------------------------------------------------------------
1 | output*/
2 | output_*/
3 | __pycache__
4 | .env
5 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python Debugger: FastAPI",
9 | "type": "debugpy",
10 | "request": "launch",
11 | "module": "uvicorn",
12 | "args": [
13 | "main:app",
14 | "--reload"
15 | ],
16 | "jinja": true
17 | }
18 | ]
19 | }
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use Python 3.10 as base image
2 | FROM python:3.10-slim
3 |
4 | # Install system dependencies
5 | RUN apt-get update && apt-get install -y default-libmysqlclient-dev build-essential pkg-config
6 |
7 | # Install Poetry
8 | RUN pip install poetry
9 |
10 | # Set working directory
11 | WORKDIR /app
12 |
13 | # Copy Poetry configuration files
14 | COPY pyproject.toml poetry.lock ./
15 |
16 | # Configure Poetry to not create virtual environment inside container
17 | RUN poetry config virtualenvs.create false
18 |
19 | # Install dependencies
20 | RUN poetry install --only main --no-interaction --no-ansi
21 |
22 | RUN playwright install
23 |
24 | RUN playwright install-deps
25 |
26 | # Copy application files
27 | COPY main.py .
28 | COPY static/ static/
29 |
30 | # Create directory for temporary files
31 | RUN mkdir -p /app/output
32 |
33 | # Expose port 8000
34 | EXPOSE 8000
35 |
36 | # Set the entry command
37 | CMD ["poetry", "run", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 f4ww4z
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # crawl4ai-frontend
2 |
3 | A FastAPI-based frontend for crawl4ai that provides a web interface and REST API for crawling websites and converting them to markdown format.
4 |
5 | ## Features
6 |
7 | - Web interface for easy interaction
8 | - REST API for programmatic access
9 | - Recursive website crawling with configurable depth
10 | - Automatic conversion of web pages to markdown format
11 | - Background job processing with status tracking
12 | - Results downloadable as ZIP archives
13 | - Docker support for easy deployment
14 |
15 | ## Installation
16 |
17 | ### Local Installation
18 |
19 | 1. Ensure you have Python 3.10+ and Poetry installed
20 | 2. Clone the repository
21 | 3. Install dependencies:
22 | ```bash
23 | poetry install
24 | ```
25 |
26 | ### Docker Installation
27 |
28 | 1. Ensure you have Docker installed
29 | 2. Build the image:
30 | ```bash
31 | docker build -t crawl4ai-frontend .
32 | ```
33 |
34 | Or use docker-compose:
35 | ```bash
36 | docker-compose up -d
37 | ```
38 |
39 | ## Usage
40 |
41 | ### Running Locally
42 |
43 | ```bash
44 | poetry run uvicorn main:app --host 0.0.0.0 --port 8000
45 | ```
46 |
47 | Then open http://localhost:8000 in your browser.
48 |
49 | ### Running with Docker
50 |
51 | ```bash
52 | docker run -p 8000:8000 crawl4ai-frontend
53 | ```
54 |
55 | Then open http://localhost:8000 in your browser.
56 |
57 | ## API Documentation
58 |
59 | ### Start a Crawl Job
60 |
61 | ```http
62 | POST /api/crawl
63 | ```
64 |
65 | Request body:
66 | ```json
67 | {
68 | "url": "https://example.com",
69 | "limit": 10
70 | }
71 | ```
72 |
73 | Response:
74 | ```json
75 | {
76 | "job_id": "uuid",
77 | "status": "starting",
78 | "progress": 0
79 | }
80 | ```
81 |
82 | ### Check Job Status
83 |
84 | ```http
85 | GET /api/status/{job_id}
86 | ```
87 |
88 | Response:
89 | ```json
90 | {
91 | "job_id": "uuid",
92 | "status": "processing|completed|failed",
93 | "progress": 5,
94 | "total_pages": 10,
95 | "current_url": "https://example.com/page"
96 | }
97 | ```
98 |
99 | ### Download Results
100 |
101 | ```http
102 | GET /api/download/{job_id}
103 | ```
104 |
105 | Returns a ZIP file containing the crawled pages in markdown format.
106 |
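### Example: Calling the API from a Script

A minimal client sketch for the workflow above: start a job, poll its status, then download the archive. It assumes the server is running on `localhost:8000` and that the `requests` package is installed; adjust as needed.

```python
import time
import requests

BASE = "http://localhost:8000"

# Start a crawl job
job = requests.post(f"{BASE}/api/crawl", json={"url": "https://example.com", "limit": 10}).json()
job_id = job["job_id"]

# Poll until the job finishes
while True:
    status = requests.get(f"{BASE}/api/status/{job_id}").json()
    print(status["status"], status.get("progress"))
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# Download the ZIP archive of markdown files
if status["status"] == "completed":
    resp = requests.get(f"{BASE}/api/download/{job_id}")
    with open("results.zip", "wb") as f:
        f.write(resp.content)
```
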
107 | ## Dependencies
108 |
109 | - Python 3.10+
110 | - FastAPI
111 | - Crawl4AI
112 | - aiofiles
113 | - Poetry (for dependency management)
114 |
115 | ## Development
116 |
117 | For development, additional dependencies can be installed:
118 | ```bash
119 | poetry install --with dev
120 | ```
121 |
122 | Development dependencies include:
123 | - autopep8 (code formatting)
124 | - djlint (HTML template linting)
125 |
126 | ## License
127 |
128 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
129 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | web:
3 | build: .
4 | ports:
5 | - "${PORT}:8000"
6 | volumes:
7 | - ./output:/app/output
8 | restart: unless-stopped
9 |
--------------------------------------------------------------------------------
/docs/crawl4ai/1 getting started - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 1: Introduction to Crawl4AI and Basic Installation
4 |
5 | ### Quick Intro
6 |
7 | Walk through installation from PyPI, setup, and verification. Show how to install with options like `torch` or `transformer` for advanced capabilities.
8 |
9 | Here's a condensed outline of the **Installation and Setup** video content:
10 |
11 | * * *
12 |
13 | 1) **Introduction to Crawl4AI**: Briefly explain that Crawl4AI is a powerful tool for web scraping, data extraction, and content processing, with customizable options for various needs.
14 |
15 | 2) **Installation Overview**:
16 |
17 | - **Basic Install**: Run `pip install crawl4ai` and `playwright install` (to set up browser dependencies).
18 |
19 | - **Optional Advanced Installs**:
20 | - `pip install crawl4ai[torch]` \- Adds PyTorch for clustering.
21 | - `pip install crawl4ai[transformer]` \- Adds support for LLM-based extraction.
22 | - `pip install crawl4ai[all]` \- Installs all features for complete functionality.
23 |
24 | 3) **Verifying the Installation**:
25 |
26 | - Walk through a simple test script to confirm the setup:
27 |
28 |
29 |
30 | ```hljs python
31 | import asyncio
32 | from crawl4ai import AsyncWebCrawler
33 |
34 | async def main():
35 | async with AsyncWebCrawler(verbose=True) as crawler:
36 | result = await crawler.arun(url="https://www.example.com")
37 | print(result.markdown[:500]) # Show first 500 characters
38 |
39 | asyncio.run(main())
40 |
41 | ```
42 |
43 | - Explain that this script initializes the crawler and runs it on a test URL, displaying part of the extracted content to verify functionality.
44 |
45 | 4) **Important Tips**:
46 |
47 | - **Run** `playwright install` **after installation** to set up dependencies.
48 | - **For full performance** on text-related tasks, run `crawl4ai-download-models` after installing with `[torch]`, `[transformer]`, or `[all]` options.
49 | - If you encounter issues, refer to the documentation or GitHub issues.
50 |
51 | 5) **Wrap Up**:
52 |
53 | - Introduce the next topic in the series, which will cover Crawl4AI's browser configuration options (like choosing between `chromium`, `firefox`, and `webkit`).
54 |
55 | * * *
56 |
57 | This structure provides a concise, effective guide to get viewers up and running with Crawl4AI in minutes.
58 |
59 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/10 user simulation - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 10: Custom Headers, Identity, and User Simulation
4 |
5 | ### Quick Intro
6 |
7 | Teach how to use custom headers, user-agent strings, and simulate real user interactions. Demo: Set custom user-agent and headers to access a site that blocks typical crawlers.
8 |
9 | Here’s a concise outline for the **Custom Headers, Identity Management, and User Simulation** video:
10 |
11 | * * *
12 |
13 | ### **Custom Headers, Identity Management, & User Simulation**
14 |
15 | 1) **Why Customize Headers and Identity in Crawling**:
16 |
17 | - Websites often track request headers and browser properties to detect bots. Customizing headers and managing identity help make requests appear more human, improving access to restricted sites.
18 |
19 | 2) **Setting Custom Headers**:
20 |
21 | - Customize HTTP headers to mimic genuine browser requests or meet site-specific requirements:
22 |
23 |
24 |
25 | ```hljs makefile
26 | headers = {
27 | "Accept-Language": "en-US,en;q=0.9",
28 | "X-Requested-With": "XMLHttpRequest",
29 | "Cache-Control": "no-cache"
30 | }
31 | crawler = AsyncWebCrawler(headers=headers)
32 |
33 | ```
34 |
35 | - **Use Case**: Customize the `Accept-Language` header to simulate local user settings, or `Cache-Control` to bypass cache for fresh content.
36 |
37 | 3) **Setting a Custom User Agent**:
38 |
39 | - Some websites block requests from common crawler user agents. Setting a custom user agent string helps bypass these restrictions:
40 |
41 |
42 |
43 | ```hljs makefile
44 | crawler = AsyncWebCrawler(
45 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
46 | )
47 |
48 | ```
49 |
50 | - **Tip**: Use user-agent strings from popular browsers (e.g., Chrome, Firefox) to improve access and reduce detection risks.
51 |
52 | 4) **User Simulation for Human-like Behavior**:
53 |
54 | - Enable `simulate_user=True` to mimic natural user interactions, such as random timing and simulated mouse movements:
55 |
56 |
57 |
58 | ```hljs python
59 | result = await crawler.arun(
60 | url="https://example.com",
61 | simulate_user=True # Simulates human-like behavior
62 | )
63 |
64 | ```
65 |
66 | - **Behavioral Effects**: Adds subtle variations in interactions, making the crawler harder to detect on bot-protected sites.
67 |
68 | 5) **Navigator Overrides and Magic Mode for Full Identity Masking**:
69 |
70 | - Use `override_navigator=True` to mask automation indicators like `navigator.webdriver`, which websites check to detect bots:
71 |
72 |
73 |
74 | ```hljs python
75 | result = await crawler.arun(
76 | url="https://example.com",
77 | override_navigator=True # Masks bot-related signals
78 | )
79 |
80 | ```
81 |
82 | - **Combining with Magic Mode**: For a complete anti-bot setup, combine these identity options with `magic=True` for maximum protection:
83 |
84 |
85 |
86 | ```hljs python
87 | async with AsyncWebCrawler() as crawler:
88 | result = await crawler.arun(
89 | url="https://example.com",
90 | magic=True, # Enables all anti-bot detection features
91 | user_agent="Custom-Agent", # Custom agent with Magic Mode
92 | )
93 |
94 | ```
95 |
96 | - This setup includes all anti-detection techniques like navigator masking, random timing, and user simulation.
97 |
98 | 6) **Example: Comprehensive Setup for Identity Management**:
99 |
100 | - A full example combining custom headers, user-agent, and user simulation for a realistic browsing profile:
101 |
102 |
103 |
104 | ```hljs python
105 | async with AsyncWebCrawler(
106 | headers={"Accept-Language": "en-US", "Cache-Control": "no-cache"},
107 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0",
108 | simulate_user=True
109 | ) as crawler:
110 | result = await crawler.arun(url="https://example.com/secure-page")
111 | print(result.markdown[:500]) # Display extracted content
112 |
113 | ```
114 |
115 | - This example enables detailed customization for evading detection and accessing protected pages smoothly.
116 |
117 | 7) **Wrap Up & Next Steps**:
118 |
119 | - Recap the value of headers, user-agent customization, and simulation in bypassing bot detection.
120 | - Tease the next video: **Extraction Strategies: JSON CSS, LLM, and Cosine** to dive into structured data extraction methods for high-quality content retrieval.
121 |
122 | * * *
123 |
124 | This outline equips users with tools for managing crawler identity and human-like behavior, essential for accessing bot-protected or restricted websites.
125 |
126 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/111 json css - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, covering all key aspects and supported structures in Crawl4AI:
2 |
3 | * * *
4 |
5 | ### **10.1 JSON-CSS Extraction Strategy**
6 |
7 | #### **1\. Introduction to JSON-CSS Extraction**
8 |
9 | - JSON-CSS Extraction is used for pulling structured data from pages with repeated patterns, like product listings, article feeds, or directories.
10 | - This strategy allows defining a schema with CSS selectors and data fields, making it easy to capture nested, list-based, or singular elements.
11 |
12 | #### **2\. Basic Schema Structure**
13 |
14 | - **Schema Fields**: The schema has two main components:
15 | - `baseSelector`: A CSS selector to locate the main elements you want to extract (e.g., each article or product block).
16 | - `fields`: Defines the data fields for each element, supporting various data types and structures.
17 |
18 | #### **3\. Simple Field Extraction**
19 |
20 | - **Example HTML**:
21 |
22 |
23 |
24 | ```hljs html
25 | <div class="product">
26 |   <h2 class="title">Sample Product</h2>
27 |   <span class="price">$19.99</span>
28 |   <p class="description">This is a sample product.</p>
29 | </div>
30 |
31 | ```
32 |
33 | - **Schema**:
34 |
35 |
36 |
37 | ```hljs graphql
38 | schema = {
39 | "baseSelector": ".product",
40 | "fields": [\
41 | {"name": "title", "selector": ".title", "type": "text"},\
42 | {"name": "price", "selector": ".price", "type": "text"},\
43 | {"name": "description", "selector": ".description", "type": "text"}\
44 | ]
45 | }
46 |
47 | ```
48 |
49 | - **Explanation**: Each field captures text content from specified CSS selectors within each `.product` element.
50 |
51 | #### **4\. Supported Field Types: Text, Attribute, HTML, Regex**
52 |
53 | - **Field Type Options**:
54 | - `text`: Extracts visible text.
55 | - `attribute`: Captures an HTML attribute (e.g., `src`, `href`).
56 | - `html`: Extracts the raw HTML of an element.
57 | - `regex`: Allows regex patterns to extract part of the text.
58 | - **Example HTML** (including an image):
59 |
60 |
61 |
62 |
63 | ```hljs html
64 | <div class="product">
65 |   <h2 class="title">Sample Product</h2>
66 |   <img class="product-image" src="...">
67 |   <span class="price">$19.99</span>
68 |   <p class="description">Limited time offer.</p>
69 | </div>
70 |
71 | ```
72 |
73 | - **Schema**:
74 |
75 |
76 |
77 | ```hljs python
78 | schema = {
79 | "baseSelector": ".product",
80 | "fields": [\
81 | {"name": "title", "selector": ".title", "type": "text"},\
82 | {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"},\
83 | {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"},\
84 | {"name": "description_html", "selector": ".description", "type": "html"}\
85 | ]
86 | }
87 |
88 | ```
89 |
90 | - **Explanation**:
91 | - `attribute`: Extracts the `src` attribute from `.product-image`.
92 | - `regex`: Extracts the numeric part from `$19.99`.
93 | - `html`: Retrieves the full HTML of the description element.
94 |
95 | #### **5\. Nested Field Extraction**
96 |
97 | - **Use Case**: Useful when content contains sub-elements, such as an article with author details within it.
98 | - **Example HTML**:
99 |
100 |
101 |
102 | ```hljs html
103 | <div class="article">
104 |   <h1 class="title">Sample Article</h1>
105 |   <div class="author">
106 |     <span class="name">John Doe</span>
107 |     <p class="bio">Writer and editor</p>
108 |   </div>
109 | </div>
110 |
111 | ```
112 |
113 | - **Schema**:
114 |
115 |
116 |
117 | ```hljs graphql
118 | schema = {
119 | "baseSelector": ".article",
120 | "fields": [\
121 | {"name": "title", "selector": ".title", "type": "text"},\
122 | {"name": "author", "type": "nested", "selector": ".author", "fields": [\
123 | {"name": "name", "selector": ".name", "type": "text"},\
124 | {"name": "bio", "selector": ".bio", "type": "text"}\
125 | ]}\
126 | ]
127 | }
128 |
129 | ```
130 |
131 | - **Explanation**:
132 | - `nested`: Extracts `name` and `bio` within `.author`, grouping the author details in a single `author` object.
133 |
134 | #### **6\. List and Nested List Extraction**
135 |
136 | - **List**: Extracts multiple elements matching the selector as a list.
137 | - **Nested List**: Allows lists within lists, useful for items with sub-lists (e.g., specifications for each product).
138 | - **Example HTML**:
139 |
140 |
141 |
142 | ```hljs html
143 | <div class="product">
144 |   <h2 class="title">Product with Features</h2>
145 |   <ul class="features">
146 |     <li class="feature">Feature 1</li>
147 |     <li class="feature">Feature 2</li>
148 |     <li class="feature">Feature 3</li>
149 |   </ul>
150 | </div>
151 |
152 | ```
153 |
154 | - **Schema**:
155 |
156 |
157 |
158 | ```hljs graphql
159 | schema = {
160 | "baseSelector": ".product",
161 | "fields": [\
162 | {"name": "title", "selector": ".title", "type": "text"},\
163 | {"name": "features", "type": "list", "selector": ".features .feature", "fields": [\
164 | {"name": "feature", "type": "text"}\
165 | ]}\
166 | ]
167 | }
168 |
169 | ```
170 |
171 | - **Explanation**:
172 | - `list`: Captures each `.feature` item within `.features`, outputting an array of features under the `features` field.
173 |
174 | #### **7\. Transformations for Field Values**
175 |
176 | - Transformations allow you to modify extracted values (e.g., converting to lowercase).
177 | - Supported transformations: `lowercase`, `uppercase`, `strip`.
178 | - **Example HTML**:
179 |
180 |
181 |
182 | ```hljs html
183 | <div class="product">
184 |   <h2 class="title">Special Product</h2>
185 | </div>
186 |
187 | ```
188 |
189 | - **Schema**:
190 |
191 |
192 |
193 | ```hljs graphql
194 | schema = {
195 | "baseSelector": ".product",
196 | "fields": [\
197 | {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"}\
198 | ]
199 | }
200 |
201 | ```
202 |
203 | - **Explanation**: The `transform` property changes the `title` to uppercase, useful for standardized outputs.
204 |
205 | #### **8\. Full JSON-CSS Extraction Example**
206 |
207 | - Combining all elements in a single schema example for a comprehensive crawl:
208 | - **Example HTML**:
209 |
210 |
211 |
212 | ```hljs html
213 | <div class="product">
214 |   <h2 class="title">Featured Product</h2>
215 |   <img class="product-image" src="...">
216 |   <span class="price">$99.99</span>
217 |   <p class="description">Best product of the year.</p>
218 |   <ul class="features">
219 |     <li class="feature">Durable</li>
220 |     <li class="feature">Eco-friendly</li>
221 |   </ul>
222 | </div>
223 |
224 | ```
225 |
226 | - **Schema**:
227 |
228 |
229 |
230 | ```hljs python
231 | schema = {
232 | "baseSelector": ".product",
233 | "fields": [\
234 | {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"},\
235 | {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"},\
236 | {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"},\
237 | {"name": "description", "selector": ".description", "type": "html"},\
238 | {"name": "features", "type": "list", "selector": ".features .feature", "fields": [\
239 | {"name": "feature", "type": "text"}\
240 | ]}\
241 | ]
242 | }
243 |
244 | ```
245 |
246 | - **Explanation**: This schema captures and transforms each aspect of the product, illustrating the JSON-CSS strategy’s versatility for structured extraction.
247 |
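- **Using the Schema with the Crawler**: A brief sketch of how a schema like the ones above is typically wired into the crawler; the `JsonCssExtractionStrategy` import path and the JSON string returned in `result.extracted_content` should be checked against the strategies reference.

```hljs python
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "baseSelector": ".product",
    "fields": [
        {"name": "title", "selector": ".title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"}
    ]
}

async def extract_products():
    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/products",
            extraction_strategy=strategy,
            bypass_cache=True
        )
        # extracted_content holds the structured results as a JSON string
        print(json.loads(result.extracted_content))

asyncio.run(extract_products())
```
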
248 | #### **9\. Wrap Up & Next Steps**
249 |
250 | - Summarize JSON-CSS Extraction’s flexibility for structured, pattern-based extraction.
251 | - Tease the next video: **10.2 LLM Extraction Strategy**, focusing on using language models to extract data based on intelligent content analysis.
252 |
253 | * * *
254 |
255 | This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users.
256 |
257 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/112 llm strategy - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine
4 |
5 | ### Quick Intro
6 |
7 | Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site.
8 |
9 | Here’s a comprehensive outline for the **LLM Extraction Strategy** video, covering key details and example applications.
10 |
11 | * * *
12 |
13 | ### **10.2 LLM Extraction Strategy**
14 |
15 | #### **1\. Introduction to LLM Extraction Strategy**
16 |
17 | - The LLM Extraction Strategy leverages language models to interpret and extract structured data from complex web content.
18 | - Unlike traditional CSS selectors, this strategy uses natural language instructions and schemas to guide the extraction, ideal for unstructured or diverse content.
19 | - Supports **OpenAI**, **Azure OpenAI**, **HuggingFace**, and **Ollama** models, enabling flexibility with both proprietary and open-source providers.
20 |
21 | #### **2\. Key Components of LLM Extraction Strategy**
22 |
23 | - **Provider**: Specifies the LLM provider (e.g., OpenAI, HuggingFace, Azure).
24 | - **API Token**: Required for most providers, except Ollama (local LLM model).
25 | - **Instruction**: Custom extraction instructions sent to the model, providing flexibility in how the data is structured and extracted.
26 | - **Schema**: Optional, defines structured fields to organize extracted data into JSON format.
27 | - **Extraction Type**: Supports `"block"` for simpler text blocks or `"schema"` when a structured output format is required.
28 | - **Chunking Parameters**: Breaks down large documents, with options to adjust chunk size and overlap rate for more accurate extraction across lengthy texts.
29 |
30 | #### **3\. Basic Extraction Example: OpenAI Model Pricing**
31 |
32 | - **Goal**: Extract model names and their input and output fees from the OpenAI pricing page.
33 | - **Schema Definition**:
34 | - **Model Name**: Text for model identification.
35 | - **Input Fee**: Token cost for input processing.
36 | - **Output Fee**: Token cost for output generation.
37 | - **Schema**:
38 |
39 |
40 |
41 |
42 | ```hljs scss
43 | class OpenAIModelFee(BaseModel):
44 | model_name: str = Field(..., description="Name of the OpenAI model.")
45 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
46 | output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
47 |
48 | ```
49 |
50 | - **Example Code**:
51 |
52 |
53 |
54 |
55 | ```hljs csharp
56 | async def extract_openai_pricing():
57 | async with AsyncWebCrawler() as crawler:
58 | result = await crawler.arun(
59 | url="https://openai.com/api/pricing/",
60 | extraction_strategy=LLMExtractionStrategy(
61 | provider="openai/gpt-4o",
62 | api_token=os.getenv("OPENAI_API_KEY"),
63 | schema=OpenAIModelFee.schema(),
64 | extraction_type="schema",
65 | instruction="Extract model names and fees for input and output tokens from the page."
66 | ),
67 | bypass_cache=True
68 | )
69 | print(result.extracted_content)
70 |
71 | ```
72 |
73 | - **Explanation**:
74 | - The extraction strategy combines a schema and detailed instruction to guide the LLM in capturing structured data.
75 | - Each model’s name, input fee, and output fee are extracted in a JSON format.
76 |
77 | #### **4\. Knowledge Graph Extraction Example**
78 |
79 | - **Goal**: Extract entities and their relationships from a document for use in a knowledge graph.
80 | - **Schema Definition**:
81 | - **Entities**: Individual items with descriptions (e.g., people, organizations).
82 | - **Relationships**: Connections between entities, including descriptions and relationship types.
83 | - **Schema**:
84 |
85 |
86 |
87 |
88 | ```hljs yaml
89 | class Entity(BaseModel):
90 | name: str
91 | description: str
92 |
93 | class Relationship(BaseModel):
94 | entity1: Entity
95 | entity2: Entity
96 | description: str
97 | relation_type: str
98 |
99 | class KnowledgeGraph(BaseModel):
100 | entities: List[Entity]
101 | relationships: List[Relationship]
102 |
103 | ```
104 |
105 | - **Example Code**:
106 |
107 |
108 |
109 |
110 | ```hljs csharp
111 | async def extract_knowledge_graph():
112 | extraction_strategy = LLMExtractionStrategy(
113 | provider="azure/gpt-4o-mini",
114 | api_token=os.getenv("AZURE_API_KEY"),
115 | schema=KnowledgeGraph.schema(),
116 | extraction_type="schema",
117 | instruction="Extract entities and relationships from the content to build a knowledge graph."
118 | )
119 | async with AsyncWebCrawler() as crawler:
120 | result = await crawler.arun(
121 | url="https://example.com/some-article",
122 | extraction_strategy=extraction_strategy,
123 | bypass_cache=True
124 | )
125 | print(result.extracted_content)
126 |
127 | ```
128 |
129 | - **Explanation**:
130 | - In this setup, the LLM extracts entities and their relationships based on the schema and instruction.
131 | - The schema organizes results into a JSON-based knowledge graph format.
132 |
133 | #### **5\. Key Settings in LLM Extraction**
134 |
135 | - **Chunking Options**:
136 | - For long pages, set `chunk_token_threshold` to specify maximum token count per section.
137 | - Adjust `overlap_rate` to control the overlap between chunks, useful for contextual consistency.
138 | - **Example**:
139 |
140 |
141 |
142 | ```hljs lua
143 | extraction_strategy = LLMExtractionStrategy(
144 | provider="openai/gpt-4",
145 | api_token=os.getenv("OPENAI_API_KEY"),
146 | chunk_token_threshold=3000,
147 | overlap_rate=0.2, # 20% overlap between chunks
148 | instruction="Extract key insights and relationships."
149 | )
150 |
151 | ```
152 |
153 | - This setup ensures that longer texts are divided into manageable chunks with slight overlap, enhancing the quality of extraction.
154 |
155 | #### **6\. Flexible Provider Options for LLM Extraction**
156 |
157 | - **Using Proprietary Models**: OpenAI, Azure, and HuggingFace provide robust language models, often suited for complex or detailed extractions.
158 | - **Using Open-Source Models**: Ollama and other open-source models can be deployed locally, suitable for offline or cost-effective extraction.
159 | - **Example Call**:
160 |
161 |
162 |
163 | ```hljs lua
164 | await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
165 | await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
166 | await extract_structured_data_using_llm("ollama/llama3.2")
167 |
168 | ```
169 |
170 |
171 | #### **7\. Complete Example of LLM Extraction Setup**
172 |
173 | - Code to run both the OpenAI pricing and Knowledge Graph extractions, using various providers:
174 |
175 |
176 |
177 | ```hljs csharp
178 | async def main():
179 | await extract_openai_pricing()
180 | await extract_knowledge_graph()
181 |
182 | if __name__ == "__main__":
183 | asyncio.run(main())
184 |
185 | ```
186 |
187 |
188 | #### **8\. Wrap Up & Next Steps**
189 |
190 | - Recap the power of LLM extraction for handling unstructured or complex data extraction tasks.
191 | - Tease the next video: **10.3 Cosine Similarity Strategy** for clustering similar content based on semantic similarity.
192 |
193 | * * *
194 |
195 | This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases.
196 |
197 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/113 cosine strategy - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine
4 |
5 | ### Quick Intro
6 |
7 | Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site.
8 |
9 | Here’s a structured outline for the **Cosine Similarity Strategy** video, covering key concepts, configuration, and a practical example.
10 |
11 | * * *
12 |
13 | ### **10.3 Cosine Similarity Strategy**
14 |
15 | #### **1\. Introduction to Cosine Similarity Strategy**
16 |
17 | - The Cosine Similarity Strategy clusters content by semantic similarity, offering an efficient alternative to LLM-based extraction, especially when speed is a priority.
18 | - Ideal for grouping similar sections of text, this strategy is well-suited for pages with content sections that may need to be classified or tagged, like news articles, product descriptions, or reviews.
19 |
20 | #### **2\. Key Configuration Options**
21 |
22 | - **semantic\_filter**: A keyword-based filter to focus on relevant content.
23 | - **word\_count\_threshold**: Minimum number of words per cluster, filtering out shorter, less meaningful clusters.
24 | - **max\_dist**: Maximum allowable distance between elements in clusters, impacting cluster tightness.
25 | - **linkage\_method**: Method for hierarchical clustering, such as `'ward'` (for well-separated clusters).
26 | - **top\_k**: Specifies the number of top categories for each cluster.
27 | - **model\_name**: Defines the model for embeddings, such as `sentence-transformers/all-MiniLM-L6-v2`.
28 | - **sim\_threshold**: Minimum similarity threshold for filtering, allowing control over cluster relevance.
29 |
30 | #### **3\. How Cosine Similarity Clustering Works**
31 |
32 | - **Step 1**: Embeddings are generated for each text section, transforming them into vectors that capture semantic meaning.
33 | - **Step 2**: Hierarchical clustering groups similar sections based on cosine similarity, forming clusters with related content.
34 | - **Step 3**: Clusters are filtered based on word count, removing those below the `word_count_threshold`.
35 | - **Step 4**: Each cluster is then categorized with tags, if enabled, providing context to each grouped content section.
36 |
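- **Illustrative Sketch of the Similarity Computation** (not Crawl4AI’s internal code; assumes `sentence-transformers` and `numpy` are installed): embed a few sections with the same `all-MiniLM-L6-v2` model and compute their pairwise cosine similarities, the quantity the clustering step operates on.

```hljs python
import numpy as np
from sentence_transformers import SentenceTransformer

sections = [
    "The economy is showing signs of recovery, with markets up this quarter.",
    "Market analysts are optimistic about continued growth in tech stocks.",
    "In the sports world, several major teams are preparing for the upcoming season."
]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(sections)  # one vector per section

# Normalize so that the dot product equals the cosine similarity
normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity = normed @ normed.T
print(np.round(similarity, 2))  # high values -> sections likely to land in the same cluster
```
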
37 | #### **4\. Example Use Case: Clustering Blog Article Sections**
38 |
39 | - **Goal**: Group related sections of a blog or news page to identify distinct topics or discussion areas.
40 | - **Example HTML Sections**:
41 |
42 |
43 |
44 |
45 | ```hljs bash
46 | "The economy is showing signs of recovery, with markets up this quarter.",
47 | "In the sports world, several major teams are preparing for the upcoming season.",
48 | "New advancements in AI technology are reshaping the tech landscape.",
49 | "Market analysts are optimistic about continued growth in tech stocks."
50 |
51 | ```
52 |
53 | - **Code Setup**:
54 |
55 |
56 |
57 |
58 | ```hljs csharp
59 | async def extract_blog_sections():
60 | extraction_strategy = CosineStrategy(
61 | word_count_threshold=15,
62 | max_dist=0.3,
63 | sim_threshold=0.2,
64 | model_name="sentence-transformers/all-MiniLM-L6-v2",
65 | top_k=2
66 | )
67 | async with AsyncWebCrawler() as crawler:
68 | url = "https://example.com/blog-page"
69 | result = await crawler.arun(
70 | url=url,
71 | extraction_strategy=extraction_strategy,
72 | bypass_cache=True
73 | )
74 | print(result.extracted_content)
75 |
76 | ```
77 |
78 | - **Explanation**:
79 | - **word\_count\_threshold**: Ensures only clusters with meaningful content are included.
80 | - **sim\_threshold**: Filters out clusters with low similarity, focusing on closely related sections.
81 | - **top\_k**: Selects top tags, useful for identifying main topics.
82 |
83 | #### **5\. Applying Semantic Filtering with Cosine Similarity**
84 |
85 | - **Semantic Filter**: Filters sections based on relevance to a specific keyword, such as “technology” for tech articles.
86 | - **Example Code**:
87 |
88 |
89 |
90 | ```hljs makefile
91 | extraction_strategy = CosineStrategy(
92 | semantic_filter="technology",
93 | word_count_threshold=10,
94 | max_dist=0.25,
95 | model_name="sentence-transformers/all-MiniLM-L6-v2"
96 | )
97 |
98 | ```
99 |
100 | - **Explanation**:
101 | - **semantic\_filter**: Only sections with high similarity to the “technology” keyword will be included in the clustering, making it easy to focus on specific topics within a mixed-content page.
102 |
103 | #### **6\. Clustering Product Reviews by Similarity**
104 |
105 | - **Goal**: Organize product reviews by themes, such as “price,” “quality,” or “durability.”
106 | - **Example Reviews**:
107 |
108 |
109 |
110 |
111 | ```hljs css
112 | "The quality of this product is outstanding and well worth the price.",
113 | "I found the product to be durable but a bit overpriced.",
114 | "Great value for the money and long-lasting.",
115 | "The build quality is good, but I expected a lower price point."
116 |
117 | ```
118 |
119 | - **Code Setup**:
120 |
121 |
122 |
123 |
124 | ```hljs csharp
125 | async def extract_product_reviews():
126 | extraction_strategy = CosineStrategy(
127 | word_count_threshold=20,
128 | max_dist=0.35,
129 | sim_threshold=0.25,
130 | model_name="sentence-transformers/all-MiniLM-L6-v2"
131 | )
132 | async with AsyncWebCrawler() as crawler:
133 | url = "https://example.com/product-reviews"
134 | result = await crawler.arun(
135 | url=url,
136 | extraction_strategy=extraction_strategy,
137 | bypass_cache=True
138 | )
139 | print(result.extracted_content)
140 |
141 | ```
142 |
143 | - **Explanation**:
144 | - This configuration clusters similar reviews, grouping feedback by common themes, helping businesses understand customer sentiments around particular product aspects.
145 |
146 | #### **7\. Performance Advantages of Cosine Strategy**
147 |
148 | - **Speed**: The Cosine Similarity Strategy is faster than LLM-based extraction, as it doesn’t rely on API calls to external LLMs.
149 | - **Local Processing**: The strategy runs locally with pre-trained sentence embeddings, ideal for high-throughput scenarios where cost and latency are concerns.
150 | - **Comparison**: With a well-optimized local model, this method can perform clustering on large datasets quickly, making it suitable for tasks requiring rapid, repeated analysis.
151 |
152 | #### **8\. Full Code Example for Clustering News Articles**
153 |
154 | - **Code**:
155 |
156 |
157 |
158 | ```hljs csharp
159 | async def main():
160 | await extract_blog_sections()
161 | await extract_product_reviews()
162 |
163 | if __name__ == "__main__":
164 | asyncio.run(main())
165 |
166 | ```
167 |
168 |
169 | #### **9\. Wrap Up & Next Steps**
170 |
171 | - Recap the efficiency and effectiveness of Cosine Similarity for clustering related content quickly.
172 | - Close with a reminder of Crawl4AI’s flexibility across extraction strategies, and prompt users to experiment with different settings to optimize clustering for their specific content.
173 |
174 | * * *
175 |
176 | This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently.
177 |
178 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/12 session crawling - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 12: Session-Based Crawling for Dynamic Websites
4 |
5 | ### Quick Intro
6 |
7 | Show session management for handling websites with multiple pages or actions (like “load more” buttons). Demo: Crawl a paginated content page, persisting session data across multiple requests.
8 |
9 | Here’s a detailed outline for the **Session-Based Crawling for Dynamic Websites** video, explaining why sessions are necessary, how to use them, and providing practical examples and a visual diagram to illustrate the concept.
10 |
11 | * * *
12 |
13 | ### **11\. Session-Based Crawling for Dynamic Websites**
14 |
15 | #### **1\. Introduction to Session-Based Crawling**
16 |
17 | - **What is Session-Based Crawling**: Session-based crawling maintains a continuous browsing session across multiple page states, allowing the crawler to interact with a page and retrieve content that loads dynamically or based on user interactions.
18 | - **Why It’s Needed**:
19 | - In static pages, all content is available directly from a single URL.
20 | - In dynamic websites, content often loads progressively or based on user actions (e.g., clicking “load more,” submitting forms, scrolling).
21 | - Session-based crawling helps simulate user actions, capturing content that is otherwise hidden until specific actions are taken.
22 |
23 | #### **2\. Conceptual Diagram for Session-Based Crawling**
24 |
25 | ```mermaid
26 | graph TD
27 | Start[Start Session] --> S1["Initial State (S1)"]
28 | S1 -->|Crawl| Content1[Extract Content S1]
29 | S1 -->|Action: Click Load More| S2[State S2]
30 | S2 -->|Crawl| Content2[Extract Content S2]
31 | S2 -->|Action: Scroll Down| S3[State S3]
32 | S3 -->|Crawl| Content3[Extract Content S3]
33 | S3 -->|Action: Submit Form| S4[Final State]
34 | S4 -->|Crawl| Content4[Extract Content S4]
35 | Content4 --> End[End Session]
36 |
37 | ```
38 |
39 | - **Explanation of Diagram**:
40 | - **Start**: Initializes the session and opens the starting URL.
41 | - **State Transitions**: Each action (e.g., clicking “load more,” scrolling) transitions to a new state, where additional content becomes available.
42 | - **Session Persistence**: Keeps the same browsing session active, preserving the state and allowing for a sequence of actions to unfold.
43 | - **End**: After reaching the final state, the session ends, and all accumulated content has been extracted.
44 |
45 | #### **3\. Key Components of Session-Based Crawling in Crawl4AI**
46 |
47 | - **Session ID**: A unique identifier to maintain the state across requests, allowing the crawler to “remember” previous actions.
48 | - **JavaScript Execution**: Executes JavaScript commands (e.g., clicks, scrolls) to simulate interactions.
49 | - **Wait Conditions**: Ensures the crawler waits for content to load in each state before moving on.
50 | - **Sequential State Transitions**: By defining actions and wait conditions between states, the crawler can navigate through the page as a user would.
51 |
52 | #### **4\. Basic Session Example: Multi-Step Content Loading**
53 |
54 | - **Goal**: Crawl an article feed that requires several “load more” clicks to display additional content.
55 | - **Code**:
56 |
57 |
58 |
59 | ```hljs python
60 | async def crawl_article_feed():
61 | async with AsyncWebCrawler() as crawler:
62 | session_id = "feed_session"
63 |
64 | for page in range(3):
65 | result = await crawler.arun(
66 | url="https://example.com/articles",
67 | session_id=session_id,
68 | js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
69 | wait_for="css:.article",
70 | css_selector=".article" # Target article elements
71 | )
72 | print(f"Page {page + 1}: Extracted {len(result.extracted_content)} articles")
73 |
74 | ```
75 |
76 | - **Explanation**:
77 | - **session\_id**: Ensures all requests share the same browsing state.
78 | - **js\_code**: Clicks the “load more” button after the initial page load, expanding content on each iteration.
79 | - **wait\_for**: Ensures articles have loaded after each click before extraction.
80 |
81 | #### **5\. Advanced Example: E-Commerce Product Search with Filter Selection**
82 |
83 | - **Goal**: Interact with filters on an e-commerce page to extract products based on selected criteria.
84 | - **Example Steps**:
85 | 1. **State 1**: Load the main product page.
86 | 2. **State 2**: Apply a filter (e.g., “On Sale”) by selecting a checkbox.
87 | 3. **State 3**: Scroll to load additional products and capture updated results.
88 | - **Code**:
89 |
90 |
91 |
92 |
93 | ```hljs python
94 | async def extract_filtered_products():
95 | async with AsyncWebCrawler() as crawler:
96 | session_id = "product_session"
97 |
98 | # Step 1: Open product page
99 | result = await crawler.arun(
100 | url="https://example.com/products",
101 | session_id=session_id,
102 | wait_for="css:.product-item"
103 | )
104 |
105 | # Step 2: Apply filter (e.g., "On Sale")
106 | result = await crawler.arun(
107 | url="https://example.com/products",
108 | session_id=session_id,
109 | js_code="document.querySelector('#sale-filter-checkbox').click();",
110 | wait_for="css:.product-item"
111 | )
112 |
113 | # Step 3: Scroll to load additional products
114 | for _ in range(2): # Scroll down twice
115 | result = await crawler.arun(
116 | url="https://example.com/products",
117 | session_id=session_id,
118 | js_code="window.scrollTo(0, document.body.scrollHeight);",
119 | wait_for="css:.product-item"
120 | )
121 | print(f"Loaded {len(result.extracted_content)} products after scroll")
122 |
123 | ```
124 |
125 | - **Explanation**:
126 | - **State Persistence**: Each action (filter selection and scroll) builds on the previous session state.
127 | - **Multiple Interactions**: Combines clicking a filter with scrolling, demonstrating how the session preserves these actions.
128 |
129 | #### **6\. Key Benefits of Session-Based Crawling**
130 |
131 | - **Accessing Hidden Content**: Retrieves data that loads only after user actions.
132 | - **Simulating User Behavior**: Handles interactive elements such as “load more” buttons, dropdowns, and filters.
133 | - **Maintaining Continuity Across States**: Enables a sequential process, moving logically from one state to the next, capturing all desired content without reloading the initial state each time.
134 |
135 | #### **7\. Additional Configuration Tips**
136 |
137 | - **Manage Session End**: Always conclude the session after the final state to release resources.
138 | - **Optimize with Wait Conditions**: Use `wait_for` to ensure complete loading before each extraction.
139 | - **Handling Errors in Session-Based Crawling**: Include error handling for interactions that may fail, ensuring robustness across state transitions.
140 |
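- **Ending a Session Explicitly**: A short sketch for the first tip above; the `kill_session` call on the crawler strategy is an assumption to verify against the session management reference.

```hljs python
from crawl4ai import AsyncWebCrawler

async def crawl_with_cleanup():
    session_id = "cleanup_session"
    async with AsyncWebCrawler() as crawler:
        try:
            result = await crawler.arun(
                url="https://example.com/products",
                session_id=session_id,
                wait_for="css:.product-item"
            )
            print(len(result.html))
        finally:
            # Release the browser state held for this session, even if a step failed
            await crawler.crawler_strategy.kill_session(session_id)
```
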
141 | #### **8\. Complete Code Example: Multi-Step Session Workflow**
142 |
143 | - **Example**:
144 |
145 |
146 |
147 | ```hljs csharp
148 | async def main():
149 | await crawl_article_feed()
150 | await extract_filtered_products()
151 |
152 | if __name__ == "__main__":
153 | asyncio.run(main())
154 |
155 | ```
156 |
157 |
158 | #### **9\. Wrap Up & Next Steps**
159 |
160 | - Recap the usefulness of session-based crawling for dynamic content extraction.
161 | - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler** to cover advanced customization options for further control over the crawling process.
162 |
163 | * * *
164 |
165 | This outline covers session-based crawling from both a conceptual and practical perspective, helping users understand its importance, configure it effectively, and use it to handle complex dynamic content.
166 |
167 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/13 text chunking - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 13: Chunking Strategies for Large Text Processing
4 |
5 | ### Quick Intro
6 |
7 | Explain Regex, NLP, and Fixed-Length chunking, and when to use each. Demo: Chunk a large article or document for processing by topics or sentences.
8 |
9 | Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, explaining each chunking strategy, when to use it, and how chunking supports effective data aggregation during extraction, with examples to illustrate.
10 |
11 |
12 |
13 | * * *
14 |
15 | ### **12\. Chunking Strategies for Large Text Processing**
16 |
17 | #### **1\. Introduction to Chunking in Crawl4AI**
18 |
19 | - **What is Chunking**: Chunking is the process of dividing large text into manageable sections or “chunks,” enabling efficient processing in extraction tasks.
20 | - **Why It’s Needed**:
21 | - When processing large text, feeding it directly into an extraction function (like `F(x)`) can overwhelm memory or token limits.
22 | - Chunking breaks down `x` (the text) into smaller pieces, which are processed sequentially or in parallel by the extraction function, with the final result being an aggregation of all chunks’ processed output.
23 |
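- **Minimal Illustration of the Idea** (generic Python, not a specific Crawl4AI API): split the text, apply the extraction function to each piece, then aggregate the partial results.

```hljs python
def chunk_words(text: str, size: int = 200) -> list[str]:
    """Naive fixed-length word chunking."""
    words = text.split()
    return [" ".join(words[i:i + size]) for i in range(0, len(words), size)]

def extract(chunk: str) -> list[str]:
    """Stand-in for the extraction function F(x); here it just collects capitalized words."""
    return [w for w in chunk.split() if w.istitle()]

def process_large_text(text: str) -> list[str]:
    results = []
    for chunk in chunk_words(text):
        results.extend(extract(chunk))  # process each chunk independently
    return results                      # final output aggregates all chunks
```
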
24 | #### **2\. Key Chunking Strategies and Use Cases**
25 |
26 | - Crawl4AI offers various chunking strategies to suit different text structures, chunk sizes, and processing requirements.
27 | - **Choosing a Strategy**: Select based on the type of text (e.g., articles, transcripts) and extraction needs (e.g., simple splitting or context-sensitive processing).
28 |
29 | #### **3\. Strategy 1: Regex-Based Chunking**
30 |
31 | - **Description**: Uses regular expressions to split text based on specified patterns (e.g., paragraphs or section breaks).
32 | - **Use Case**: Ideal for dividing text by paragraphs or larger logical blocks where sections are clearly separated by line breaks or punctuation.
33 | - **Example**:
34 | - **Pattern**: `r'\n\n'` for double line breaks.
35 |
36 |
37 |
38 | ```hljs python
39 | chunker = RegexChunking(patterns=[r'\n\n'])
40 | text_chunks = chunker.chunk(long_text)
41 | print(text_chunks) # Output: List of paragraphs
42 |
43 | ```
44 | - **Pros**: Flexible for pattern-based chunking.
45 | - **Cons**: Limited to text with consistent formatting.
46 |
47 | #### **4\. Strategy 2: NLP Sentence-Based Chunking**
48 |
49 | - **Description**: Uses NLP to split text by sentences, ensuring grammatically complete segments.
50 | - **Use Case**: Useful for extracting individual statements, such as in news articles, quotes, or legal text.
51 | - **Example**:
52 |
53 |
54 |
55 | ```hljs makefile
56 | chunker = NlpSentenceChunking()
57 | sentence_chunks = chunker.chunk(long_text)
58 | print(sentence_chunks) # Output: List of sentences
59 |
60 | ```
61 |
62 | - **Pros**: Maintains sentence structure, ideal for tasks needing semantic completeness.
63 | - **Cons**: May create very small chunks, which could limit contextual extraction.
64 |
65 | #### **5\. Strategy 3: Topic-Based Segmentation Using TextTiling**
66 |
67 | - **Description**: Segments text into topics using TextTiling, identifying topic shifts and key segments.
68 | - **Use Case**: Ideal for long articles, reports, or essays where each section covers a different topic.
69 | - **Example**:
70 |
71 |
72 |
73 | ```hljs makefile
74 | chunker = TopicSegmentationChunking(num_keywords=3)
75 | topic_chunks = chunker.chunk_with_topics(long_text)
76 | print(topic_chunks) # Output: List of topic segments with keywords
77 |
78 | ```
79 |
80 | - **Pros**: Groups related content, preserving topical coherence.
81 | - **Cons**: Depends on identifiable topic shifts, which may not be present in all texts.
82 |
83 | #### **6\. Strategy 4: Fixed-Length Word Chunking**
84 |
85 | - **Description**: Splits text into chunks based on a fixed number of words.
86 | - **Use Case**: Ideal for text where exact segment size is required, such as processing word-limited documents for LLMs.
87 | - **Example**:
88 |
89 |
90 |
91 | ```hljs makefile
92 | chunker = FixedLengthWordChunking(chunk_size=100)
93 | word_chunks = chunker.chunk(long_text)
94 | print(word_chunks) # Output: List of 100-word chunks
95 |
96 | ```
97 |
98 | - **Pros**: Ensures uniform chunk sizes, suitable for token-based extraction limits.
99 | - **Cons**: May split sentences, affecting semantic coherence.
100 |
101 | #### **7\. Strategy 5: Sliding Window Chunking**
102 |
103 | - **Description**: Uses a fixed window size with a step, creating overlapping chunks to maintain context.
104 | - **Use Case**: Useful for maintaining context across sections, as with documents where context is needed for neighboring sections.
105 | - **Example**:
106 |
107 |
108 |
109 | ```hljs makefile
110 | chunker = SlidingWindowChunking(window_size=100, step=50)
111 | window_chunks = chunker.chunk(long_text)
112 | print(window_chunks) # Output: List of overlapping word chunks
113 |
114 | ```
115 |
116 | - **Pros**: Retains context across adjacent chunks, ideal for complex semantic extraction.
117 | - **Cons**: Overlap increases data size, potentially impacting processing time.
118 |
119 | #### **8\. Strategy 6: Overlapping Window Chunking**
120 |
121 | - **Description**: Similar to sliding windows but with a defined overlap, allowing chunks to share content at the edges.
122 | - **Use Case**: Suitable for handling long texts with essential overlapping information, like research articles or medical records.
123 | - **Example**:
124 |
125 |
126 |
127 | ```hljs makefile
128 | chunker = OverlappingWindowChunking(window_size=1000, overlap=100)
129 | overlap_chunks = chunker.chunk(long_text)
130 | print(overlap_chunks) # Output: List of overlapping chunks with defined overlap
131 |
132 | ```
133 |
134 | - **Pros**: Allows controlled overlap for consistent content coverage across chunks.
135 | - **Cons**: Redundant data in overlapping areas may increase computation.
136 |
137 | #### **9\. Practical Example: Using Chunking with an Extraction Strategy**
138 |
139 | - **Goal**: Combine chunking with an extraction strategy to process large text effectively.
140 | - **Example Code**:
141 |
142 |
143 |
144 |
145 | ```hljs python
146 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
147 |
148 | async def extract_large_text():
149 | # Initialize chunker and extraction strategy
150 | chunker = FixedLengthWordChunking(chunk_size=200)
151 | extraction_strategy = LLMExtractionStrategy(provider="openai/gpt-4", api_token="your_api_token")
152 |
153 | # Split text into chunks
154 | text_chunks = chunker.chunk(large_text)
155 |
156 | async with AsyncWebCrawler() as crawler:
157 | for chunk in text_chunks:
158 | result = await crawler.arun(
159 | url="https://example.com",
160 | extraction_strategy=extraction_strategy,
161 | content=chunk
162 | )
163 | print(result.extracted_content)
164 |
165 | ```
166 |
167 | - **Explanation**:
168 | - `chunker.chunk()`: Divides the `large_text` into smaller segments based on the chosen strategy.
169 | - `extraction_strategy`: Processes each chunk separately, and results are then aggregated to form the final output.
170 |
171 | #### **10\. Choosing the Right Chunking Strategy**
172 |
173 | - **Text Structure**: If text has clear sections (e.g., paragraphs, topics), use Regex or Topic Segmentation.
174 | - **Extraction Needs**: If context is crucial, consider Sliding or Overlapping Window Chunking.
175 | - **Processing Constraints**: For word-limited extractions (e.g., LLMs with token limits), Fixed-Length Word Chunking is often most effective.
176 |
177 | #### **11\. Wrap Up & Next Steps**
178 |
179 | - Recap the benefits of each chunking strategy and when to use them in extraction workflows.
180 | - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler**, focusing on customizing crawler behavior with hooks for a fine-tuned extraction process.
181 |
182 | * * *
183 |
184 | This outline provides a complete understanding of chunking strategies, explaining each method’s strengths and best-use scenarios to help users process large texts effectively in Crawl4AI.
185 |
186 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/14 custom workflows - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 14: Hooks and Custom Workflow with AsyncWebCrawler
4 |
5 | ### Quick Intro
6 |
7 | Cover hooks ( `on_browser_created`, `before_goto`, `after_goto`) to add custom workflows. Demo: Use hooks to add custom cookies or headers, log HTML, or trigger specific events on page load.
8 |
9 | Here’s a detailed outline for the **Hooks and Custom Workflow with AsyncWebCrawler** video, covering each hook’s purpose, usage, and example implementations.
10 |
11 | * * *
12 |
13 | ### **13\. Hooks and Custom Workflow with AsyncWebCrawler**
14 |
15 | #### **1\. Introduction to Hooks in Crawl4AI**
16 |
17 | - **What are Hooks**: Hooks are customizable entry points in the crawling process that allow users to inject custom actions or logic at specific stages.
18 | - **Why Use Hooks**:
19 | - They enable fine-grained control over the crawling workflow.
20 | - Useful for performing additional tasks (e.g., logging, modifying headers) dynamically during the crawl.
21 | - Hooks provide the flexibility to adapt the crawler to complex site structures or unique project needs.
22 |
23 | #### **2\. Overview of Available Hooks**
24 |
25 | - Crawl4AI offers seven key hooks to modify and control different stages in the crawling lifecycle:
26 | - `on_browser_created`
27 | - `on_user_agent_updated`
28 | - `on_execution_started`
29 | - `before_goto`
30 | - `after_goto`
31 | - `before_return_html`
32 | - `before_retrieve_html`
33 |
34 | #### **3\. Hook-by-Hook Explanation and Examples**
35 |
36 | * * *
37 |
38 | ##### **Hook 1: `on_browser_created`**
39 |
40 | - **Purpose**: Triggered right after the browser instance is created.
41 | - **Use Case**:
42 | - Initializing browser-specific settings or performing setup actions.
43 | - Configuring browser extensions or scripts before any page is opened.
44 | - **Example**:
45 |
46 |
47 |
48 | ```hljs python
49 | async def log_browser_creation(browser):
50 | print("Browser instance created:", browser)
51 |
52 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation)
53 |
54 | ```
55 |
56 | - **Explanation**: This hook logs the browser creation event, useful for tracking when a new browser instance starts.
57 |
58 | * * *
59 |
60 | ##### **Hook 2: `on_user_agent_updated`**
61 |
62 | - **Purpose**: Called whenever the user agent string is updated.
63 | - **Use Case**:
64 | - Modifying the user agent based on page requirements, e.g., changing to a mobile user agent for mobile-only pages.
65 | - **Example**:
66 |
67 |
68 |
69 | ```hljs scss
70 | def update_user_agent(user_agent):
71 | print(f"User Agent Updated: {user_agent}")
72 |
73 | crawler.crawler_strategy.set_hook('on_user_agent_updated', update_user_agent)
74 | crawler.update_user_agent("Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)")
75 |
76 | ```
77 |
78 | - **Explanation**: This hook provides a callback every time the user agent changes, helpful for debugging or dynamically altering user agent settings based on conditions.
79 |
80 | * * *
81 |
82 | ##### **Hook 3: `on_execution_started`**
83 |
84 | - **Purpose**: Called right before the crawler begins any interaction (e.g., JavaScript execution, clicks).
85 | - **Use Case**:
86 | - Performing setup actions, such as inserting cookies or initiating custom scripts.
87 | - **Example**:
88 |
89 |
90 |
91 | ```hljs python
92 | async def log_execution_start(page):
93 | print("Execution started on page:", page.url)
94 |
95 | crawler.crawler_strategy.set_hook('on_execution_started', log_execution_start)
96 |
97 | ```
98 |
99 | - **Explanation**: Logs the start of any major interaction on the page, ideal for cases where you want to monitor each interaction.
100 |
101 | * * *
102 |
103 | ##### **Hook 4: `before_goto`**
104 |
105 | - **Purpose**: Triggered before navigating to a new URL with `page.goto()`.
106 | - **Use Case**:
107 | - Modifying request headers or setting up conditions right before the page loads.
108 | - Adding headers or dynamically adjusting options for specific URLs.
109 | - **Example**:
110 |
111 |
112 |
113 | ```hljs python
114 | async def modify_headers_before_goto(page):
115 | await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"})
116 | print("Custom headers set before navigation")
117 |
118 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto)
119 |
120 | ```
121 |
122 | - **Explanation**: This hook allows injecting headers or altering settings based on the page’s needs, particularly useful for pages with custom requirements.
123 |
124 | * * *
125 |
126 | ##### **Hook 5: `after_goto`**
127 |
128 | - **Purpose**: Executed immediately after a page has loaded (after `page.goto()`).
129 | - **Use Case**:
130 | - Checking the loaded page state, modifying the DOM, or performing post-navigation actions (e.g., scrolling).
131 | - **Example**:
132 |
133 |
134 |
135 | ```hljs python
136 | async def post_navigation_scroll(page):
137 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
138 | print("Scrolled to the bottom after navigation")
139 |
140 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll)
141 |
142 | ```
143 |
144 | - **Explanation**: This hook scrolls to the bottom of the page after loading, which can help load dynamically added content like infinite scroll elements.
145 |
146 | * * *
147 |
148 | ##### **Hook 6: `before_return_html`**
149 |
150 | - **Purpose**: Called right before HTML content is retrieved and returned.
151 | - **Use Case**:
152 | - Removing overlays or cleaning up the page for a cleaner HTML extraction.
153 | - **Example**:
154 |
155 |
156 |
157 | ```hljs python
158 | async def remove_advertisements(page, html):
159 | await page.evaluate("document.querySelectorAll('.ad-banner').forEach(el => el.remove());")
160 | print("Advertisements removed before returning HTML")
161 |
162 | crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements)
163 |
164 | ```
165 |
166 | - **Explanation**: The hook removes ad banners from the HTML before it’s retrieved, ensuring a cleaner data extraction.
167 |
168 | * * *
169 |
170 | ##### **Hook 7: `before_retrieve_html`**
171 |
172 | - **Purpose**: Runs right before Crawl4AI initiates HTML retrieval.
173 | - **Use Case**:
174 | - Finalizing any page adjustments (e.g., setting timers, waiting for specific elements).
175 | - **Example**:
176 |
177 |
178 |
179 | ```hljs python
180 | async def wait_for_content_before_retrieve(page):
181 | await page.wait_for_selector('.main-content')
182 | print("Main content loaded, ready to retrieve HTML")
183 |
184 | crawler.crawler_strategy.set_hook('before_retrieve_html', wait_for_content_before_retrieve)
185 |
186 | ```
187 |
188 | - **Explanation**: This hook waits for the main content to load before retrieving the HTML, ensuring that all essential content is captured.
189 |
190 | #### **4\. Setting Hooks in Crawl4AI**
191 |
192 | - **How to Set Hooks**:
193 | - Use `set_hook` to define a custom function for each hook.
194 | - Each hook function can be asynchronous (useful for actions like waiting or retrieving async data).
195 | - **Example Setup**:
196 |
197 |
198 |
199 | ```hljs bash
200 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation)
201 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto)
202 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll)
203 |
204 | ```
205 |
206 |
207 | #### **5\. Complete Example: Using Hooks for a Customized Crawl Workflow**
208 |
209 | - **Goal**: Log each key step, set custom headers before navigation, and clean up the page before retrieving HTML.
210 | - **Example Code**:
211 |
212 |
213 |
214 | ```hljs python
215 | async def custom_crawl():
216 | async with AsyncWebCrawler() as crawler:
217 | # Set hooks for custom workflow
218 | crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation)
219 | crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto)
220 | crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll)
221 | crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements)
222 |
223 | # Perform the crawl
224 | url = "https://example.com"
225 | result = await crawler.arun(url=url)
226 | print(result.html) # Display or process HTML
227 |
228 | ```
229 |
230 |
231 | #### **6\. Benefits of Using Hooks in Custom Crawling Workflows**
232 |
233 | - **Enhanced Control**: Hooks offer precise control over each stage, allowing adjustments based on content and structure.
234 | - **Efficient Modifications**: Avoid reloading or restarting the session; hooks can alter actions dynamically.
235 | - **Context-Sensitive Actions**: Hooks enable custom logic tailored to specific pages or sections, maximizing extraction quality.
236 |
237 | #### **7\. Wrap Up & Next Steps**
238 |
239 | - Recap how hooks empower customized workflows in Crawl4AI, enabling flexibility at every stage.
240 | - Tease the next video: **Automating Post-Processing with Crawl4AI**, covering automated steps after data extraction.
241 |
242 | * * *
243 |
244 | This outline provides a thorough understanding of hooks, their practical applications, and examples for customizing the crawling workflow in Crawl4AI.
245 |
246 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/2 advanced features - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 2: Overview of Advanced Features
4 |
5 | ### Quick Intro
6 |
7 | A general overview of advanced features like hooks, CSS selectors, and JSON CSS extraction.
8 |
9 | Here's a condensed outline for an **Overview of Advanced Features** video covering Crawl4AI's powerful customization and extraction options:
10 |
11 | * * *
12 |
13 | ### **Overview of Advanced Features**
14 |
15 | 1) **Introduction to Advanced Features**:
16 |
17 | - Briefly introduce Crawl4AI’s advanced tools, which let users go beyond basic crawling to customize and fine-tune their scraping workflows.
18 |
19 | 2) **Taking Screenshots**:
20 |
21 | - Explain the screenshot capability for capturing page state and verifying content.
22 | - **Example**:
23 |
24 |
25 |
26 | ```hljs ini
27 | result = await crawler.arun(url="https://www.example.com", screenshot=True)
28 |
29 | ```
30 |
31 | - Mention that screenshots are saved as a base64-encoded string in `result.screenshot`, allowing easy decoding and saving (see the sketch below).
32 |
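For reference, a minimal sketch of decoding and saving that screenshot (the same pattern appears in the browser-configuration reference later in this documentation set); it assumes `result` comes from an `arun()` call with `screenshot=True`:

```hljs python
import base64

if result.screenshot:  # base64-encoded image data
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
```
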
33 | 3) **Media and Link Extraction**:
34 |
35 | - Demonstrate how to pull all media (images, videos) and links (internal and external) from a page for deeper analysis or content gathering.
36 | - **Example**:
37 |
38 |
39 |
40 | ```hljs python
41 | result = await crawler.arun(url="https://www.example.com")
42 | print("Media:", result.media)
43 | print("Links:", result.links)
44 |
45 | ```
46 |
47 |
48 | 4) **Custom User Agent**:
49 |
50 | - Show how to set a custom user agent to disguise the crawler or simulate specific devices/browsers.
51 | - **Example**:
52 |
53 |
54 |
55 | ```hljs ini
56 | result = await crawler.arun(url="https://www.example.com", user_agent="Mozilla/5.0 (compatible; MyCrawler/1.0)")
57 |
58 | ```
59 |
60 |
61 | 5) **Custom Hooks for Enhanced Control**:
62 |
63 | - Briefly cover how to use hooks, which allow custom actions like setting headers or handling login during the crawl.
64 | - **Example**: Setting a custom header with the `before_goto` hook.
65 |
66 |
67 |
68 | ```hljs python
69 | async def add_custom_header(page):
70 |     await page.set_extra_http_headers({"X-Test-Header": "test"})
71 | crawler.crawler_strategy.set_hook('before_goto', add_custom_header)
72 | ```
73 |
74 |
75 | 6) **CSS Selectors for Targeted Extraction**:
76 |
77 | - Explain the use of CSS selectors to extract specific elements, ideal for structured data like articles or product details.
78 | - **Example**:
79 |
80 |
81 |
82 | ```hljs python
83 | result = await crawler.arun(url="https://www.example.com", css_selector="h2")
84 | print("H2 Tags:", result.extracted_content)
85 |
86 | ```
87 |
88 |
89 | 7) **Crawling Inside Iframes**:
90 |
91 | - Mention how enabling `process_iframes=True` allows extracting content within iframes, useful for sites with embedded content or ads.
92 | - **Example**:
93 |
94 |
95 |
96 | ```hljs ini
97 | result = await crawler.arun(url="https://www.example.com", process_iframes=True)
98 |
99 | ```
100 |
101 |
102 | 8) **Wrap-Up**:
103 |
104 | - Summarize these advanced features and how they allow users to customize every part of their web scraping experience.
105 | - Tease upcoming videos where each feature will be explored in detail.
106 |
107 | * * *
108 |
109 | This covers each advanced feature with a brief example, providing a useful overview to prepare viewers for the more in-depth videos.
110 |
111 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/3 browser setup - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 3: Browser Configurations & Headless Crawling
4 |
5 | ### Quick Intro
6 |
7 | Explain browser options ( `chromium`, `firefox`, `webkit`) and settings for headless mode, caching, and verbose logging.
8 |
9 | Here’s a streamlined outline for the **Browser Configurations & Headless Crawling** video:
10 |
11 | * * *
12 |
13 | ### **Browser Configurations & Headless Crawling**
14 |
15 | 1) **Overview of Browser Options**:
16 |
17 | - Crawl4AI supports three browser engines:
18 | - **Chromium** (default) - Highly compatible.
19 | - **Firefox** \- Great for specialized use cases.
20 | - **Webkit** \- Lightweight, ideal for basic needs.
21 | - **Example**:
22 |
23 |
24 |
25 | ```hljs ini
26 | # Using Chromium (default)
27 | crawler = AsyncWebCrawler(browser_type="chromium")
28 |
29 | # Using Firefox
30 | crawler = AsyncWebCrawler(browser_type="firefox")
31 |
32 | # Using WebKit
33 | crawler = AsyncWebCrawler(browser_type="webkit")
34 |
35 | ```
36 |
37 |
38 | 2) **Headless Mode**:
39 |
40 | - Headless mode runs the browser without a visible GUI, making it faster and less resource-intensive.
41 | - To enable or disable:
42 |
43 |
44 |
45 | ```hljs ini
46 | # Headless mode (default is True)
47 | crawler = AsyncWebCrawler(headless=True)
48 |
49 | # Disable headless mode for debugging
50 | crawler = AsyncWebCrawler(headless=False)
51 |
52 | ```
53 |
54 |
55 | 3) **Verbose Logging**:
56 | \- Use `verbose=True` to get detailed logs for each action, useful for debugging:
57 |
58 |
59 | ```hljs ini
60 | crawler = AsyncWebCrawler(verbose=True)
61 |
62 | ```
63 |
64 | 4) **Running a Basic Crawl with Configuration**:
65 | \- Example of a simple crawl with custom browser settings:
66 |
67 |
68 | ```hljs python
69 | async with AsyncWebCrawler(browser_type="firefox", headless=True, verbose=True) as crawler:
70 | result = await crawler.arun(url="https://www.example.com")
71 | print(result.markdown[:500]) # Show first 500 characters
72 |
73 | ```
74 |
75 | \- This example uses Firefox in headless mode with logging enabled, demonstrating the flexibility of Crawl4AI’s setup.
76 |
77 | 5) **Recap & Next Steps**:
78 | \- Recap the power of selecting different browsers and running headless mode for speed and efficiency.
79 | \- Tease the next video: **Proxy & Security Settings** for navigating blocked or restricted content and protecting IP identity.
80 |
81 | * * *
82 |
83 | This breakdown covers browser configuration essentials in Crawl4AI, providing users with practical steps to optimize their scraping setup.
84 |
85 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/4 proxy settings - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 4: Advanced Proxy and Security Settings
4 |
5 | ### Quick Intro
6 |
7 | Showcase proxy configurations (HTTP, SOCKS5, authenticated proxies). Demo: Use rotating proxies and set custom headers to avoid IP blocking and enhance security.
8 |
9 | Here’s a focused outline for the **Proxy and Security Settings** video:
10 |
11 | * * *
12 |
13 | ### **Proxy & Security Settings**
14 |
15 | 1) **Why Use Proxies in Web Crawling**:
16 |
17 | - Proxies are essential for bypassing IP-based restrictions, improving anonymity, and managing rate limits.
18 | - Crawl4AI supports simple proxies, authenticated proxies, and proxy rotation for robust web scraping.
19 |
20 | 2) **Basic Proxy Setup**:
21 |
22 | - **Using a Simple Proxy**:
23 |
24 |
25 |
26 | ```hljs ini
27 | # HTTP proxy
28 | crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080")
29 |
30 | # SOCKS proxy
31 | crawler = AsyncWebCrawler(proxy="socks5://proxy.example.com:1080")
32 |
33 | ```
34 |
35 |
36 | 3) **Authenticated Proxies**:
37 |
38 | - Use `proxy_config` for proxies requiring a username and password:
39 |
40 |
41 |
42 | ```hljs makefile
43 | proxy_config = {
44 | "server": "http://proxy.example.com:8080",
45 | "username": "user",
46 | "password": "pass"
47 | }
48 | crawler = AsyncWebCrawler(proxy_config=proxy_config)
49 |
50 | ```
51 |
52 |
53 | 4) **Rotating Proxies**:
54 |
55 | - Rotating proxies helps avoid IP bans by switching IP addresses for each request:
56 |
57 |
58 |
59 | ```hljs csharp
60 | async def get_next_proxy():
61 | # Define proxy rotation logic here
62 | return {"server": "http://next.proxy.com:8080"}
63 |
64 | async with AsyncWebCrawler() as crawler:
65 | for url in urls:
66 | proxy = await get_next_proxy()
67 | crawler.update_proxy(proxy)
68 | result = await crawler.arun(url=url)
69 |
70 | ```
71 |
72 | - This setup switches the proxy for each request, improving anonymity and access.
73 |
74 | 5) **Custom Headers for Additional Security**:
75 |
76 | - Set custom headers to mask the crawler’s identity and avoid detection:
77 |
78 |
79 |
80 | ```hljs makefile
81 | headers = {
82 | "X-Forwarded-For": "203.0.113.195",
83 | "Accept-Language": "en-US,en;q=0.9",
84 | "Cache-Control": "no-cache",
85 | "Pragma": "no-cache"
86 | }
87 | crawler = AsyncWebCrawler(headers=headers)
88 |
89 | ```
90 |
91 |
92 | 6) **Combining Proxies with Magic Mode for Anti-Bot Protection**:
93 |
94 | - For sites with aggressive bot detection, combine `proxy` settings with `magic=True`:
95 |
96 |
97 |
98 | ```hljs csharp
99 | async with AsyncWebCrawler(proxy="http://proxy.example.com:8080", headers={"Accept-Language": "en-US"}) as crawler:
100 | result = await crawler.arun(
101 | url="https://example.com",
102 | magic=True # Enables anti-detection features
103 | )
104 |
105 | ```
106 |
107 | - **Magic Mode** automatically enables user simulation, random timing, and browser property masking.
108 |
109 | 7) **Wrap Up & Next Steps**:
110 |
111 | - Summarize the importance of proxies and anti-detection in accessing restricted content and avoiding bans.
112 | - Tease the next video: **JavaScript Execution and Handling Dynamic Content** for working with interactive and dynamically loaded pages.
113 |
114 | * * *
115 |
116 | This outline provides a practical guide to setting up proxies and security configurations, empowering users to navigate restricted sites while staying undetected.
117 |
118 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/5 dynamic content - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 5: JavaScript Execution and Dynamic Content Handling
4 |
5 | ### Quick Intro
6 |
7 | Explain JavaScript code injection with examples (e.g., simulating scrolling, clicking ‘load more’). Demo: Extract content from a page that uses dynamic loading with lazy-loaded images.
8 |
9 | Here’s a focused outline for the **JavaScript Execution and Dynamic Content Handling** video:
10 |
11 | * * *
12 |
13 | ### **JavaScript Execution & Dynamic Content Handling**
14 |
15 | 1) **Why JavaScript Execution Matters**:
16 |
17 | - Many modern websites load content dynamically via JavaScript, requiring special handling to access all elements.
18 | - Crawl4AI can execute JavaScript on pages, enabling it to interact with elements like “load more” buttons, infinite scrolls, and content that appears only after certain actions.
19 |
20 | 2) **Basic JavaScript Execution**:
21 |
22 | - Use `js_code` to execute JavaScript commands on a page:
23 |
24 |
25 |
26 | ```hljs makefile
27 | # Scroll to bottom of the page
28 | result = await crawler.arun(
29 | url="https://example.com",
30 | js_code="window.scrollTo(0, document.body.scrollHeight);"
31 | )
32 |
33 | ```
34 |
35 | - This command scrolls to the bottom, triggering any lazy-loaded or dynamically added content.
36 |
37 | 3) **Multiple Commands & Simulating Clicks**:
38 |
39 | - Combine multiple JavaScript commands to interact with elements like “load more” buttons:
40 |
41 |
42 |
43 | ```hljs makefile
44 | js_commands = [\
45 | "window.scrollTo(0, document.body.scrollHeight);",\
46 | "document.querySelector('.load-more').click();"\
47 | ]
48 | result = await crawler.arun(
49 | url="https://example.com",
50 | js_code=js_commands
51 | )
52 |
53 | ```
54 |
55 | - This script scrolls down and then clicks the “load more” button, useful for loading additional content blocks.
56 |
57 | 4) **Waiting for Dynamic Content**:
58 |
59 | - Use `wait_for` to ensure the page loads specific elements before proceeding:
60 |
61 |
62 |
63 | ```hljs makefile
64 | result = await crawler.arun(
65 | url="https://example.com",
66 | js_code="window.scrollTo(0, document.body.scrollHeight);",
67 | wait_for="css:.dynamic-content" # Wait for elements with class `.dynamic-content`
68 | )
69 |
70 | ```
71 |
72 | - This example waits until elements with `.dynamic-content` are loaded, helping to capture content that appears after JavaScript actions.
73 |
74 | 5) **Handling Complex Dynamic Content (e.g., Infinite Scroll)**:
75 |
76 | - Combine JavaScript execution with conditional waiting to handle infinite scrolls or paginated content:
77 |
78 |
79 |
80 | ```hljs csharp
81 | result = await crawler.arun(
82 | url="https://example.com",
83 | js_code=[\
84 | "window.scrollTo(0, document.body.scrollHeight);",\
85 | "const loadMore = document.querySelector('.load-more'); if (loadMore) loadMore.click();"\
86 | ],
87 | wait_for="js:() => document.querySelectorAll('.item').length > 10" # Wait until 10 items are loaded
88 | )
89 |
90 | ```
91 |
92 | - This example scrolls, clicks "load more" if it is present, and waits until more than ten items have loaded before returning, a pattern you can repeat for infinite scrolls or paginated content.
93 |
94 | 6) **Complete Example: Dynamic Content Handling with Extraction**:
95 |
96 | - Full example demonstrating a dynamic load and content extraction in one process:
97 |
98 |
99 |
100 | ```hljs csharp
101 | async with AsyncWebCrawler() as crawler:
102 | result = await crawler.arun(
103 | url="https://example.com",
104 | js_code=[\
105 | "window.scrollTo(0, document.body.scrollHeight);",\
106 | "document.querySelector('.load-more').click();"\
107 | ],
108 | wait_for="css:.main-content",
109 | css_selector=".main-content"
110 | )
111 | print(result.markdown[:500]) # Output the main content extracted
112 |
113 | ```
114 |
115 |
116 | 7) **Wrap Up & Next Steps**:
117 |
118 | - Recap how JavaScript execution allows access to dynamic content, enabling powerful interactions.
119 | - Tease the next video: **Content Cleaning and Fit Markdown** to show how Crawl4AI can extract only the most relevant content from complex pages.
120 |
121 | * * *
122 |
123 | This outline explains how to handle dynamic content and JavaScript-based interactions effectively, enabling users to scrape and interact with complex, modern websites.
124 |
125 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/6 magic mode - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 6: Magic Mode and Anti-Bot Protection
4 |
5 | ### Quick Intro
6 |
7 | Highlight `Magic Mode` and anti-bot features like user simulation, navigator overrides, and timing randomization. Demo: Access a site with anti-bot protection and show how `Magic Mode` seamlessly handles it.
8 |
9 | Here’s a concise outline for the **Magic Mode and Anti-Bot Protection** video:
10 |
11 | * * *
12 |
13 | ### **Magic Mode & Anti-Bot Protection**
14 |
15 | 1) **Why Anti-Bot Protection is Important**:
16 |
17 | - Many websites use bot detection mechanisms to block automated scraping. Crawl4AI’s anti-detection features help avoid IP bans, CAPTCHAs, and access restrictions.
18 | - **Magic Mode** is a one-step solution to enable a range of anti-bot features without complex configuration.
19 |
20 | 2) **Enabling Magic Mode**:
21 |
22 | - Simply set `magic=True` to activate Crawl4AI’s full anti-bot suite:
23 |
24 |
25 |
26 | ```hljs python
27 | result = await crawler.arun(
28 | url="https://example.com",
29 | magic=True # Enables all anti-detection features
30 | )
31 |
32 | ```
33 |
34 | - This enables a blend of stealth techniques, including masking automation signals, randomizing timings, and simulating real user behavior.
35 |
36 | 3) **What Magic Mode Does Behind the Scenes**:
37 |
38 | - **User Simulation**: Mimics human actions like mouse movements and scrolling.
39 | - **Navigator Overrides**: Hides signals that indicate an automated browser.
40 | - **Timing Randomization**: Adds random delays to simulate natural interaction patterns.
41 | - **Cookie Handling**: Accepts and manages cookies dynamically to avoid triggers from cookie pop-ups.
42 |
43 | 4) **Manual Anti-Bot Options (If Not Using Magic Mode)**:
44 |
45 | - For granular control, you can configure individual settings without Magic Mode:
46 |
47 |
48 |
49 | ```hljs python
50 | result = await crawler.arun(
51 | url="https://example.com",
52 | simulate_user=True, # Enables human-like behavior
53 | override_navigator=True # Hides automation fingerprints
54 | )
55 |
56 | ```
57 |
58 | - **Use Cases**: This approach allows more specific adjustments when certain anti-bot features are needed but others are not.
59 |
60 | 5) **Combining Proxies with Magic Mode**:
61 |
62 | - To avoid rate limits or IP blocks, combine Magic Mode with a proxy:
63 |
64 |
65 |
66 | ```hljs csharp
67 | async with AsyncWebCrawler(
68 | proxy="http://proxy.example.com:8080",
69 | headers={"Accept-Language": "en-US"}
70 | ) as crawler:
71 | result = await crawler.arun(
72 | url="https://example.com",
73 | magic=True # Full anti-detection
74 | )
75 |
76 | ```
77 |
78 | - This setup maximizes stealth by pairing anti-bot detection with IP obfuscation.
79 |
80 | 6) **Example of Anti-Bot Protection in Action**:
81 |
82 | - Full example with Magic Mode and proxies to scrape a protected page:
83 |
84 |
85 |
86 | ```hljs python
87 | async with AsyncWebCrawler() as crawler:
88 | result = await crawler.arun(
89 | url="https://example.com/protected-content",
90 | magic=True,
91 | proxy="http://proxy.example.com:8080",
92 | wait_for="css:.content-loaded" # Wait for the main content to load
93 | )
94 | print(result.markdown[:500]) # Display first 500 characters of the content
95 |
96 | ```
97 |
98 | - This example ensures seamless access to protected content by combining anti-detection and waiting for full content load.
99 |
100 | 7) **Wrap Up & Next Steps**:
101 |
102 | - Recap the power of Magic Mode and anti-bot features for handling restricted websites.
103 | - Tease the next video: **Content Cleaning and Fit Markdown** to show how to extract clean and focused content from a page.
104 |
105 | * * *
106 |
107 | This outline shows users how to easily avoid bot detection and access restricted content, demonstrating both the power and simplicity of Magic Mode in Crawl4AI.
108 |
109 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/7 content cleaning - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 7: Content Cleaning and Fit Markdown
4 |
5 | ### Quick Intro
6 |
7 | Explain content cleaning options, including `fit_markdown` to keep only the most relevant content. Demo: Extract and compare regular vs. fit markdown from a news site or blog.
8 |
9 | Here’s a streamlined outline for the **Content Cleaning and Fit Markdown** video:
10 |
11 | * * *
12 |
13 | ### **Content Cleaning & Fit Markdown**
14 |
15 | 1) **Overview of Content Cleaning in Crawl4AI**:
16 |
17 | - Explain that web pages often include extra elements like ads, navigation bars, footers, and popups.
18 | - Crawl4AI’s content cleaning features help extract only the main content, reducing noise and enhancing readability.
19 |
20 | 2) **Basic Content Cleaning Options**:
21 |
22 | - **Removing Unwanted Elements**: Exclude specific HTML tags, like forms or navigation bars:
23 |
24 |
25 |
26 | ```hljs python
27 | result = await crawler.arun(
28 | url="https://example.com",
29 | word_count_threshold=10, # Filter out blocks with fewer than 10 words
30 | excluded_tags=['form', 'nav'], # Exclude specific tags
31 | remove_overlay_elements=True # Remove popups and modals
32 | )
33 |
34 | ```
35 |
36 | - This example extracts content while excluding forms, navigation, and modal overlays, ensuring clean results.
37 |
38 | 3) **Fit Markdown for Main Content Extraction**:
39 |
40 | - **What is Fit Markdown**: Uses advanced analysis to identify the most relevant content (ideal for articles, blogs, and documentation).
41 | - **How it Works**: Analyzes content density, removes boilerplate elements, and maintains formatting for a clear output.
42 | - **Example**:
43 |
44 |
45 |
46 | ```hljs makefile
47 | result = await crawler.arun(url="https://example.com")
48 | main_content = result.fit_markdown # Extracted main content
49 | print(main_content[:500]) # Display first 500 characters
50 |
51 | ```
52 |
53 | - Fit Markdown is especially helpful for long-form content like news articles or blog posts.
54 |
55 | 4) **Comparing Fit Markdown with Regular Markdown**:
56 |
57 | - **Fit Markdown** returns the primary content without extraneous elements.
58 | - **Regular Markdown** includes all extracted text in markdown format.
59 | - Example to show the difference:
60 |
61 |
62 |
63 | ```hljs python
64 | all_content = result.markdown # Full markdown
65 | main_content = result.fit_markdown # Only the main content
66 |
67 | print(f"All Content Length: {len(all_content)}")
68 | print(f"Main Content Length: {len(main_content)}")
69 |
70 | ```
71 |
72 | - This comparison shows the effectiveness of Fit Markdown in focusing on essential content.
73 |
74 | 5) **Media and Metadata Handling with Content Cleaning**:
75 |
76 | - **Media Extraction**: Crawl4AI captures images and videos with metadata like alt text, descriptions, and relevance scores:
77 |
78 |
79 |
80 | ```hljs python
81 | for image in result.media["images"]:
82 | print(f"Source: {image['src']}, Alt Text: {image['alt']}, Relevance Score: {image['score']}")
83 |
84 | ```
85 |
86 | - **Use Case**: Useful for saving only relevant images or videos from an article or content-heavy page.
87 |
88 | 6) **Example of Clean Content Extraction in Action**:
89 |
90 | - Full example extracting cleaned content and Fit Markdown:
91 |
92 |
93 |
94 | ```hljs csharp
95 | async with AsyncWebCrawler() as crawler:
96 | result = await crawler.arun(
97 | url="https://example.com",
98 | word_count_threshold=10,
99 | excluded_tags=['nav', 'footer'],
100 | remove_overlay_elements=True
101 | )
102 | print(result.fit_markdown[:500]) # Show main content
103 |
104 | ```
105 |
106 | - This example demonstrates content cleaning with settings for filtering noise and focusing on the core text.
107 |
108 | 7) **Wrap Up & Next Steps**:
109 |
110 | - Summarize the power of Crawl4AI’s content cleaning features and Fit Markdown for capturing clean, relevant content.
111 | - Tease the next video: **Link Analysis and Smart Filtering** to focus on analyzing and filtering links within crawled pages.
112 |
113 | * * *
114 |
115 | This outline covers Crawl4AI’s content cleaning features and the unique benefits of Fit Markdown, showing users how to retrieve focused, high-quality content from web pages.
116 |
117 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/8 media handling - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 8: Media Handling: Images, Videos, and Audio
4 |
5 | ### Quick Intro
6 |
7 | Showcase Crawl4AI’s media extraction capabilities, including lazy-loaded media and metadata. Demo: Crawl a multimedia page, extract images, and show metadata (alt text, context, relevance score).
8 |
9 | Here’s a clear and focused outline for the **Media Handling: Images, Videos, and Audio** video:
10 |
11 | * * *
12 |
13 | ### **Media Handling: Images, Videos, and Audio**
14 |
15 | 1) **Overview of Media Extraction in Crawl4AI**:
16 |
17 | - Crawl4AI can detect and extract different types of media (images, videos, and audio) along with useful metadata.
18 | - This functionality is essential for gathering visual content from multimedia-heavy pages like e-commerce sites, news articles, and social media feeds.
19 |
20 | 2) **Image Extraction and Metadata**:
21 |
22 | - Crawl4AI captures images with detailed metadata, including:
23 | - **Source URL**: The direct URL to the image.
24 | - **Alt Text**: Image description if available.
25 | - **Relevance Score**: A score (0–10) indicating how relevant the image is to the main content.
26 | - **Context**: Text surrounding the image on the page.
27 | - **Example**:
28 |
29 |
30 |
31 | ```hljs python
32 | result = await crawler.arun(url="https://example.com")
33 |
34 | for image in result.media["images"]:
35 | print(f"Source: {image['src']}")
36 | print(f"Alt Text: {image['alt']}")
37 | print(f"Relevance Score: {image['score']}")
38 | print(f"Context: {image['context']}")
39 |
40 | ```
41 |
42 | - This example shows how to access each image’s metadata, making it easy to filter for the most relevant visuals.
43 |
44 | 3) **Handling Lazy-Loaded Images**:
45 |
46 | - Crawl4AI automatically supports lazy-loaded images, which are commonly used to optimize webpage loading.
47 | - **Example with Wait for Lazy-Loaded Content**:
48 |
49 |
50 |
51 | ```hljs makefile
52 | result = await crawler.arun(
53 | url="https://example.com",
54 | wait_for="css:img[data-src]", # Wait for lazy-loaded images
55 | delay_before_return_html=2.0 # Allow extra time for images to load
56 | )
57 |
58 | ```
59 |
60 | - This setup waits for lazy-loaded images to appear, ensuring they are fully captured.
61 |
62 | 4) **Video Extraction and Metadata**:
63 |
64 | - Crawl4AI captures video elements, including:
65 | - **Source URL**: The video’s direct URL.
66 | - **Type**: Format of the video (e.g., MP4).
67 | - **Thumbnail**: A poster or thumbnail image if available.
68 | - **Duration**: Video length, if metadata is provided.
69 | - **Example**:
70 |
71 |
72 |
73 | ```hljs python
74 | for video in result.media["videos"]:
75 | print(f"Video Source: {video['src']}")
76 | print(f"Type: {video['type']}")
77 | print(f"Thumbnail: {video.get('poster')}")
78 | print(f"Duration: {video.get('duration')}")
79 |
80 | ```
81 |
82 | - This allows users to gather video content and relevant details for further processing or analysis.
83 |
84 | 5) **Audio Extraction and Metadata**:
85 |
86 | - Audio elements can also be extracted, with metadata like:
87 | - **Source URL**: The audio file’s direct URL.
88 | - **Type**: Format of the audio file (e.g., MP3).
89 | - **Duration**: Length of the audio, if available.
90 | - **Example**:
91 |
92 |
93 |
94 | ```hljs python
95 | for audio in result.media["audios"]:
96 | print(f"Audio Source: {audio['src']}")
97 | print(f"Type: {audio['type']}")
98 | print(f"Duration: {audio.get('duration')}")
99 |
100 | ```
101 |
102 | - Useful for sites with podcasts, sound bites, or other audio content.
103 |
104 | 6) **Filtering Media by Relevance**:
105 |
106 | - Use metadata like relevance score to filter only the most useful media content:
107 |
108 |
109 |
110 | ```hljs ini
111 | relevant_images = [img for img in result.media["images"] if img['score'] > 5]
112 |
113 | ```
114 |
115 | - This is especially helpful for content-heavy pages where you only want media directly related to the main content.
116 |
117 | 7) **Example: Full Media Extraction with Content Filtering**:
118 |
119 | - Full example extracting images, videos, and audio along with filtering by relevance:
120 |
121 |
122 |
123 | ```hljs python
124 | async with AsyncWebCrawler() as crawler:
125 | result = await crawler.arun(
126 | url="https://example.com",
127 | word_count_threshold=10, # Filter content blocks for relevance
128 | exclude_external_images=True # Only keep internal images
129 | )
130 |     # Filter images by relevance score, then display media summaries
131 |     relevant_images = [img for img in result.media["images"] if img['score'] > 5]
132 | print(f"Relevant Images: {len(relevant_images)}")
133 | print(f"Videos: {len(result.media['videos'])}")
134 | print(f"Audio Clips: {len(result.media['audios'])}")
135 |
136 | ```
137 |
138 | - This example shows how to capture and filter various media types, focusing on what’s most relevant.
139 |
140 | 8) **Wrap Up & Next Steps**:
141 |
142 | - Recap the comprehensive media extraction capabilities, emphasizing how metadata helps users focus on relevant content.
143 | - Tease the next video: **Link Analysis and Smart Filtering** to explore how Crawl4AI handles internal, external, and social media links for more focused data gathering.
144 |
145 | * * *
146 |
147 | This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction.
148 |
149 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/9 link analysis - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | ## Episode 9: Link Analysis and Smart Filtering
4 |
5 | ### Quick Intro
6 |
7 | Walk through internal and external link classification, social media link filtering, and custom domain exclusion. Demo: Analyze links on a website, focusing on internal navigation vs. external or ad links.
8 |
9 | Here’s a focused outline for the **Link Analysis and Smart Filtering** video:
10 |
11 | * * *
12 |
13 | ### **Link Analysis & Smart Filtering**
14 |
15 | 1) **Importance of Link Analysis in Web Crawling**:
16 |
17 | - Explain that web pages often contain numerous links, including internal links, external links, social media links, and ads.
18 | - Crawl4AI’s link analysis and filtering options help extract only relevant links, enabling more targeted and efficient crawls.
19 |
20 | 2) **Automatic Link Classification**:
21 |
22 | - Crawl4AI categorizes links automatically into internal, external, and social media links.
23 | - **Example**:
24 |
25 |
26 |
27 | ```hljs makefile
28 | result = await crawler.arun(url="https://example.com")
29 |
30 | # Access internal and external links
31 | internal_links = result.links["internal"]
32 | external_links = result.links["external"]
33 |
34 | # Print first few links for each type
35 | print("Internal Links:", internal_links[:3])
36 | print("External Links:", external_links[:3])
37 |
38 | ```
39 |
40 |
41 | 3) **Filtering Out Unwanted Links**:
42 |
43 | - **Exclude External Links**: Remove all links pointing to external sites.
44 | - **Exclude Social Media Links**: Filter out social media domains like Facebook or Twitter.
45 | - **Example**:
46 |
47 |
48 |
49 | ```hljs python
50 | result = await crawler.arun(
51 | url="https://example.com",
52 | exclude_external_links=True, # Remove external links
53 | exclude_social_media_links=True # Remove social media links
54 | )
55 |
56 | ```
57 |
58 |
59 | 4) **Custom Domain Filtering**:
60 |
61 | - **Exclude Specific Domains**: Filter links from particular domains, e.g., ad sites.
62 | - **Custom Social Media Domains**: Add additional social media domains if needed.
63 | - **Example**:
64 |
65 |
66 |
67 | ```hljs csharp
68 | result = await crawler.arun(
69 | url="https://example.com",
70 | exclude_domains=["ads.com", "trackers.com"],
71 | exclude_social_media_domains=["facebook.com", "linkedin.com"]
72 | )
73 |
74 | ```
75 |
76 |
77 | 5) **Accessing Link Context and Metadata**:
78 |
79 | - Crawl4AI provides additional metadata for each link, including its text, type (e.g., navigation or content), and surrounding context.
80 | - **Example**:
81 |
82 |
83 |
84 | ```hljs python
85 | for link in result.links["internal"]:
86 | print(f"Link: {link['href']}, Text: {link['text']}, Context: {link['context']}")
87 |
88 | ```
89 |
90 | - **Use Case**: Helps users understand the relevance of links based on where they are placed on the page (e.g., navigation vs. article content).
91 |
92 | 6) **Example of Comprehensive Link Filtering and Analysis**:
93 |
94 | - Full example combining link filtering, metadata access, and contextual information:
95 |
96 |
97 |
98 | ```hljs python
99 | async with AsyncWebCrawler() as crawler:
100 | result = await crawler.arun(
101 | url="https://example.com",
102 | exclude_external_links=True,
103 | exclude_social_media_links=True,
104 | exclude_domains=["ads.com"],
105 | css_selector=".main-content" # Focus only on main content area
106 | )
107 | for link in result.links["internal"]:
108 | print(f"Internal Link: {link['href']}, Text: {link['text']}, Context: {link['context']}")
109 |
110 | ```
111 |
112 | - This example filters unnecessary links, keeping only internal and relevant links from the main content area.
113 |
114 | 7) **Wrap Up & Next Steps**:
115 |
116 | - Summarize the benefits of link filtering for efficient crawling and relevant content extraction.
117 | - Tease the next video: **Custom Headers, Identity Management, and User Simulation** to explain how to configure identity settings and simulate user behavior for stealthier crawls.
118 |
119 | * * *
120 |
121 | This outline provides a practical overview of Crawl4AI’s link analysis and filtering features, helping users target only essential links while eliminating distractions.
122 |
123 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/asyncwebcrawlerarun - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Complete Parameter Guide for arun()
2 |
3 | The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality.
4 |
5 | ## Core Parameters
6 |
7 | ```hljs graphql
8 | await crawler.arun(
9 | url="https://example.com", # Required: URL to crawl
10 | verbose=True, # Enable detailed logging
11 | bypass_cache=False, # Skip cache for this request
12 | warmup=True # Whether to run warmup check
13 | )
14 |
15 | ```
16 |
17 | ## Content Processing Parameters
18 |
19 | ### Text Processing
20 |
21 | ```hljs python
22 | await crawler.arun(
23 | word_count_threshold=10, # Minimum words per content block
24 | image_description_min_word_threshold=5, # Minimum words for image descriptions
25 | only_text=False, # Extract only text content
26 | excluded_tags=['form', 'nav'], # HTML tags to exclude
27 | keep_data_attributes=False, # Preserve data-* attributes
28 | )
29 |
30 | ```
31 |
32 | ### Content Selection
33 |
34 | ```hljs python
35 | await crawler.arun(
36 | css_selector=".main-content", # CSS selector for content extraction
37 | remove_forms=True, # Remove all form elements
38 | remove_overlay_elements=True, # Remove popups/modals/overlays
39 | )
40 |
41 | ```
42 |
43 | ### Link Handling
44 |
45 | ```hljs python
46 | await crawler.arun(
47 | exclude_external_links=True, # Remove external links
48 | exclude_social_media_links=True, # Remove social media links
49 | exclude_external_images=True, # Remove external images
50 | exclude_domains=["ads.example.com"], # Specific domains to exclude
51 | social_media_domains=[ # Additional social media domains\
52 | "facebook.com",\
53 | "twitter.com",\
54 | "instagram.com"\
55 | ]
56 | )
57 |
58 | ```
59 |
60 | ## Browser Control Parameters
61 |
62 | ### Basic Browser Settings
63 |
64 | ```hljs python
65 | await crawler.arun(
66 | headless=True, # Run browser in headless mode
67 | browser_type="chromium", # Browser engine: "chromium", "firefox", "webkit"
68 | page_timeout=60000, # Page load timeout in milliseconds
69 | user_agent="custom-agent", # Custom user agent
70 | )
71 |
72 | ```
73 |
74 | ### Navigation and Waiting
75 |
76 | ```hljs python
77 | await crawler.arun(
78 | wait_for="css:.dynamic-content", # Wait for element/condition
79 | delay_before_return_html=2.0, # Wait before returning HTML (seconds)
80 | )
81 |
82 | ```
83 |
84 | ### JavaScript Execution
85 |
86 | ```hljs graphql
87 | await crawler.arun(
88 | js_code=[ # JavaScript to execute (string or list)\
89 | "window.scrollTo(0, document.body.scrollHeight);",\
90 | "document.querySelector('.load-more').click();"\
91 | ],
92 | js_only=False, # Only execute JavaScript without reloading page
93 | )
94 |
95 | ```
96 |
97 | ### Anti-Bot Features
98 |
99 | ```hljs python
100 | await crawler.arun(
101 | magic=True, # Enable all anti-detection features
102 | simulate_user=True, # Simulate human behavior
103 | override_navigator=True # Override navigator properties
104 | )
105 |
106 | ```
107 |
108 | ### Session Management
109 |
110 | ```hljs python
111 | await crawler.arun(
112 | session_id="my_session", # Session identifier for persistent browsing
113 | )
114 |
115 | ```
116 |
117 | ### Screenshot Options
118 |
119 | ```hljs python
120 | await crawler.arun(
121 | screenshot=True, # Take page screenshot
122 | screenshot_wait_for=2.0, # Wait before screenshot (seconds)
123 | )
124 |
125 | ```
126 |
127 | ### Proxy Configuration
128 |
129 | ```hljs csharp
130 | await crawler.arun(
131 | proxy="http://proxy.example.com:8080", # Simple proxy URL
132 | proxy_config={ # Advanced proxy settings
133 | "server": "http://proxy.example.com:8080",
134 | "username": "user",
135 | "password": "pass"
136 | }
137 | )
138 |
139 | ```
140 |
141 | ## Content Extraction Parameters
142 |
143 | ### Extraction Strategy
144 |
145 | ```hljs graphql
146 | await crawler.arun(
147 | extraction_strategy=LLMExtractionStrategy(
148 | provider="ollama/llama2",
149 | schema=MySchema.schema(),
150 | instruction="Extract specific data"
151 | )
152 | )
153 |
154 | ```
155 |
156 | ### Chunking Strategy
157 |
158 | ```hljs python
159 | await crawler.arun(
160 | chunking_strategy=RegexChunking(
161 | patterns=[r'\n\n', r'\.\s+']
162 | )
163 | )
164 |
165 | ```
166 |
167 | ### HTML to Text Options
168 |
169 | ```hljs python
170 | await crawler.arun(
171 | html2text={
172 | "ignore_links": False,
173 | "ignore_images": False,
174 | "escape_dot": False,
175 | "body_width": 0,
176 | "protect_links": True,
177 | "unicode_snob": True
178 | }
179 | )
180 |
181 | ```
182 |
183 | ## Debug Options
184 |
185 | ```hljs python
186 | await crawler.arun(
187 | log_console=True, # Log browser console messages
188 | )
189 |
190 | ```
191 |
192 | ## Parameter Interactions and Notes
193 |
194 | 1. **Magic Mode Combinations**
195 |
196 |
197 |
198 | ```hljs python
199 | # Full anti-detection setup
200 | await crawler.arun(
201 | magic=True,
202 | headless=False,
203 | simulate_user=True,
204 | override_navigator=True
205 | )
206 |
207 | ```
208 |
209 | 2. **Dynamic Content Handling**
210 |
211 |
212 |
213 | ```hljs csharp
214 | # Handle lazy-loaded content
215 | await crawler.arun(
216 | js_code="window.scrollTo(0, document.body.scrollHeight);",
217 | wait_for="css:.lazy-content",
218 | delay_before_return_html=2.0
219 | )
220 |
221 | ```
222 |
223 | 3. **Content Extraction Pipeline**
224 |
225 |
226 |
227 | ```hljs python
228 | # Complete extraction setup
229 | await crawler.arun(
230 | css_selector=".main-content",
231 | word_count_threshold=20,
232 | extraction_strategy=my_strategy,
233 | chunking_strategy=my_chunking,
234 | process_iframes=True,
235 | remove_overlay_elements=True
236 | )
237 |
238 | ```
239 |
240 |
241 | ## Best Practices
242 |
243 | 1. **Performance Optimization**
244 |
245 |
246 |
247 | ```hljs python
248 | await crawler.arun(
249 | bypass_cache=False, # Use cache when possible
250 | word_count_threshold=10, # Filter out noise
251 | process_iframes=False # Skip iframes if not needed
252 | )
253 |
254 | ```
255 |
256 | 2. **Reliable Scraping**
257 |
258 |
259 |
260 | ```hljs python
261 | await crawler.arun(
262 | magic=True, # Enable anti-detection
263 | delay_before_return_html=1.0, # Wait for dynamic content
264 | page_timeout=60000 # Longer timeout for slow pages
265 | )
266 |
267 | ```
268 |
269 | 3. **Clean Content**
270 |
271 |
272 |
273 | ```hljs python
274 | await crawler.arun(
275 | remove_overlay_elements=True, # Remove popups
276 |     excluded_tags=['nav', 'aside'], # Remove unnecessary elements
277 | keep_data_attributes=False # Remove data attributes
278 | )
279 |
280 | ```
281 |
282 |
283 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/browser configuration - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Browser Configuration
2 |
3 | Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior.
4 |
5 | ## Browser Types
6 |
7 | Choose from three browser engines:
8 |
9 | ```hljs csharp
10 | # Chromium (default)
11 | async with AsyncWebCrawler(browser_type="chromium") as crawler:
12 | result = await crawler.arun(url="https://example.com")
13 |
14 | # Firefox
15 | async with AsyncWebCrawler(browser_type="firefox") as crawler:
16 | result = await crawler.arun(url="https://example.com")
17 |
18 | # WebKit
19 | async with AsyncWebCrawler(browser_type="webkit") as crawler:
20 | result = await crawler.arun(url="https://example.com")
21 |
22 | ```
23 |
24 | ## Basic Configuration
25 |
26 | Common browser settings:
27 |
28 | ```hljs python
29 | async with AsyncWebCrawler(
30 | headless=True, # Run in headless mode (no GUI)
31 | verbose=True, # Enable detailed logging
32 | sleep_on_close=False # No delay when closing browser
33 | ) as crawler:
34 | result = await crawler.arun(url="https://example.com")
35 |
36 | ```
37 |
38 | ## Identity Management
39 |
40 | Control how your crawler appears to websites:
41 |
42 | ```hljs csharp
43 | # Custom user agent
44 | async with AsyncWebCrawler(
45 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
46 | ) as crawler:
47 | result = await crawler.arun(url="https://example.com")
48 |
49 | # Custom headers
50 | headers = {
51 | "Accept-Language": "en-US,en;q=0.9",
52 | "Cache-Control": "no-cache"
53 | }
54 | async with AsyncWebCrawler(headers=headers) as crawler:
55 | result = await crawler.arun(url="https://example.com")
56 |
57 | ```
58 |
59 | ## Screenshot Capabilities
60 |
61 | Capture page screenshots with enhanced error handling:
62 |
63 | ```hljs python
64 | result = await crawler.arun(
65 | url="https://example.com",
66 | screenshot=True, # Enable screenshot
67 | screenshot_wait_for=2.0 # Wait 2 seconds before capture
68 | )
69 |
70 | if result.screenshot: # Base64 encoded image
71 | import base64
72 | with open("screenshot.png", "wb") as f:
73 | f.write(base64.b64decode(result.screenshot))
74 |
75 | ```
76 |
77 | ## Timeouts and Waiting
78 |
79 | Control page loading behavior:
80 |
81 | ```hljs makefile
82 | result = await crawler.arun(
83 | url="https://example.com",
84 | page_timeout=60000, # Page load timeout (ms)
85 | delay_before_return_html=2.0, # Wait before content capture
86 | wait_for="css:.dynamic-content" # Wait for specific element
87 | )
88 |
89 | ```
90 |
91 | ## JavaScript Execution
92 |
93 | Execute custom JavaScript before crawling:
94 |
95 | ```hljs makefile
96 | # Single JavaScript command
97 | result = await crawler.arun(
98 | url="https://example.com",
99 | js_code="window.scrollTo(0, document.body.scrollHeight);"
100 | )
101 |
102 | # Multiple commands
103 | js_commands = [\
104 | "window.scrollTo(0, document.body.scrollHeight);",\
105 | "document.querySelector('.load-more').click();"\
106 | ]
107 | result = await crawler.arun(
108 | url="https://example.com",
109 | js_code=js_commands
110 | )
111 |
112 | ```
113 |
114 | ## Proxy Configuration
115 |
116 | Use proxies for enhanced access:
117 |
118 | ```hljs csharp
119 | # Simple proxy
120 | async with AsyncWebCrawler(
121 | proxy="http://proxy.example.com:8080"
122 | ) as crawler:
123 | result = await crawler.arun(url="https://example.com")
124 |
125 | # Proxy with authentication
126 | proxy_config = {
127 | "server": "http://proxy.example.com:8080",
128 | "username": "user",
129 | "password": "pass"
130 | }
131 | async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
132 | result = await crawler.arun(url="https://example.com")
133 |
134 | ```
135 |
136 | ## Anti-Detection Features
137 |
138 | Enable stealth features to avoid bot detection:
139 |
140 | ```hljs python
141 | result = await crawler.arun(
142 | url="https://example.com",
143 | simulate_user=True, # Simulate human behavior
144 | override_navigator=True, # Mask automation signals
145 | magic=True # Enable all anti-detection features
146 | )
147 |
148 | ```
149 |
150 | ## Handling Dynamic Content
151 |
152 | Configure browser to handle dynamic content:
153 |
154 | ```hljs python
155 | # Wait for dynamic content
156 | result = await crawler.arun(
157 | url="https://example.com",
158 | wait_for="js:() => document.querySelector('.content').children.length > 10",
159 | process_iframes=True # Process iframe content
160 | )
161 |
162 | # Handle lazy-loaded images
163 | result = await crawler.arun(
164 | url="https://example.com",
165 | js_code="window.scrollTo(0, document.body.scrollHeight);",
166 | delay_before_return_html=2.0 # Wait for images to load
167 | )
168 |
169 | ```
170 |
171 | ## Comprehensive Example
172 |
173 | Here's how to combine various browser configurations:
174 |
175 | ```hljs python
176 | async def crawl_with_advanced_config(url: str):
177 | async with AsyncWebCrawler(
178 | # Browser setup
179 | browser_type="chromium",
180 | headless=True,
181 | verbose=True,
182 |
183 | # Identity
184 | user_agent="Custom User Agent",
185 | headers={"Accept-Language": "en-US"},
186 |
187 | # Proxy setup
188 | proxy="http://proxy.example.com:8080"
189 | ) as crawler:
190 | result = await crawler.arun(
191 | url=url,
192 | # Content handling
193 | process_iframes=True,
194 | screenshot=True,
195 |
196 | # Timing
197 | page_timeout=60000,
198 | delay_before_return_html=2.0,
199 |
200 | # Anti-detection
201 | magic=True,
202 | simulate_user=True,
203 |
204 | # Dynamic content
205 | js_code=[\
206 | "window.scrollTo(0, document.body.scrollHeight);",\
207 | "document.querySelector('.load-more')?.click();"\
208 | ],
209 | wait_for="css:.dynamic-content"
210 | )
211 |
212 | return {
213 | "content": result.markdown,
214 | "screenshot": result.screenshot,
215 | "success": result.success
216 | }
217 |
218 | ```
219 |
220 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/chunking - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | ## Chunking Strategies 📚
2 |
3 | Crawl4AI provides several powerful chunking strategies to divide text into manageable parts for further processing. Each strategy has unique characteristics and is suitable for different scenarios. Let's explore them one by one.
4 |
5 | ### RegexChunking
6 |
7 | `RegexChunking` splits text using regular expressions. This is ideal for creating chunks based on specific patterns like paragraphs or sentences.
8 |
9 | #### When to Use
10 |
11 | - Great for structured text with consistent delimiters.
12 | - Suitable for documents where specific patterns (e.g., double newlines, periods) indicate logical chunks.
13 |
14 | #### Parameters
15 |
16 | - `patterns` (list, optional): Regular expressions used to split the text. Default is to split by double newlines ( `['\n\n']`).
17 |
18 | #### Example
19 |
20 | ```hljs python
21 | from crawl4ai.chunking_strategy import RegexChunking
22 |
23 | # Define patterns for splitting text
24 | patterns = [r'\n\n', r'\. ']
25 | chunker = RegexChunking(patterns=patterns)
26 |
27 | # Sample text
28 | text = "This is a sample text. It will be split into chunks.\n\nThis is another paragraph."
29 |
30 | # Chunk the text
31 | chunks = chunker.chunk(text)
32 | print(chunks)
33 |
34 | ```
35 |
36 | ### NlpSentenceChunking
37 |
38 | `NlpSentenceChunking` uses NLP models to split text into sentences, ensuring accurate sentence boundaries.
39 |
40 | #### When to Use
41 |
42 | - Ideal for texts where sentence boundaries are crucial.
43 | - Useful for creating chunks that preserve grammatical structures.
44 |
45 | #### Parameters
46 |
47 | - None.
48 |
49 | #### Example
50 |
51 | ```hljs makefile
52 | from crawl4ai.chunking_strategy import NlpSentenceChunking
53 |
54 | chunker = NlpSentenceChunking()
55 |
56 | # Sample text
57 | text = "This is a sample text. It will be split into sentences. Here's another sentence."
58 |
59 | # Chunk the text
60 | chunks = chunker.chunk(text)
61 | print(chunks)
62 |
63 | ```
64 |
65 | ### TopicSegmentationChunking
66 |
67 | `TopicSegmentationChunking` employs the TextTiling algorithm to segment text into topic-based chunks. This method identifies thematic boundaries.
68 |
69 | #### When to Use
70 |
71 | - Perfect for long documents with distinct topics.
72 | - Useful when preserving topic continuity is more important than maintaining text order.
73 |
74 | #### Parameters
75 |
76 | - `num_keywords` (int, optional): Number of keywords for each topic segment. Default is `3`.
77 |
78 | #### Example
79 |
80 | ```hljs python
81 | from crawl4ai.chunking_strategy import TopicSegmentationChunking
82 |
83 | chunker = TopicSegmentationChunking(num_keywords=3)
84 |
85 | # Sample text
86 | text = "This document contains several topics. Topic one discusses AI. Topic two covers machine learning."
87 |
88 | # Chunk the text
89 | chunks = chunker.chunk(text)
90 | print(chunks)
91 |
92 | ```
93 |
94 | ### FixedLengthWordChunking
95 |
96 | `FixedLengthWordChunking` splits text into chunks based on a fixed number of words. This ensures each chunk has approximately the same length.
97 |
98 | #### When to Use
99 |
100 | - Suitable for processing large texts where uniform chunk size is important.
101 | - Useful when the number of words per chunk needs to be controlled.
102 |
103 | #### Parameters
104 |
105 | - `chunk_size` (int, optional): Number of words per chunk. Default is `100`.
106 |
107 | #### Example
108 |
109 | ```hljs python
110 | from crawl4ai.chunking_strategy import FixedLengthWordChunking
111 |
112 | chunker = FixedLengthWordChunking(chunk_size=10)
113 |
114 | # Sample text
115 | text = "This is a sample text. It will be split into chunks of fixed length."
116 |
117 | # Chunk the text
118 | chunks = chunker.chunk(text)
119 | print(chunks)
120 |
121 | ```
122 |
123 | ### SlidingWindowChunking
124 |
125 | `SlidingWindowChunking` uses a sliding window approach to create overlapping chunks. Each chunk has a fixed length, and the window slides by a specified step size.
126 |
127 | #### When to Use
128 |
129 | - Ideal for creating overlapping chunks to preserve context.
130 | - Useful for tasks where context from adjacent chunks is needed.
131 |
132 | #### Parameters
133 |
134 | - `window_size` (int, optional): Number of words in each chunk. Default is `100`.
135 | - `step` (int, optional): Number of words to slide the window. Default is `50`.
136 |
137 | #### Example
138 |
139 | ```hljs python
140 | from crawl4ai.chunking_strategy import SlidingWindowChunking
141 |
142 | chunker = SlidingWindowChunking(window_size=10, step=5)
143 |
144 | # Sample text
145 | text = "This is a sample text. It will be split using a sliding window approach to preserve context."
146 |
147 | # Chunk the text
148 | chunks = chunker.chunk(text)
149 | print(chunks)
150 |
151 | ```
152 |
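Chunking can also be wired into a crawl itself rather than run on standalone text. Here is a minimal sketch, assuming `arun()` accepts a `chunking_strategy` argument in your version (the URL is a placeholder, and the chunks feed whatever extraction strategy you configure):

```hljs python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",  # placeholder URL
            # Assumed parameter: chunks produced here are handed to the
            # extraction strategy configured for the crawl, if any.
            chunking_strategy=RegexChunking(patterns=[r'\n\n'])
        )
        print(result.markdown[:300] if result.markdown else "No content")

asyncio.run(main())

```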
153 | With these chunking strategies, you can choose the best method to divide your text based on your specific needs. Whether you need precise sentence boundaries, topic-based segmentation, or uniform chunk sizes, Crawl4AI has you covered. Happy chunking! 📝✨
154 |
155 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/content processing - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Content Processing
2 |
3 | Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction.
4 |
5 | ## Content Cleaning
6 |
7 | ### Understanding Clean Content
8 |
9 | When crawling web pages, you often encounter a lot of noise - advertisements, navigation menus, footers, popups, and other irrelevant content. Crawl4AI automatically cleans this noise using several approaches:
10 |
11 | 1. **Basic Cleaning**: Removes unwanted HTML elements and attributes
12 | 2. **Content Relevance**: Identifies and preserves meaningful content blocks
13 | 3. **Layout Analysis**: Understands page structure to identify main content areas
14 |
15 | ```hljs python
16 | result = await crawler.arun(
17 | url="https://example.com",
18 | word_count_threshold=10, # Remove blocks with fewer words
19 | excluded_tags=['form', 'nav'], # Remove specific HTML tags
20 | remove_overlay_elements=True # Remove popups/modals
21 | )
22 |
23 | # Get clean content
24 | print(result.cleaned_html) # Cleaned HTML
25 | print(result.markdown) # Clean markdown version
26 |
27 | ```
28 |
29 | ### Fit Markdown: Smart Content Extraction
30 |
31 | One of Crawl4AI's most powerful features is `fit_markdown`. This feature uses advanced heuristics to identify and extract the main content from a webpage while excluding irrelevant elements.
32 |
33 | #### How Fit Markdown Works
34 |
35 | - Analyzes content density and distribution
36 | - Identifies content patterns and structures
37 | - Removes boilerplate content (headers, footers, sidebars)
38 | - Preserves the most relevant content blocks
39 | - Maintains content hierarchy and formatting
40 |
41 | #### Perfect For:
42 |
43 | - Blog posts and articles
44 | - News content
45 | - Documentation pages
46 | - Any page with a clear main content area
47 |
48 | #### Not Recommended For:
49 |
50 | - E-commerce product listings
51 | - Search results pages
52 | - Social media feeds
53 | - Pages with multiple equal-weight content sections
54 |
55 | ```hljs python
56 | result = await crawler.arun(url="https://example.com")
57 |
58 | # Get the most relevant content
59 | main_content = result.fit_markdown
60 |
61 | # Compare with regular markdown
62 | all_content = result.markdown
63 |
64 | print(f"Fit Markdown Length: {len(main_content)}")
65 | print(f"Regular Markdown Length: {len(all_content)}")
66 |
67 | ```
68 |
69 | #### Example Use Case
70 |
71 | ```hljs python
72 | async def extract_article_content(url: str) -> str:
73 | """Extract main article content from a blog or news site."""
74 | async with AsyncWebCrawler() as crawler:
75 | result = await crawler.arun(url=url)
76 |
77 | # fit_markdown will focus on the article content,
78 | # excluding navigation, ads, and other distractions
79 | return result.fit_markdown
80 |
81 | ```
82 |
83 | ## Media Processing
84 |
85 | Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
86 |
87 | ### Image Processing
88 |
89 | The library handles various image scenarios, including:
90 | - Regular images
91 | - Lazy-loaded images
92 | - Background images
93 | - Responsive images
94 | - Image metadata and context
95 |
96 | ```hljs python
97 | result = await crawler.arun(url="https://example.com")
98 |
99 | for image in result.media["images"]:
100 | # Each image includes rich metadata
101 | print(f"Source: {image['src']}")
102 | print(f"Alt text: {image['alt']}")
103 | print(f"Description: {image['desc']}")
104 | print(f"Context: {image['context']}") # Surrounding text
105 | print(f"Relevance score: {image['score']}") # 0-10 score
106 |
107 | ```
108 |
109 | ### Handling Lazy-Loaded Content
110 |
111 | Crawl4AI already handles lazy loading for media elements. You can also customize the wait time for lazy-loaded content:
112 |
113 | ```hljs python
114 | result = await crawler.arun(
115 | url="https://example.com",
116 | wait_for="css:img[data-src]", # Wait for lazy images
117 | delay_before_return_html=2.0 # Additional wait time
118 | )
119 |
120 | ```
121 |
122 | ### Video and Audio Content
123 |
124 | The library extracts video and audio elements with their metadata:
125 |
126 | ```hljs python
127 | # Process videos
128 | for video in result.media["videos"]:
129 | print(f"Video source: {video['src']}")
130 | print(f"Type: {video['type']}")
131 | print(f"Duration: {video.get('duration')}")
132 | print(f"Thumbnail: {video.get('poster')}")
133 |
134 | # Process audio
135 | for audio in result.media["audios"]:
136 | print(f"Audio source: {audio['src']}")
137 | print(f"Type: {audio['type']}")
138 | print(f"Duration: {audio.get('duration')}")
139 |
140 | ```
141 |
142 | ## Link Analysis
143 |
144 | Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns.
145 |
146 | ### Link Classification
147 |
148 | The library automatically categorizes links into:
149 | - Internal links (same domain)
150 | - External links (different domains)
151 | - Social media links
152 | - Navigation links
153 | - Content links
154 |
155 | ```hljs python
156 | result = await crawler.arun(url="https://example.com")
157 |
158 | # Analyze internal links
159 | for link in result.links["internal"]:
160 | print(f"Internal: {link['href']}")
161 | print(f"Link text: {link['text']}")
162 | print(f"Context: {link['context']}") # Surrounding text
163 | print(f"Type: {link['type']}") # nav, content, etc.
164 |
165 | # Analyze external links
166 | for link in result.links["external"]:
167 | print(f"External: {link['href']}")
168 | print(f"Domain: {link['domain']}")
169 | print(f"Type: {link['type']}")
170 |
171 | ```
172 |
173 | ### Smart Link Filtering
174 |
175 | Control which links are included in the results:
176 |
177 | ```hljs python
178 | result = await crawler.arun(
179 | url="https://example.com",
180 | exclude_external_links=True, # Remove external links
181 | exclude_social_media_links=True, # Remove social media links
182 | exclude_social_media_domains=[ # Custom social media domains\
183 | "facebook.com", "twitter.com", "instagram.com"\
184 | ],
185 | exclude_domains=["ads.example.com"] # Exclude specific domains
186 | )
187 |
188 | ```
189 |
190 | ## Metadata Extraction
191 |
192 | Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content:
193 |
194 | ```hljs python
195 | result = await crawler.arun(url="https://example.com")
196 |
197 | metadata = result.metadata
198 | print(f"Title: {metadata['title']}")
199 | print(f"Description: {metadata['description']}")
200 | print(f"Keywords: {metadata['keywords']}")
201 | print(f"Author: {metadata['author']}")
202 | print(f"Published Date: {metadata['published_date']}")
203 | print(f"Modified Date: {metadata['modified_date']}")
204 | print(f"Language: {metadata['language']}")
205 |
206 | ```
207 |
208 | ## Best Practices
209 |
210 | 1. **Use Fit Markdown for Articles**
211 |
212 |
213 |
214 | ```hljs python
215 | # Perfect for blog posts, news articles, documentation
216 | content = result.fit_markdown
217 |
218 | ```
219 |
220 | 2. **Handle Media Appropriately**
221 |
222 |
223 |
224 | ```hljs python
225 | # Filter by relevance score
226 | relevant_images = [\
227 | img for img in result.media["images"]\
228 | if img['score'] > 5\
229 | ]
230 |
231 | ```
232 |
233 | 3. **Combine Link Analysis with Content**
234 |
235 |
236 |
237 | ```hljs python
238 | # Get content links with context
239 | content_links = [\
240 | link for link in result.links["internal"]\
241 | if link['type'] == 'content'\
242 | ]
243 |
244 | ```
245 |
246 | 4. **Clean Content with Purpose**
247 |
248 |
249 |
250 | ```hljs python
251 | # Customize cleaning based on your needs
252 | result = await crawler.arun(
253 | url=url,
254 | word_count_threshold=20, # Adjust based on content type
255 | keep_data_attributes=False, # Remove data attributes
256 | process_iframes=True # Include iframe content
257 | )
258 |
259 | ```
260 |
261 |
262 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/content selection - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Content Selection
2 |
3 | Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.
4 |
5 | ## CSS Selectors
6 |
7 | The simplest way to extract specific content:
8 |
9 | ```hljs python
10 | # Extract specific content using CSS selector
11 | result = await crawler.arun(
12 | url="https://example.com",
13 | css_selector=".main-article" # Target main article content
14 | )
15 |
16 | # Multiple selectors
17 | result = await crawler.arun(
18 | url="https://example.com",
19 | css_selector="article h1, article .content" # Target heading and content
20 | )
21 |
22 | ```
23 |
24 | ## Content Filtering
25 |
26 | Control what content is included or excluded:
27 |
28 | ```hljs python
29 | result = await crawler.arun(
30 | url="https://example.com",
31 | # Content thresholds
32 | word_count_threshold=10, # Minimum words per block
33 |
34 | # Tag exclusions
35 | excluded_tags=['form', 'header', 'footer', 'nav'],
36 |
37 | # Link filtering
38 | exclude_external_links=True, # Remove external links
39 | exclude_social_media_links=True, # Remove social media links
40 |
41 | # Media filtering
42 | exclude_external_images=True # Remove external images
43 | )
44 |
45 | ```
46 |
47 | ## Iframe Content
48 |
49 | Process content inside iframes:
50 |
51 | ```hljs python
52 | result = await crawler.arun(
53 | url="https://example.com",
54 | process_iframes=True, # Extract iframe content
55 | remove_overlay_elements=True # Remove popups/modals that might block iframes
56 | )
57 |
58 | ```
59 |
60 | ## Structured Content Selection
61 |
62 | ### Using LLMs for Smart Selection
63 |
64 | Use LLMs to intelligently extract specific types of content:
65 |
66 | ```hljs python
67 | from pydantic import BaseModel
68 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
69 |
70 | class ArticleContent(BaseModel):
71 | title: str
72 | main_points: List[str]
73 | conclusion: str
74 |
75 | strategy = LLMExtractionStrategy(
76 | provider="ollama/nemotron", # Works with any supported LLM
77 | schema=ArticleContent.schema(),
78 | instruction="Extract the main article title, key points, and conclusion"
79 | )
80 |
81 | result = await crawler.arun(
82 | url="https://example.com",
83 | extraction_strategy=strategy
84 | )
85 | article = json.loads(result.extracted_content)
86 |
87 | ```
88 |
89 | ### Pattern-Based Selection
90 |
91 | For repeated content patterns (like product listings, news feeds):
92 |
93 | ```hljs python
94 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
95 |
96 | schema = {
97 | "name": "News Articles",
98 | "baseSelector": "article.news-item", # Repeated element
99 | "fields": [\
100 | {"name": "headline", "selector": "h2", "type": "text"},\
101 | {"name": "summary", "selector": ".summary", "type": "text"},\
102 | {"name": "category", "selector": ".category", "type": "text"},\
103 | {\
104 | "name": "metadata",\
105 | "type": "nested",\
106 | "fields": [\
107 | {"name": "author", "selector": ".author", "type": "text"},\
108 | {"name": "date", "selector": ".date", "type": "text"}\
109 | ]\
110 | }\
111 | ]
112 | }
113 |
114 | strategy = JsonCssExtractionStrategy(schema)
115 | result = await crawler.arun(
116 | url="https://example.com",
117 | extraction_strategy=strategy
118 | )
119 | articles = json.loads(result.extracted_content)
120 |
121 | ```
122 |
123 | ## Domain-Based Filtering
124 |
125 | Control content based on domains:
126 |
127 | ```hljs python
128 | result = await crawler.arun(
129 | url="https://example.com",
130 | exclude_domains=["ads.com", "tracker.com"],
131 | exclude_social_media_domains=["facebook.com", "twitter.com"], # Custom social media domains to exclude
132 | exclude_social_media_links=True
133 | )
134 |
135 | ```
136 |
137 | ## Media Selection
138 |
139 | Select specific types of media:
140 |
141 | ```hljs python
142 | result = await crawler.arun(url="https://example.com")
143 |
144 | # Access different media types
145 | images = result.media["images"] # List of image details
146 | videos = result.media["videos"] # List of video details
147 | audios = result.media["audios"] # List of audio details
148 |
149 | # Image with metadata
150 | for image in images:
151 | print(f"URL: {image['src']}")
152 | print(f"Alt text: {image['alt']}")
153 | print(f"Description: {image['desc']}")
154 | print(f"Relevance score: {image['score']}")
155 |
156 | ```
157 |
158 | ## Comprehensive Example
159 |
160 | Here's how to combine different selection methods:
161 |
162 | ```hljs python
163 | async def extract_article_content(url: str):
164 | # Define structured extraction
165 | article_schema = {
166 | "name": "Article",
167 | "baseSelector": "article.main",
168 | "fields": [\
169 | {"name": "title", "selector": "h1", "type": "text"},\
170 | {"name": "content", "selector": ".content", "type": "text"}\
171 | ]
172 | }
173 |
174 | # Define LLM extraction
175 | class ArticleAnalysis(BaseModel):
176 | key_points: List[str]
177 | sentiment: str
178 | category: str
179 |
180 | async with AsyncWebCrawler() as crawler:
181 | # Get structured content
182 | pattern_result = await crawler.arun(
183 | url=url,
184 | extraction_strategy=JsonCssExtractionStrategy(article_schema),
185 | word_count_threshold=10,
186 | excluded_tags=['nav', 'footer'],
187 | exclude_external_links=True
188 | )
189 |
190 | # Get semantic analysis
191 | analysis_result = await crawler.arun(
192 | url=url,
193 | extraction_strategy=LLMExtractionStrategy(
194 | provider="ollama/nemotron",
195 | schema=ArticleAnalysis.schema(),
196 | instruction="Analyze the article content"
197 | )
198 | )
199 |
200 | # Combine results
201 | return {
202 | "article": json.loads(pattern_result.extracted_content),
203 | "analysis": json.loads(analysis_result.extracted_content),
204 | "media": pattern_result.media
205 | }
206 |
207 | ```
208 |
209 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/cosine strategy - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Cosine Strategy
2 |
3 | The Cosine Strategy in Crawl4AI uses similarity-based clustering to identify and extract relevant content sections from web pages. This strategy is particularly useful when you need to find and extract content based on semantic similarity rather than structural patterns.
4 |
5 | ## How It Works
6 |
7 | The Cosine Strategy:
8 | 1. Breaks down page content into meaningful chunks
9 | 2. Converts text into vector representations
10 | 3. Calculates similarity between chunks
11 | 4. Clusters similar content together
12 | 5. Ranks and filters content based on relevance
13 |
14 | ## Basic Usage
15 |
16 | ```hljs python
17 | from crawl4ai.extraction_strategy import CosineStrategy
18 |
19 | strategy = CosineStrategy(
20 | semantic_filter="product reviews", # Target content type
21 | word_count_threshold=10, # Minimum words per cluster
22 | sim_threshold=0.3 # Similarity threshold
23 | )
24 |
25 | async with AsyncWebCrawler() as crawler:
26 | result = await crawler.arun(
27 | url="https://example.com/reviews",
28 | extraction_strategy=strategy
29 | )
30 |
31 | content = result.extracted_content
32 |
33 | ```
34 |
35 | ## Configuration Options
36 |
37 | ### Core Parameters
38 |
39 | ```hljs python
40 | CosineStrategy(
41 | # Content Filtering
42 | semantic_filter: str = None, # Keywords/topic for content filtering
43 | word_count_threshold: int = 10, # Minimum words per cluster
44 | sim_threshold: float = 0.3, # Similarity threshold (0.0 to 1.0)
45 |
46 | # Clustering Parameters
47 | max_dist: float = 0.2, # Maximum distance for clustering
48 | linkage_method: str = 'ward', # Clustering linkage method
49 | top_k: int = 3, # Number of top categories to extract
50 |
51 | # Model Configuration
52 | model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model
53 |
54 | verbose: bool = False # Enable logging
55 | )
56 |
57 | ```
58 |
59 | ### Parameter Details
60 |
61 | 1. **semantic\_filter**
62 |    - Sets the target topic or content type
63 |    - Use keywords relevant to your desired content
64 |    - Example: "technical specifications", "user reviews", "pricing information" (see the snippet below)
65 |
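For instance, a filter aimed at support/FAQ style pages might be configured like this (the keywords are illustrative placeholders — describe the content you actually want):

```hljs python
# Cluster around help and FAQ style content
strategy = CosineStrategy(semantic_filter="help articles FAQ troubleshooting")

```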
66 | 2. **sim\_threshold**
67 |
68 |    - Controls how similar content must be to be grouped together
69 |    - Higher values (e.g., 0.8) mean stricter matching
70 |    - Lower values (e.g., 0.3) allow more variation
71 |
72 |
73 |
74 |
75 | ```hljs python
76 | # Strict matching
77 | strategy = CosineStrategy(sim_threshold=0.8)
78 |
79 | # Loose matching
80 | strategy = CosineStrategy(sim_threshold=0.3)
81 |
82 | ```
83 |
84 | 3. **word\_count\_threshold**
85 |
86 |    - Filters out short content blocks
87 |    - Helps eliminate noise and irrelevant content
88 |
89 |
90 |
91 |
92 | ```hljs python
93 | # Only consider substantial paragraphs
94 | strategy = CosineStrategy(word_count_threshold=50)
95 |
96 | ```
97 |
98 | 4. **top\_k**
99 |
100 |    - Number of top content clusters to return
101 |    - Higher values return more diverse content
102 |
103 |
104 |
105 | ```hljs python
106 | # Get top 5 most relevant content clusters
107 | strategy = CosineStrategy(top_k=5)
108 |
109 | ```
110 |
111 |
112 | ## Use Cases
113 |
114 | ### 1\. Article Content Extraction
115 |
116 | ```hljs python
117 | strategy = CosineStrategy(
118 | semantic_filter="main article content",
119 | word_count_threshold=100, # Longer blocks for articles
120 | top_k=1 # Usually want single main content
121 | )
122 |
123 | result = await crawler.arun(
124 | url="https://example.com/blog/post",
125 | extraction_strategy=strategy
126 | )
127 |
128 | ```
129 |
130 | ### 2\. Product Review Analysis
131 |
132 | ```hljs python
133 | strategy = CosineStrategy(
134 | semantic_filter="customer reviews and ratings",
135 | word_count_threshold=20, # Reviews can be shorter
136 | top_k=10, # Get multiple reviews
137 | sim_threshold=0.4 # Allow variety in review content
138 | )
139 |
140 | ```
141 |
142 | ### 3\. Technical Documentation
143 |
144 | ```hljs python
145 | strategy = CosineStrategy(
146 | semantic_filter="technical specifications documentation",
147 | word_count_threshold=30,
148 | sim_threshold=0.6, # Stricter matching for technical content
149 | max_dist=0.3 # Allow related technical sections
150 | )
151 |
152 | ```
153 |
154 | ## Advanced Features
155 |
156 | ### Custom Clustering
157 |
158 | ```hljs python
159 | strategy = CosineStrategy(
160 | linkage_method='complete', # Alternative clustering method
161 | max_dist=0.4, # Larger clusters
162 | model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' # Multilingual support
163 | )
164 |
165 | ```
166 |
167 | ### Content Filtering Pipeline
168 |
169 | ```hljs python
170 | strategy = CosineStrategy(
171 | semantic_filter="pricing plans features",
172 | word_count_threshold=15,
173 | sim_threshold=0.5,
174 | top_k=3
175 | )
176 |
177 | async def extract_pricing_features(url: str):
178 | async with AsyncWebCrawler() as crawler:
179 | result = await crawler.arun(
180 | url=url,
181 | extraction_strategy=strategy
182 | )
183 |
184 | if result.success:
185 | content = json.loads(result.extracted_content)
186 | return {
187 | 'pricing_features': content,
188 | 'clusters': len(content),
189 | 'similarity_scores': [item['score'] for item in content]
190 | }
191 |
192 | ```
193 |
194 | ## Best Practices
195 |
196 | 1. **Adjust Thresholds Iteratively**
197 |    - Start with default values
198 |    - Adjust based on results
199 |    - Monitor clustering quality
200 |
201 | 2. **Choose Appropriate Word Count Thresholds**
202 |
203 |    - Higher for articles (100+)
204 |    - Lower for reviews/comments (20+)
205 |    - Medium for product descriptions (50+)
206 |
207 | 3. **Optimize Performance**
208 |
209 |
210 |
211 | ```hljs python
212 | strategy = CosineStrategy(
213 | word_count_threshold=10, # Filter early
214 | top_k=5, # Limit results
215 | verbose=True # Monitor performance
216 | )
217 |
218 | ```
219 |
220 | 4. **Handle Different Content Types**
221 |
222 |
223 |
224 | ```hljs python
225 | # For mixed content pages
226 | strategy = CosineStrategy(
227 | semantic_filter="product features",
228 | sim_threshold=0.4, # More flexible matching
229 | max_dist=0.3, # Larger clusters
230 | top_k=3 # Multiple relevant sections
231 | )
232 |
233 | ```
234 |
235 |
236 | ## Error Handling
237 |
238 | ```hljs python
239 | try:
240 | result = await crawler.arun(
241 | url="https://example.com",
242 | extraction_strategy=strategy
243 | )
244 |
245 | if result.success:
246 | content = json.loads(result.extracted_content)
247 | if not content:
248 | print("No relevant content found")
249 | else:
250 | print(f"Extraction failed: {result.error_message}")
251 |
252 | except Exception as e:
253 | print(f"Error during extraction: {str(e)}")
254 |
255 | ```
256 |
257 | The Cosine Strategy is particularly effective when:
258 | - Content structure is inconsistent
259 | - You need semantic understanding
260 | - You want to find similar content blocks
261 | - Structure-based extraction (CSS/XPath) isn't reliable
262 |
263 | It works well with other strategies and can be used as a pre-processing step for LLM-based extraction.
264 |
265 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/crawlresult - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # CrawlResult
2 |
3 | The `CrawlResult` class represents the result of a web crawling operation. It provides access to various forms of extracted content and metadata from the crawled webpage.
4 |
5 | ## Class Definition
6 |
7 | ```hljs python
8 | class CrawlResult(BaseModel):
9 | """Result of a web crawling operation."""
10 |
11 | # Basic Information
12 | url: str # Crawled URL
13 | success: bool # Whether crawl succeeded
14 | status_code: Optional[int] = None # HTTP status code
15 | error_message: Optional[str] = None # Error message if failed
16 |
17 | # Content
18 | html: str # Raw HTML content
19 | cleaned_html: Optional[str] = None # Cleaned HTML
20 | fit_html: Optional[str] = None # Most relevant HTML content
21 | markdown: Optional[str] = None # HTML converted to markdown
22 | fit_markdown: Optional[str] = None # Most relevant markdown content
23 |
24 | # Extracted Data
25 | extracted_content: Optional[str] = None # Content from extraction strategy
26 | media: Dict[str, List[Dict]] = {} # Extracted media information
27 | links: Dict[str, List[Dict]] = {} # Extracted links
28 | metadata: Optional[dict] = None # Page metadata
29 |
30 | # Additional Data
31 | screenshot: Optional[str] = None # Base64 encoded screenshot
32 | session_id: Optional[str] = None # Session identifier
33 | response_headers: Optional[dict] = None # HTTP response headers
34 |
35 | ```
36 |
37 | ## Properties and Their Data Structures
38 |
39 | ### Basic Information
40 |
41 | ```hljs python
42 | # Access basic information
43 | result = await crawler.arun(url="https://example.com")
44 |
45 | print(result.url) # "https://example.com"
46 | print(result.success) # True/False
47 | print(result.status_code) # 200, 404, etc.
48 | print(result.error_message) # Error details if failed
49 |
50 | ```
51 |
52 | ### Content Properties
53 |
54 | #### HTML Content
55 |
56 | ```hljs python
57 | # Raw HTML
58 | html_content = result.html
59 |
60 | # Cleaned HTML (removed ads, popups, etc.)
61 | clean_content = result.cleaned_html
62 |
63 | # Most relevant HTML content
64 | main_content = result.fit_html
65 |
66 | ```
67 |
68 | #### Markdown Content
69 |
70 | ```hljs python
71 | # Full markdown version
72 | markdown_content = result.markdown
73 |
74 | # Most relevant markdown content
75 | main_content = result.fit_markdown
76 |
77 | ```
78 |
79 | ### Media Content
80 |
81 | The media dictionary contains organized media elements:
82 |
83 | ```hljs python
84 | # Structure
85 | media = {
86 | "images": [\
87 | {\
88 | "src": str, # Image URL\
89 | "alt": str, # Alt text\
90 | "desc": str, # Contextual description\
91 | "score": float, # Relevance score (0-10)\
92 | "type": str, # "image"\
93 | "width": int, # Image width (if available)\
94 | "height": int, # Image height (if available)\
95 | "context": str, # Surrounding text\
96 | "lazy": bool # Whether image was lazy-loaded\
97 | }\
98 | ],
99 | "videos": [\
100 | {\
101 | "src": str, # Video URL\
102 | "type": str, # "video"\
103 | "title": str, # Video title\
104 | "poster": str, # Thumbnail URL\
105 | "duration": str, # Video duration\
106 | "description": str # Video description\
107 | }\
108 | ],
109 | "audios": [\
110 | {\
111 | "src": str, # Audio URL\
112 | "type": str, # "audio"\
113 | "title": str, # Audio title\
114 | "duration": str, # Audio duration\
115 | "description": str # Audio description\
116 | }\
117 | ]
118 | }
119 |
120 | # Example usage
121 | for image in result.media["images"]:
122 | if image["score"] > 5: # High-relevance images
123 | print(f"High-quality image: {image['src']}")
124 | print(f"Context: {image['context']}")
125 |
126 | ```
127 |
128 | ### Link Analysis
129 |
130 | The links dictionary organizes discovered links:
131 |
132 | ```hljs python
133 | # Structure
134 | links = {
135 | "internal": [\
136 | {\
137 | "href": str, # URL\
138 | "text": str, # Link text\
139 | "title": str, # Title attribute\
140 | "type": str, # Link type (nav, content, etc.)\
141 | "context": str, # Surrounding text\
142 | "score": float # Relevance score\
143 | }\
144 | ],
145 | "external": [\
146 | {\
147 | "href": str, # External URL\
148 | "text": str, # Link text\
149 | "title": str, # Title attribute\
150 | "domain": str, # Domain name\
151 | "type": str, # Link type\
152 | "context": str # Surrounding text\
153 | }\
154 | ]
155 | }
156 |
157 | # Example usage
158 | for link in result.links["internal"]:
159 | print(f"Internal link: {link['href']}")
160 | print(f"Context: {link['context']}")
161 |
162 | ```
163 |
164 | ### Metadata
165 |
166 | The metadata dictionary contains page information:
167 |
168 | ```hljs python
169 | # Structure
170 | metadata = {
171 | "title": str, # Page title
172 | "description": str, # Meta description
173 | "keywords": List[str], # Meta keywords
174 | "author": str, # Author information
175 | "published_date": str, # Publication date
176 | "modified_date": str, # Last modified date
177 | "language": str, # Page language
178 | "canonical_url": str, # Canonical URL
179 | "og_data": Dict, # Open Graph data
180 | "twitter_data": Dict # Twitter card data
181 | }
182 |
183 | # Example usage
184 | if result.metadata:
185 | print(f"Title: {result.metadata['title']}")
186 | print(f"Author: {result.metadata.get('author', 'Unknown')}")
187 |
188 | ```
189 |
190 | ### Extracted Content
191 |
192 | Content from extraction strategies:
193 |
194 | ```hljs python
195 | # For LLM or CSS extraction strategies
196 | if result.extracted_content:
197 | structured_data = json.loads(result.extracted_content)
198 | print(structured_data)
199 |
200 | ```
201 |
202 | ### Screenshot
203 |
204 | Base64 encoded screenshot:
205 |
206 | ```hljs python
207 | # Save screenshot if available
208 | if result.screenshot:
209 | import base64
210 |
211 | # Decode and save
212 | with open("screenshot.png", "wb") as f:
213 | f.write(base64.b64decode(result.screenshot))
214 |
215 | ```
216 |
217 | ## Usage Examples
218 |
219 | ### Basic Content Access
220 |
221 | ```hljs python
222 | async with AsyncWebCrawler() as crawler:
223 | result = await crawler.arun(url="https://example.com")
224 |
225 | if result.success:
226 | # Get clean content
227 | print(result.fit_markdown)
228 |
229 | # Process images
230 | for image in result.media["images"]:
231 | if image["score"] > 7:
232 | print(f"High-quality image: {image['src']}")
233 |
234 | ```
235 |
236 | ### Complete Data Processing
237 |
238 | ```hljs python
239 | async def process_webpage(url: str) -> Dict:
240 | async with AsyncWebCrawler() as crawler:
241 | result = await crawler.arun(url=url)
242 |
243 | if not result.success:
244 | raise Exception(f"Crawl failed: {result.error_message}")
245 |
246 | return {
247 | "content": result.fit_markdown,
248 | "images": [\
249 | img for img in result.media["images"]\
250 | if img["score"] > 5\
251 | ],
252 | "internal_links": [\
253 | link["href"] for link in result.links["internal"]\
254 | ],
255 | "metadata": result.metadata,
256 | "status": result.status_code
257 | }
258 |
259 | ```
260 |
261 | ### Error Handling
262 |
263 | ```hljs python
264 | async def safe_crawl(url: str) -> Dict:
265 | async with AsyncWebCrawler() as crawler:
266 | try:
267 | result = await crawler.arun(url=url)
268 |
269 | if not result.success:
270 | return {
271 | "success": False,
272 | "error": result.error_message,
273 | "status": result.status_code
274 | }
275 |
276 | return {
277 | "success": True,
278 | "content": result.fit_markdown,
279 | "status": result.status_code
280 | }
281 |
282 | except Exception as e:
283 | return {
284 | "success": False,
285 | "error": str(e),
286 | "status": None
287 | }
288 |
289 | ```
290 |
291 | ## Best Practices
292 |
293 | 1. **Always Check Success**
294 |
295 |
296 |
297 | ```hljs python
298 | if not result.success:
299 | print(f"Error: {result.error_message}")
300 | return
301 |
302 | ```
303 |
304 | 2. **Use fit\_markdown for Articles**
305 |
306 |
307 |
308 | ```hljs python
309 | # Better for article content
310 | content = result.fit_markdown if result.fit_markdown else result.markdown
311 |
312 | ```
313 |
314 | 3. **Filter Media by Score**
315 |
316 |
317 |
318 | ```hljs python
319 | relevant_images = [\
320 | img for img in result.media["images"]\
321 | if img["score"] > 5\
322 | ]
323 |
324 | ```
325 |
326 | 4. **Handle Missing Data**
327 |
328 |
329 |
330 | ```hljs python
331 | metadata = result.metadata or {}
332 | title = metadata.get('title', 'Unknown Title')
333 |
334 | ```
335 |
336 |
337 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/home - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI
2 |
3 | Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
4 |
5 | ## Introduction
6 |
7 | Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution with full asynchronous support.
8 |
9 | ## Quick Start
10 |
11 | Here's a quick example to show you how easy it is to use Crawl4AI with its asynchronous capabilities:
12 |
13 | ```hljs python
14 | import asyncio
15 | from crawl4ai import AsyncWebCrawler
16 |
17 | async def main():
18 | # Create an instance of AsyncWebCrawler
19 | async with AsyncWebCrawler(verbose=True) as crawler:
20 | # Run the crawler on a URL
21 | result = await crawler.arun(url="https://www.nbcnews.com/business")
22 |
23 | # Print the extracted content
24 | print(result.markdown)
25 |
26 | # Run the async main function
27 | asyncio.run(main())
28 |
29 | ```
30 |
31 | ## Key Features ✨
32 |
33 | - 🆓 Completely free and open-source
34 | - 🚀 Blazing fast performance, outperforming many paid services
35 | - 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
36 | - 📄 Fit markdown generation for extracting main article content.
37 | - 🌐 Multi-browser support (Chromium, Firefox, WebKit)
38 | - 🌍 Supports crawling multiple URLs simultaneously
39 | - 🎨 Extracts and returns all media tags (Images, Audio, and Video)
40 | - 🔗 Extracts all external and internal links
41 | - 📚 Extracts metadata from the page
42 | - 🔄 Custom hooks for authentication, headers, and page modifications
43 | - 🕵️ User-agent customization
44 | - 🖼️ Takes screenshots of pages with enhanced error handling
45 | - 📜 Executes multiple custom JavaScripts before crawling
46 | - 📊 Generates structured output without LLM using JsonCssExtractionStrategy
47 | - 📚 Various chunking strategies: topic-based, regex, sentence, and more
48 | - 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
49 | - 🎯 CSS selector support for precise data extraction
50 | - 📝 Passes instructions/keywords to refine extraction
51 | - 🔒 Proxy support with authentication for enhanced access
52 | - 🔄 Session management for complex multi-page crawling
53 | - 🌐 Asynchronous architecture for improved performance
54 | - 🖼️ Improved image processing with lazy-loading detection
55 | - 🕰️ Enhanced handling of delayed content loading
56 | - 🔑 Custom headers support for LLM interactions
57 | - 🖼️ iframe content extraction for comprehensive analysis
58 | - ⏱️ Flexible timeout and delayed content retrieval options
59 |
60 | ## Documentation Structure
61 |
62 | Our documentation is organized into several sections:
63 |
64 | ### Basic Usage
65 |
66 | - [Installation](basic/installation/)
67 | - [Quick Start](basic/quickstart/)
68 | - [Simple Crawling](basic/simple-crawling/)
69 | - [Browser Configuration](basic/browser-config/)
70 | - [Content Selection](basic/content-selection/)
71 | - [Output Formats](basic/output-formats/)
72 | - [Page Interaction](basic/page-interaction/)
73 |
74 | ### Advanced Features
75 |
76 | - [Magic Mode](advanced/magic-mode/)
77 | - [Session Management](advanced/session-management/)
78 | - [Hooks & Authentication](advanced/hooks-auth/)
79 | - [Proxy & Security](advanced/proxy-security/)
80 | - [Content Processing](advanced/content-processing/)
81 |
82 | ### Extraction & Processing
83 |
84 | - [Extraction Strategies Overview](extraction/overview/)
85 | - [LLM Integration](extraction/llm/)
86 | - [CSS-Based Extraction](extraction/css/)
87 | - [Cosine Strategy](extraction/cosine/)
88 | - [Chunking Strategies](extraction/chunking/)
89 |
90 | ### API Reference
91 |
92 | - [AsyncWebCrawler](api/async-webcrawler/)
93 | - [CrawlResult](api/crawl-result/)
94 | - [Extraction Strategies](api/strategies/)
95 | - [arun() Method Parameters](api/arun/)
96 |
97 | ### Examples
98 |
99 | - Coming soon!
100 |
101 | ## Getting Started
102 |
103 | 1. Install Crawl4AI:
104 |
105 |
106 |
107 |
108 | ```hljs bash
109 | pip install crawl4ai
110 |
111 | ```
112 |
113 | 2. Check out our [Quick Start Guide](basic/quickstart/) to begin crawling web pages.
114 |
115 | 3. Explore our [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) to see Crawl4AI in action.
116 |
117 |
118 | ## Support
119 |
120 | For questions, suggestions, or issues:
121 | - GitHub Issues: [Report a Bug](https://github.com/unclecode/crawl4ai/issues)
122 | - Twitter: [@unclecode](https://twitter.com/unclecode)
123 | - Website: [crawl4ai.com](https://crawl4ai.com)
124 |
125 | Happy Crawling! 🕸️🚀
126 |
127 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/hooks auth - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Hooks & Auth for AsyncWebCrawler
2 |
3 | Crawl4AI's AsyncWebCrawler allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the asynchronous crawling process.
4 |
5 | ## Example: Using Crawler Hooks with AsyncWebCrawler
6 |
7 | Let's see how we can customize the AsyncWebCrawler using hooks! In this example, we'll:
8 |
9 | 1. Configure the browser when it's created.
10 | 2. Add custom headers before navigating to the URL.
11 | 3. Log the current URL after navigation.
12 | 4. Perform actions after JavaScript execution.
13 | 5. Log the length of the HTML before returning it.
14 |
15 | ### Hook Definitions
16 |
17 | ```hljs python
18 | import asyncio
19 | from crawl4ai import AsyncWebCrawler
20 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
21 | from playwright.async_api import Page, Browser
22 |
23 | async def on_browser_created(browser: Browser):
24 | print("[HOOK] on_browser_created")
25 | # Example customization: set browser viewport size
26 | context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
27 | page = await context.new_page()
28 |
29 | # Example customization: logging in to a hypothetical website
30 | await page.goto('https://example.com/login')
31 | await page.fill('input[name="username"]', 'testuser')
32 | await page.fill('input[name="password"]', 'password123')
33 | await page.click('button[type="submit"]')
34 | await page.wait_for_selector('#welcome')
35 |
36 | # Add a custom cookie
37 | await context.add_cookies([{'name': 'test_cookie', 'value': 'cookie_value', 'url': 'https://example.com'}])
38 |
39 | await page.close()
40 | await context.close()
41 |
42 | async def before_goto(page: Page):
43 | print("[HOOK] before_goto")
44 | # Example customization: add custom headers
45 | await page.set_extra_http_headers({'X-Test-Header': 'test'})
46 |
47 | async def after_goto(page: Page):
48 | print("[HOOK] after_goto")
49 | # Example customization: log the URL
50 | print(f"Current URL: {page.url}")
51 |
52 | async def on_execution_started(page: Page):
53 | print("[HOOK] on_execution_started")
54 | # Example customization: perform actions after JS execution
55 | await page.evaluate("console.log('Custom JS executed')")
56 |
57 | async def before_return_html(page: Page, html: str):
58 | print("[HOOK] before_return_html")
59 | # Example customization: log the HTML length
60 | print(f"HTML length: {len(html)}")
61 | return page
62 |
63 | ```
64 |
65 | ### Using the Hooks with the AsyncWebCrawler
66 |
67 | ```hljs python
68 | import asyncio
69 | from crawl4ai import AsyncWebCrawler
70 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
71 |
72 | async def main():
73 | print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!")
74 |
75 | crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
76 | crawler_strategy.set_hook('on_browser_created', on_browser_created)
77 | crawler_strategy.set_hook('before_goto', before_goto)
78 | crawler_strategy.set_hook('after_goto', after_goto)
79 | crawler_strategy.set_hook('on_execution_started', on_execution_started)
80 | crawler_strategy.set_hook('before_return_html', before_return_html)
81 |
82 | async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler:
83 | result = await crawler.arun(
84 | url="https://example.com",
85 | js_code="window.scrollTo(0, document.body.scrollHeight);",
86 | wait_for="footer"
87 | )
88 |
89 | print("📦 Crawler Hooks result:")
90 | print(result)
91 |
92 | asyncio.run(main())
93 |
94 | ```
95 |
96 | ### Explanation
97 |
98 | - `on_browser_created`: This hook is called when the Playwright browser is created. It sets up the browser context, logs in to a website, and adds a custom cookie.
99 | - `before_goto`: This hook is called right before Playwright navigates to the URL. It adds custom HTTP headers.
100 | - `after_goto`: This hook is called after Playwright navigates to the URL. It logs the current URL.
101 | - `on_execution_started`: This hook is called after any custom JavaScript is executed. It performs additional JavaScript actions.
102 | - `before_return_html`: This hook is called before returning the HTML content. It logs the length of the HTML content.
103 |
104 | ### Additional Ideas
105 |
106 | - **Handling authentication**: Use the `on_browser_created` hook to handle login processes or set authentication tokens.
107 | - **Dynamic header modification**: Modify headers based on the target URL or other conditions in the `before_goto` hook.
108 | - **Content verification**: Use the `after_goto` hook to verify that the expected content is present on the page (a sketch of this idea follows the list).
109 | - **Custom JavaScript injection**: Inject and execute custom JavaScript using the `on_execution_started` hook.
110 | - **Content preprocessing**: Modify or analyze the HTML content in the `before_return_html` hook before it's returned.
111 |
112 | By using these hooks, you can customize the behavior of the AsyncWebCrawler to suit your specific needs, including handling authentication, modifying requests, and preprocessing content.
113 |
114 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/installation - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Installation 💻
2 |
3 | Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server.
4 |
5 | ## Option 1: Python Package Installation (Recommended)
6 |
7 | Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:
8 |
9 | ### Basic Installation
10 |
11 | For basic web crawling and scraping tasks:
12 |
13 | ```hljs bash
14 | pip install crawl4ai
15 | playwright install # Install Playwright dependencies
16 |
17 | ```
18 |
19 | ### Installation with PyTorch
20 |
21 | For advanced text clustering (includes CosineSimilarity cluster strategy):
22 |
23 | ```hljs bash
24 | pip install crawl4ai[torch]
25 |
26 | ```
27 |
28 | ### Installation with Transformers
29 |
30 | For text summarization and Hugging Face models:
31 |
32 | ```hljs bash
33 | pip install crawl4ai[transformer]
34 |
35 | ```
36 |
37 | ### Full Installation
38 |
39 | For all features:
40 |
41 | ```hljs bash
42 | pip install crawl4ai[all]
43 |
44 | ```
45 |
46 | ### Development Installation
47 |
48 | For contributors who plan to modify the source code:
49 |
50 | ```hljs bash
51 | git clone https://github.com/unclecode/crawl4ai.git
52 | cd crawl4ai
53 | pip install -e ".[all]"
54 | playwright install # Install Playwright dependencies
55 |
56 | ```
57 |
58 | 💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models:
59 |
60 | ```hljs bash
61 | crawl4ai-download-models
62 |
63 | ```
64 |
65 | This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
66 |
67 | ## Option 2: Using Docker (Coming Soon)
68 |
69 | Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
70 |
71 | ## Option 3: Local Server Installation
72 |
73 | For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.
74 |
75 | ## Verifying Your Installation
76 |
77 | After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:
78 |
79 | ```hljs python
80 | import asyncio
81 | from crawl4ai import AsyncWebCrawler
82 |
83 | async def main():
84 | async with AsyncWebCrawler(verbose=True) as crawler:
85 | result = await crawler.arun(url="https://www.example.com")
86 | print(result.markdown[:500]) # Print first 500 characters
87 |
88 | if __name__ == "__main__":
89 | asyncio.run(main())
90 |
91 | ```
92 |
93 | This script should successfully crawl the example website and print the first 500 characters of the extracted content.
94 |
95 | ## Getting Help
96 |
97 | If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
98 |
99 | Happy crawling! 🕷️🤖
100 |
101 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/json-css extractor basic - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # JSON CSS Extraction Strategy with AsyncWebCrawler
2 |
3 | The `JsonCssExtractionStrategy` is a powerful feature of Crawl4AI that allows you to extract structured data from web pages using CSS selectors. This method is particularly useful when you need to extract specific data points from a consistent HTML structure, such as tables or repeated elements. Here's how to use it with the AsyncWebCrawler.
4 |
5 | ## Overview
6 |
7 | The `JsonCssExtractionStrategy` works by defining a schema that specifies:
8 | 1. A base CSS selector for the repeating elements
9 | 2. Fields to extract from each element, each with its own CSS selector
10 |
11 | This strategy is fast and efficient, as it doesn't rely on external services like LLMs for extraction.
12 |
13 | ## Example: Extracting Cryptocurrency Prices from Coinbase
14 |
15 | Let's look at an example that extracts cryptocurrency prices from the Coinbase explore page.
16 |
17 | ```hljs python
18 | import json
19 | import asyncio
20 | from crawl4ai import AsyncWebCrawler
21 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
22 |
23 | async def extract_structured_data_using_css_extractor():
24 | print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
25 |
26 | # Define the extraction schema
27 | schema = {
28 | "name": "Coinbase Crypto Prices",
29 | "baseSelector": ".cds-tableRow-t45thuk",
30 | "fields": [\
31 | {\
32 | "name": "crypto",\
33 | "selector": "td:nth-child(1) h2",\
34 | "type": "text",\
35 | },\
36 | {\
37 | "name": "symbol",\
38 | "selector": "td:nth-child(1) p",\
39 | "type": "text",\
40 | },\
41 | {\
42 | "name": "price",\
43 | "selector": "td:nth-child(2)",\
44 | "type": "text",\
45 | }\
46 | ],
47 | }
48 |
49 | # Create the extraction strategy
50 | extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
51 |
52 | # Use the AsyncWebCrawler with the extraction strategy
53 | async with AsyncWebCrawler(verbose=True) as crawler:
54 | result = await crawler.arun(
55 | url="https://www.coinbase.com/explore",
56 | extraction_strategy=extraction_strategy,
57 | bypass_cache=True,
58 | )
59 |
60 | assert result.success, "Failed to crawl the page"
61 |
62 | # Parse the extracted content
63 | crypto_prices = json.loads(result.extracted_content)
64 | print(f"Successfully extracted {len(crypto_prices)} cryptocurrency prices")
65 | print(json.dumps(crypto_prices[0], indent=2))
66 |
67 | return crypto_prices
68 |
69 | # Run the async function
70 | asyncio.run(extract_structured_data_using_css_extractor())
71 |
72 | ```
73 |
74 | ## Explanation of the Schema
75 |
76 | The schema defines how to extract the data:
77 |
78 | - `name`: A descriptive name for the extraction task.
79 | - `baseSelector`: The CSS selector for the repeating elements (in this case, table rows).
80 | - `fields`: An array of fields to extract from each element:
81 | - `name`: The name to give the extracted data.
82 | - `selector`: The CSS selector to find the specific data within the base element.
83 | - `type`: The type of data to extract (usually "text" for textual content).
84 |
85 | ## Advantages of JsonCssExtractionStrategy
86 |
87 | 1. **Speed**: CSS selectors are fast to execute, making this method efficient for large datasets.
88 | 2. **Precision**: You can target exactly the elements you need.
89 | 3. **Structured Output**: The result is already structured as JSON, ready for further processing.
90 | 4. **No External Dependencies**: Unlike LLM-based strategies, this doesn't require any API calls to external services.
91 |
92 | ## Tips for Using JsonCssExtractionStrategy
93 |
94 | 1. **Inspect the Page**: Use browser developer tools to identify the correct CSS selectors.
95 | 2. **Test Selectors**: Verify your selectors in the browser console before using them in the script.
96 | 3. **Handle Dynamic Content**: If the page uses JavaScript to load content, you may need to combine this with JS execution (see the Advanced Usage section).
97 | 4. **Error Handling**: Always check the `result.success` flag and handle potential failures (see the sketch after this list).
98 |
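To make tip 4 concrete, here is a small wrapper (illustrative, not part of the library) that checks `result.success` and guards against an empty extraction before returning parsed rows:

```hljs python
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_with_checks(schema: dict, url: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            extraction_strategy=JsonCssExtractionStrategy(schema),
            bypass_cache=True,
        )

    if not result.success:
        print(f"Crawl failed: {result.error_message}")
        return None

    rows = json.loads(result.extracted_content)
    if not rows:
        print("Extraction succeeded but no elements matched the schema")
    return rows

```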
99 | ## Advanced Usage: Combining with JavaScript Execution
100 |
101 | For pages that load data dynamically, you can combine the `JsonCssExtractionStrategy` with JavaScript execution:
102 |
103 | ```hljs python
104 | async def extract_dynamic_structured_data():
105 | schema = {
106 | "name": "Dynamic Crypto Prices",
107 | "baseSelector": ".crypto-row",
108 | "fields": [\
109 | {"name": "name", "selector": ".crypto-name", "type": "text"},\
110 | {"name": "price", "selector": ".crypto-price", "type": "text"},\
111 | ]
112 | }
113 |
114 | js_code = """
115 | window.scrollTo(0, document.body.scrollHeight);
116 | await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
117 | """
118 |
119 | extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
120 |
121 | async with AsyncWebCrawler(verbose=True) as crawler:
122 | result = await crawler.arun(
123 | url="https://example.com/crypto-prices",
124 | extraction_strategy=extraction_strategy,
125 | js_code=js_code,
126 | wait_for=".crypto-row:nth-child(20)", # Wait for 20 rows to load
127 | bypass_cache=True,
128 | )
129 |
130 | crypto_data = json.loads(result.extracted_content)
131 | print(f"Extracted {len(crypto_data)} cryptocurrency entries")
132 |
133 | asyncio.run(extract_dynamic_structured_data())
134 |
135 | ```
136 |
137 | This advanced example demonstrates how to:
138 | 1. Execute JavaScript to trigger dynamic content loading.
139 | 2. Wait for a specific condition (20 rows loaded) before extraction.
140 | 3. Extract data from the dynamically loaded content.
141 |
142 | By mastering the `JsonCssExtractionStrategy`, you can efficiently extract structured data from a wide variety of web pages, making it a valuable tool in your web scraping toolkit.
143 |
144 | For more details on schema definitions and advanced extraction strategies, check out the [Advanced JsonCssExtraction](../css-advanced/).
145 |
146 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/llm strategy - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # LLM Extraction with AsyncWebCrawler
2 |
3 | Crawl4AI's AsyncWebCrawler allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. Below are two examples demonstrating how to use `LLMExtractionStrategy` for different purposes with the AsyncWebCrawler.
4 |
5 | ## Example 1: Extract Structured Data
6 |
7 | In this example, we use the `LLMExtractionStrategy` to extract structured data (model names and their fees) from the OpenAI pricing page.
8 |
9 | ```hljs python
10 | import os
11 | import json
12 | import asyncio
13 | from crawl4ai import AsyncWebCrawler
14 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
15 | from pydantic import BaseModel, Field
16 |
17 | class OpenAIModelFee(BaseModel):
18 | model_name: str = Field(..., description="Name of the OpenAI model.")
19 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
20 | output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
21 |
22 | async def extract_openai_fees():
23 | url = 'https://openai.com/api/pricing/'
24 |
25 | async with AsyncWebCrawler(verbose=True) as crawler:
26 | result = await crawler.arun(
27 | url=url,
28 | word_count_threshold=1,
29 | extraction_strategy=LLMExtractionStrategy(
30 | provider="openai/gpt-4o", # Or use ollama like provider="ollama/nemotron"
31 | api_token=os.getenv('OPENAI_API_KEY'),
32 | schema=OpenAIModelFee.model_json_schema(),
33 | extraction_type="schema",
34 | instruction="From the crawled content, extract all mentioned model names along with their "
35 | "fees for input and output tokens. Make sure not to miss anything in the entire content. "
36 | 'One extracted model JSON format should look like this: '
37 | '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
38 | ),
39 | bypass_cache=True,
40 | )
41 |
42 | model_fees = json.loads(result.extracted_content)
43 | print(f"Number of models extracted: {len(model_fees)}")
44 |
45 | with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
46 | json.dump(model_fees, f, indent=2)
47 |
48 | asyncio.run(extract_openai_fees())
49 |
50 | ```
51 |
52 | ## Example 2: Extract Relevant Content
53 |
54 | In this example, we instruct the LLM to extract only content related to technology from the NBC News business page.
55 |
56 | ```hljs python
57 | import os
58 | import json
59 | import asyncio
60 | from crawl4ai import AsyncWebCrawler
61 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
62 |
63 | async def extract_tech_content():
64 | async with AsyncWebCrawler(verbose=True) as crawler:
65 | result = await crawler.arun(
66 | url="https://www.nbcnews.com/business",
67 | extraction_strategy=LLMExtractionStrategy(
68 | provider="openai/gpt-4o",
69 | api_token=os.getenv('OPENAI_API_KEY'),
70 | instruction="Extract only content related to technology"
71 | ),
72 | bypass_cache=True,
73 | )
74 |
75 | tech_content = json.loads(result.extracted_content)
76 | print(f"Number of tech-related items extracted: {len(tech_content)}")
77 |
78 | with open(".data/tech_content.json", "w", encoding="utf-8") as f:
79 | json.dump(tech_content, f, indent=2)
80 |
81 | asyncio.run(extract_tech_content())
82 |
83 | ```
84 |
85 | ## Advanced Usage: Combining JS Execution with LLM Extraction
86 |
87 | This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content:
88 |
89 | ```hljs python
90 | async def extract_dynamic_content():
91 | js_code = """
92 | const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
93 | if (loadMoreButton) {
94 | loadMoreButton.click();
95 | await new Promise(resolve => setTimeout(resolve, 2000));
96 | }
97 | """
98 |
99 | wait_for = """
100 | () => {
101 | const articles = document.querySelectorAll('article.tease-card');
102 | return articles.length > 10;
103 | }
104 | """
105 |
106 | async with AsyncWebCrawler(verbose=True) as crawler:
107 | result = await crawler.arun(
108 | url="https://www.nbcnews.com/business",
109 | js_code=js_code,
110 | wait_for=wait_for,
111 | css_selector="article.tease-card",
112 | extraction_strategy=LLMExtractionStrategy(
113 | provider="openai/gpt-4o",
114 | api_token=os.getenv('OPENAI_API_KEY'),
115 | instruction="Summarize each article, focusing on technology-related content"
116 | ),
117 | bypass_cache=True,
118 | )
119 |
120 | summaries = json.loads(result.extracted_content)
121 | print(f"Number of summarized articles: {len(summaries)}")
122 |
123 | with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
124 | json.dump(summaries, f, indent=2)
125 |
126 | asyncio.run(extract_dynamic_content())
127 |
128 | ```
129 |
130 | ## Customizing LLM Provider
131 |
132 | Crawl4AI uses the `litellm` library under the hood, so you can use any LLM provider that `litellm` supports. Just pass the provider/model name and the matching API token:
133 |
134 | ```hljs python
135 | extraction_strategy=LLMExtractionStrategy(
136 | provider="your_llm_provider/model_name",
137 | api_token="your_api_token",
138 | instruction="Your extraction instruction"
139 | )
140 |
141 | ```
142 |
143 | This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs.
144 |
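For instance, the same `LLMExtractionStrategy` can point at a hosted provider or at a local Ollama model just by swapping the `provider` string and token. A minimal sketch (the model names are illustrative and depend on what your `litellm` version supports):

```hljs python
import os
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Hosted provider (API token required); the model name is illustrative
hosted_strategy = LLMExtractionStrategy(
    provider="anthropic/claude-3-5-sonnet-20240620",
    api_token=os.getenv("ANTHROPIC_API_KEY"),
    instruction="Extract the main points from the page"
)

# Local Ollama model (no API token needed); the model name is illustrative
local_strategy = LLMExtractionStrategy(
    provider="ollama/llama3",
    instruction="Extract the main points from the page"
)
```
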
145 | ## Error Handling and Retries
146 |
147 | When working with external LLM APIs, it's important to handle potential errors and implement retry logic. Here's an example of how you might do this:
148 |
149 | ```hljs python
150 | import asyncio
151 | from tenacity import retry, stop_after_attempt, wait_exponential
152 |
153 | class LLMExtractionError(Exception):
154 | pass
155 |
156 | @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
157 | async def extract_with_retry(crawler, url, extraction_strategy):
158 | try:
159 | result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
160 | return json.loads(result.extracted_content)
161 | except Exception as e:
162 | raise LLMExtractionError(f"Failed to extract content: {str(e)}")
163 |
164 | async def main():
165 | async with AsyncWebCrawler(verbose=True) as crawler:
166 | try:
167 | content = await extract_with_retry(
168 | crawler,
169 | "https://www.example.com",
170 | LLMExtractionStrategy(
171 | provider="openai/gpt-4o",
172 | api_token=os.getenv('OPENAI_API_KEY'),
173 | instruction="Extract and summarize main points"
174 | )
175 | )
176 | print("Extracted content:", content)
177 | except LLMExtractionError as e:
178 | print(f"Extraction failed after retries: {e}")
179 |
180 | asyncio.run(main())
181 |
182 | ```
183 |
184 | This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which can help handle temporary failures or rate limiting from the LLM API.
185 |
186 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/magic mode - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Magic Mode & Anti-Bot Protection
2 |
3 | Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution.
4 |
5 | ## Magic Mode
6 |
7 | The easiest way to bypass anti-bot protections:
8 |
9 | ```hljs python
10 | async with AsyncWebCrawler() as crawler:
11 | result = await crawler.arun(
12 | url="https://example.com",
13 | magic=True # Enables all anti-detection features
14 | )
15 |
16 | ```
17 |
18 | Magic Mode automatically:
19 | - Masks browser automation signals
20 | - Simulates human-like behavior
21 | - Overrides navigator properties
22 | - Handles cookie consent popups
23 | - Manages browser fingerprinting
24 | - Randomizes timing patterns
25 |
26 | ## Manual Anti-Bot Options
27 |
28 | While Magic Mode is recommended, you can also configure individual anti-detection features:
29 |
30 | ```hljs python
31 | result = await crawler.arun(
32 | url="https://example.com",
33 | simulate_user=True, # Simulate human behavior
34 | override_navigator=True # Mask automation signals
35 | )
36 |
37 | ```
38 |
39 | Note: When `magic=True` is used, you don't need to set these individual options.
40 |
41 | ## Example: Handling Protected Sites
42 |
43 | ```hljs python
44 | async def crawl_protected_site(url: str):
45 | async with AsyncWebCrawler(headless=True) as crawler:
46 | result = await crawler.arun(
47 | url=url,
48 | magic=True,
49 | remove_overlay_elements=True, # Remove popups/modals
50 | page_timeout=60000 # Increased timeout for protection checks
51 | )
52 |
53 | return result.markdown if result.success else None
54 |
55 | ```
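
To drive the helper from a synchronous entry point (the URL is illustrative):

```hljs python
import asyncio

content = asyncio.run(crawl_protected_site("https://example.com"))
print(content[:200] if content else "Crawl failed")
```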
56 |
57 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/output formats - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Output Formats
2 |
3 | Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction.
4 |
5 | ## Basic Formats
6 |
7 | ```hljs python
8 | result = await crawler.arun(url="https://example.com")
9 |
10 | # Access different formats
11 | raw_html = result.html # Original HTML
12 | clean_html = result.cleaned_html # Sanitized HTML
13 | markdown = result.markdown # Standard markdown
14 | fit_md = result.fit_markdown # Most relevant content in markdown
15 |
16 | ```
17 |
18 | ## Raw HTML
19 |
20 | Original, unmodified HTML from the webpage. Useful when you need to:
21 | - Preserve the exact page structure
22 | - Process HTML with your own tools
23 | - Debug page issues
24 |
25 | ```hljs python
26 | result = await crawler.arun(url="https://example.com")
27 | print(result.html) # Complete HTML including headers, scripts, etc.
28 |
29 | ```
30 |
31 | ## Cleaned HTML
32 |
33 | Sanitized HTML with unnecessary elements removed. Automatically:
34 | - Removes scripts and styles
35 | - Cleans up formatting
36 | - Preserves semantic structure
37 |
38 | ```hljs python
39 | result = await crawler.arun(
40 | url="https://example.com",
41 | excluded_tags=['form', 'header', 'footer'], # Additional tags to remove
42 | keep_data_attributes=False # Remove data-* attributes
43 | )
44 | print(result.cleaned_html)
45 |
46 | ```
47 |
48 | ## Standard Markdown
49 |
50 | HTML converted to clean markdown format. Great for:
51 | - Content analysis
52 | - Documentation
53 | - Readability
54 |
55 | ```hljs python
56 | result = await crawler.arun(
57 | url="https://example.com",
58 | include_links_on_markdown=True # Include links in markdown
59 | )
60 | print(result.markdown)
61 |
62 | ```
63 |
64 | ## Fit Markdown
65 |
66 | Most relevant content extracted and converted to markdown. Ideal for:
67 | - Article extraction
68 | - Main content focus
69 | - Removing boilerplate
70 |
71 | ```hljs python
72 | result = await crawler.arun(url="https://example.com")
73 | print(result.fit_markdown) # Only the main content
74 |
75 | ```
76 |
77 | ## Structured Data Extraction
78 |
79 | Crawl4AI offers two powerful approaches for structured data extraction:
80 |
81 | ### 1. LLM-Based Extraction
82 |
83 | Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy:
84 |
85 | ```hljs python
86 | from pydantic import BaseModel
87 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
88 |
89 | class KnowledgeGraph(BaseModel):
90 | entities: List[dict]
91 | relationships: List[dict]
92 |
93 | strategy = LLMExtractionStrategy(
94 | provider="ollama/nemotron", # or "huggingface/...", "ollama/..."
95 | api_token="your-token", # not needed for Ollama
96 | schema=KnowledgeGraph.schema(),
97 | instruction="Extract entities and relationships from the content"
98 | )
99 |
100 | result = await crawler.arun(
101 | url="https://example.com",
102 | extraction_strategy=strategy
103 | )
104 | knowledge_graph = json.loads(result.extracted_content)
105 |
106 | ```
107 |
108 | ### 2. Pattern-Based Extraction
109 |
110 | For pages with repetitive patterns (e.g., product listings, article feeds), use `JsonCssExtractionStrategy`:
111 |
112 | ```hljs python
113 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
114 |
115 | schema = {
116 | "name": "Product Listing",
117 | "baseSelector": ".product-card", # Repeated element
118 | "fields": [\
119 | {"name": "title", "selector": "h2", "type": "text"},\
120 | {"name": "price", "selector": ".price", "type": "text"},\
121 | {"name": "description", "selector": ".desc", "type": "text"}\
122 | ]
123 | }
124 |
125 | strategy = JsonCssExtractionStrategy(schema)
126 | result = await crawler.arun(
127 | url="https://example.com",
128 | extraction_strategy=strategy
129 | )
130 | products = json.loads(result.extracted_content)
131 |
132 | ```
133 |
134 | ## Content Customization
135 |
136 | ### HTML to Text Options
137 |
138 | Configure markdown conversion:
139 |
140 | ```hljs python
141 | result = await crawler.arun(
142 | url="https://example.com",
143 | html2text={
144 | "escape_dot": False,
145 | "body_width": 0,
146 | "protect_links": True,
147 | "unicode_snob": True
148 | }
149 | )
150 |
151 | ```
152 |
153 | ### Content Filters
154 |
155 | Control what content is included:
156 |
157 | ```hljs python
158 | result = await crawler.arun(
159 | url="https://example.com",
160 | word_count_threshold=10, # Minimum words per block
161 | exclude_external_links=True, # Remove external links
162 | exclude_external_images=True, # Remove external images
163 | excluded_tags=['form', 'nav'] # Remove specific HTML tags
164 | )
165 |
166 | ```
167 |
168 | ## Comprehensive Example
169 |
170 | Here's how to use multiple output formats together:
171 |
172 | ```hljs python
173 | async def crawl_content(url: str):
174 | async with AsyncWebCrawler() as crawler:
175 | # Extract main content with fit markdown
176 | result = await crawler.arun(
177 | url=url,
178 | word_count_threshold=10,
179 | exclude_external_links=True
180 | )
181 |
182 | # Get structured data using LLM
183 | llm_result = await crawler.arun(
184 | url=url,
185 | extraction_strategy=LLMExtractionStrategy(
186 | provider="ollama/nemotron",
187 | schema=YourSchema.schema(),
188 | instruction="Extract key information"
189 | )
190 | )
191 |
192 | # Get repeated patterns (if any)
193 | pattern_result = await crawler.arun(
194 | url=url,
195 | extraction_strategy=JsonCssExtractionStrategy(your_schema)
196 | )
197 |
198 | return {
199 | "main_content": result.fit_markdown,
200 | "structured_data": json.loads(llm_result.extracted_content),
201 | "pattern_data": json.loads(pattern_result.extracted_content),
202 | "media": result.media
203 | }
204 |
205 | ```
206 |
207 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/overview - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Extraction Strategies Overview
2 |
3 | Crawl4AI provides powerful extraction strategies to help you get structured data from web pages. Each strategy is designed for specific use cases and offers different approaches to data extraction.
4 |
5 | ## Available Strategies
6 |
7 | ### [LLM-Based Extraction](../llm/)
8 |
9 | `LLMExtractionStrategy` uses Language Models to extract structured data from web content. This approach is highly flexible and can understand content semantically.
10 |
11 | ```hljs python
12 | from pydantic import BaseModel
13 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
14 |
15 | class Product(BaseModel):
16 | name: str
17 | price: float
18 | description: str
19 |
20 | strategy = LLMExtractionStrategy(
21 | provider="ollama/llama2",
22 | schema=Product.schema(),
23 | instruction="Extract product details from the page"
24 | )
25 |
26 | result = await crawler.arun(
27 | url="https://example.com/product",
28 | extraction_strategy=strategy
29 | )
30 |
31 | ```
32 |
33 | **Best for:**
34 | - Complex data structures
35 | - Content requiring interpretation
36 | - Flexible content formats
37 | - Natural language processing
38 |
39 | ### [CSS-Based Extraction](../css/)
40 |
41 | `JsonCssExtractionStrategy` extracts data using CSS selectors. This is fast, reliable, and perfect for consistently structured pages.
42 |
43 | ```hljs python
44 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
45 |
46 | schema = {
47 | "name": "Product Listing",
48 | "baseSelector": ".product-card",
49 | "fields": [\
50 | {"name": "title", "selector": "h2", "type": "text"},\
51 | {"name": "price", "selector": ".price", "type": "text"},\
52 | {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}\
53 | ]
54 | }
55 |
56 | strategy = JsonCssExtractionStrategy(schema)
57 |
58 | result = await crawler.arun(
59 | url="https://example.com/products",
60 | extraction_strategy=strategy
61 | )
62 |
63 | ```
64 |
65 | **Best for:**
66 | - E-commerce product listings
67 | - News article collections
68 | - Structured content pages
69 | - High-performance needs
70 |
71 | ### [Cosine Strategy](../cosine/)
72 |
73 | `CosineStrategy` uses similarity-based clustering to identify and extract relevant content sections.
74 |
75 | ```hljs python
76 | from crawl4ai.extraction_strategy import CosineStrategy
77 |
78 | strategy = CosineStrategy(
79 | semantic_filter="product reviews", # Content focus
80 | word_count_threshold=10, # Minimum words per cluster
81 | sim_threshold=0.3, # Similarity threshold
82 | max_dist=0.2, # Maximum cluster distance
83 | top_k=3 # Number of top clusters to extract
84 | )
85 |
86 | result = await crawler.arun(
87 | url="https://example.com/reviews",
88 | extraction_strategy=strategy
89 | )
90 |
91 | ```
92 |
93 | **Best for:**
94 | - Content similarity analysis
95 | - Topic clustering
96 | - Relevant content extraction
97 | - Pattern recognition in text
98 |
99 | ## Strategy Selection Guide
100 |
101 | Choose your strategy based on these factors:
102 |
103 | 1. **Content Structure**
104 |    - Well-structured HTML → Use CSS Strategy
105 |    - Natural language text → Use LLM Strategy
106 |    - Mixed/Complex content → Use Cosine Strategy
107 |
108 | 2. **Performance Requirements**
109 |
110 |    - Fastest: CSS Strategy
111 |    - Moderate: Cosine Strategy
112 |    - Variable: LLM Strategy (depends on provider)
113 |
114 | 3. **Accuracy Needs**
115 |
116 |    - Highest structure accuracy: CSS Strategy
117 |    - Best semantic understanding: LLM Strategy
118 |    - Best content relevance: Cosine Strategy
119 |
120 | ## Combining Strategies
121 |
122 | You can combine strategies for more powerful extraction:
123 |
124 | ```hljs python
125 | # First use CSS strategy for initial structure
126 | css_result = await crawler.arun(
127 | url="https://example.com",
128 | extraction_strategy=css_strategy
129 | )
130 |
131 | # Then use LLM for semantic analysis
132 | llm_result = await crawler.arun(
133 | url="https://example.com",
134 | extraction_strategy=llm_strategy
135 | )
136 |
137 | ```
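
One simple way to stitch the two passes together is to parse both payloads and keep them side by side. A minimal sketch (the key names are purely illustrative):

```hljs python
import json

# Combine the structural pass and the semantic pass into one record
combined = {
    "structure": json.loads(css_result.extracted_content),
    "analysis": json.loads(llm_result.extracted_content)
}
```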
138 |
139 | ## Common Use Cases
140 |
141 | 1. **E-commerce Scraping**
142 |
143 |
144 |
145 | ```hljs python
146 | # CSS Strategy for product listings
147 | schema = {
148 | "name": "Products",
149 | "baseSelector": ".product",
150 | "fields": [\
151 | {"name": "name", "selector": ".title", "type": "text"},\
152 | {"name": "price", "selector": ".price", "type": "text"}\
153 | ]
154 | }
155 |
156 | ```
157 |
158 | 2. **News Article Extraction**
159 |
160 |
161 |
162 | ```hljs python
163 | # LLM Strategy for article content
164 | class Article(BaseModel):
165 | title: str
166 | content: str
167 | author: str
168 | date: str
169 |
170 | strategy = LLMExtractionStrategy(
171 | provider="ollama/llama2",
172 | schema=Article.schema()
173 | )
174 |
175 | ```
176 |
177 | 3. **Content Analysis**
178 |
179 |
180 |
181 | ```hljs python
182 | # Cosine Strategy for topic analysis
183 | strategy = CosineStrategy(
184 | semantic_filter="technology trends",
185 | top_k=5
186 | )
187 |
188 | ```
189 |
190 |
191 | ## Best Practices
192 |
193 | 1. **Choose the Right Strategy**
194 |    - Start with CSS for structured data
195 |    - Use LLM for complex interpretation
196 |    - Try Cosine for content relevance
197 |
198 | 2. **Optimize Performance**
199 |
200 |    - Cache LLM results
201 |    - Keep CSS selectors specific
202 |    - Tune similarity thresholds
203 |
204 | 3. **Handle Errors**
205 |
206 |
207 |
208 | ```hljs python
209 | result = await crawler.arun(
210 | url="https://example.com",
211 | extraction_strategy=strategy
212 | )
213 |
214 | if not result.success:
215 | print(f"Extraction failed: {result.error_message}")
216 | else:
217 | data = json.loads(result.extracted_content)
218 |
219 | ```
220 |
221 |
222 | Each strategy has its strengths and optimal use cases. Explore the detailed documentation for each strategy to learn more about their specific features and configurations.
223 |
224 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/page interaction - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Page Interaction
2 |
3 | Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
4 |
5 | ## JavaScript Execution
6 |
7 | ### Basic Execution
8 |
9 | ```hljs python
10 | # Single JavaScript command
11 | result = await crawler.arun(
12 | url="https://example.com",
13 | js_code="window.scrollTo(0, document.body.scrollHeight);"
14 | )
15 |
16 | # Multiple commands
17 | js_commands = [\
18 | "window.scrollTo(0, document.body.scrollHeight);",\
19 | "document.querySelector('.load-more').click();",\
20 | "document.querySelector('#consent-button').click();"\
21 | ]
22 | result = await crawler.arun(
23 | url="https://example.com",
24 | js_code=js_commands
25 | )
26 |
27 | ```
28 |
29 | ## Wait Conditions
30 |
31 | ### CSS-Based Waiting
32 |
33 | Wait for elements to appear:
34 |
35 | ```hljs python
36 | result = await crawler.arun(
37 | url="https://example.com",
38 | wait_for="css:.dynamic-content" # Wait for element with class 'dynamic-content'
39 | )
40 |
41 | ```
42 |
43 | ### JavaScript-Based Waiting
44 |
45 | Wait for custom conditions:
46 |
47 | ```hljs python
48 | # Wait for number of elements
49 | wait_condition = """() => {
50 | return document.querySelectorAll('.item').length > 10;
51 | }"""
52 |
53 | result = await crawler.arun(
54 | url="https://example.com",
55 | wait_for=f"js:{wait_condition}"
56 | )
57 |
58 | # Wait for dynamic content to load
59 | wait_for_content = """() => {
60 | const content = document.querySelector('.content');
61 | return content && content.innerText.length > 100;
62 | }"""
63 |
64 | result = await crawler.arun(
65 | url="https://example.com",
66 | wait_for=f"js:{wait_for_content}"
67 | )
68 |
69 | ```
70 |
71 | ## Handling Dynamic Content
72 |
73 | ### Load More Content
74 |
75 | Handle infinite scroll or load more buttons:
76 |
77 | ```hljs python
78 | # Scroll and wait pattern
79 | result = await crawler.arun(
80 | url="https://example.com",
81 | js_code=[\
82 | # Scroll to bottom\
83 | "window.scrollTo(0, document.body.scrollHeight);",\
84 | # Click load more if exists\
85 | "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"\
86 | ],
87 | # Wait for new content
88 | wait_for="js:() => document.querySelectorAll('.item').length > previousCount"
89 | )
90 |
91 | ```
92 |
93 | ### Form Interaction
94 |
95 | Handle forms and inputs:
96 |
97 | ```hljs python
98 | js_form_interaction = """
99 | // Fill form fields
100 | document.querySelector('#search').value = 'search term';
101 | // Submit form
102 | document.querySelector('form').submit();
103 | """
104 |
105 | result = await crawler.arun(
106 | url="https://example.com",
107 | js_code=js_form_interaction,
108 | wait_for="css:.results" # Wait for results to load
109 | )
110 |
111 | ```
112 |
113 | ## Timing Control
114 |
115 | ### Delays and Timeouts
116 |
117 | Control timing of interactions:
118 |
119 | ```hljs python
120 | result = await crawler.arun(
121 | url="https://example.com",
122 | page_timeout=60000, # Page load timeout (ms)
123 | delay_before_return_html=2.0, # Wait before capturing content
124 | )
125 |
126 | ```
127 |
128 | ## Complex Interactions Example
129 |
130 | Here's an example of handling a dynamic page with multiple interactions:
131 |
132 | ```hljs python
133 | async def crawl_dynamic_content():
134 | async with AsyncWebCrawler() as crawler:
135 | # Initial page load
136 | result = await crawler.arun(
137 | url="https://example.com",
138 | # Handle cookie consent
139 | js_code="document.querySelector('.cookie-accept')?.click();",
140 | wait_for="css:.main-content"
141 | )
142 |
143 | # Load more content
144 | session_id = "dynamic_session" # Keep session for multiple interactions
145 |
146 | for page in range(3): # Load 3 pages of content
147 | result = await crawler.arun(
148 | url="https://example.com",
149 | session_id=session_id,
150 | js_code=[\
151 | # Scroll to bottom\
152 | "window.scrollTo(0, document.body.scrollHeight);",\
153 | # Store current item count\
154 | "window.previousCount = document.querySelectorAll('.item').length;",\
155 | # Click load more\
156 | "document.querySelector('.load-more')?.click();"\
157 | ],
158 | # Wait for new items
159 | wait_for="""() => {
160 | const currentCount = document.querySelectorAll('.item').length;
161 | return currentCount > window.previousCount;
162 | }""",
163 | # Only execute JS without reloading page
164 | js_only=True if page > 0 else False
165 | )
166 |
167 | # Process content after each load
168 | print(f"Page {page + 1} items:", len(result.cleaned_html))
169 |
170 | # Clean up session
171 | await crawler.crawler_strategy.kill_session(session_id)
172 |
173 | ```
174 |
175 | ## Using with Extraction Strategies
176 |
177 | Combine page interaction with structured extraction:
178 |
179 | ```hljs python
180 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
181 |
182 | # Pattern-based extraction after interaction
183 | schema = {
184 | "name": "Dynamic Items",
185 | "baseSelector": ".item",
186 | "fields": [\
187 | {"name": "title", "selector": "h2", "type": "text"},\
188 | {"name": "description", "selector": ".desc", "type": "text"}\
189 | ]
190 | }
191 |
192 | result = await crawler.arun(
193 | url="https://example.com",
194 | js_code="window.scrollTo(0, document.body.scrollHeight);",
195 | wait_for="css:.item:nth-child(10)", # Wait for 10 items
196 | extraction_strategy=JsonCssExtractionStrategy(schema)
197 | )
198 |
199 | # Or use LLM to analyze dynamic content
200 | class ContentAnalysis(BaseModel):
201 | topics: List[str]
202 | summary: str
203 |
204 | result = await crawler.arun(
205 | url="https://example.com",
206 | js_code="document.querySelector('.show-more').click();",
207 | wait_for="css:.full-content",
208 | extraction_strategy=LLMExtractionStrategy(
209 | provider="ollama/nemotron",
210 | schema=ContentAnalysis.schema(),
211 | instruction="Analyze the full content"
212 | )
213 | )
214 |
215 | ```
216 |
217 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/parameters table - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Parameter Reference Table
2 |
3 | | File Name | Parameter Name | Code Usage | Strategy/Class | Description |
4 | | --- | --- | --- | --- | --- |
5 | | async\_crawler\_strategy.py | user\_agent | `kwargs.get("user_agent")` | AsyncPlaywrightCrawlerStrategy | User agent string for browser identification |
6 | | async\_crawler\_strategy.py | proxy | `kwargs.get("proxy")` | AsyncPlaywrightCrawlerStrategy | Proxy server configuration for network requests |
7 | | async\_crawler\_strategy.py | proxy\_config | `kwargs.get("proxy_config")` | AsyncPlaywrightCrawlerStrategy | Detailed proxy configuration including auth |
8 | | async\_crawler\_strategy.py | headless | `kwargs.get("headless", True)` | AsyncPlaywrightCrawlerStrategy | Whether to run browser in headless mode |
9 | | async\_crawler\_strategy.py | browser\_type | `kwargs.get("browser_type", "chromium")` | AsyncPlaywrightCrawlerStrategy | Type of browser to use (chromium/firefox/webkit) |
10 | | async\_crawler\_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests |
11 | | async\_crawler\_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output |
12 | | async\_crawler\_strategy.py | sleep\_on\_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser |
13 | | async\_crawler\_strategy.py | use\_managed\_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance |
14 | | async\_crawler\_strategy.py | user\_data\_dir | `kwargs.get("user_data_dir", None)` | AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data |
15 | | async\_crawler\_strategy.py | session\_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session |
16 | | async\_crawler\_strategy.py | override\_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties |
17 | | async\_crawler\_strategy.py | simulate\_user | `kwargs.get("simulate_user", False)` | AsyncPlaywrightCrawlerStrategy | Simulate human-like behavior |
18 | | async\_crawler\_strategy.py | magic | `kwargs.get("magic", False)` | AsyncPlaywrightCrawlerStrategy | Enable advanced anti-detection features |
19 | | async\_crawler\_strategy.py | log\_console | `kwargs.get("log_console", False)` | AsyncPlaywrightCrawlerStrategy | Log browser console messages |
20 | | async\_crawler\_strategy.py | js\_only | `kwargs.get("js_only", False)` | AsyncPlaywrightCrawlerStrategy | Only execute JavaScript without page load |
21 | | async\_crawler\_strategy.py | page\_timeout | `kwargs.get("page_timeout", 60000)` | AsyncPlaywrightCrawlerStrategy | Timeout for page load in milliseconds |
22 | | async\_crawler\_strategy.py | ignore\_body\_visibility | `kwargs.get("ignore_body_visibility", True)` | AsyncPlaywrightCrawlerStrategy | Process page even if body is hidden |
23 | | async\_crawler\_strategy.py | js\_code | `kwargs.get("js_code", kwargs.get("js", self.js_code))` | AsyncPlaywrightCrawlerStrategy | Custom JavaScript code to execute |
24 | | async\_crawler\_strategy.py | wait\_for | `kwargs.get("wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait for specific element/condition |
25 | | async\_crawler\_strategy.py | process\_iframes | `kwargs.get("process_iframes", False)` | AsyncPlaywrightCrawlerStrategy | Extract content from iframes |
26 | | async\_crawler\_strategy.py | delay\_before\_return\_html | `kwargs.get("delay_before_return_html")` | AsyncPlaywrightCrawlerStrategy | Additional delay before returning HTML |
27 | | async\_crawler\_strategy.py | remove\_overlay\_elements | `kwargs.get("remove_overlay_elements", False)` | AsyncPlaywrightCrawlerStrategy | Remove pop-ups and overlay elements |
28 | | async\_crawler\_strategy.py | screenshot | `kwargs.get("screenshot")` | AsyncPlaywrightCrawlerStrategy | Take page screenshot |
29 | | async\_crawler\_strategy.py | screenshot\_wait\_for | `kwargs.get("screenshot_wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait before taking screenshot |
30 | | async\_crawler\_strategy.py | semaphore\_count | `kwargs.get("semaphore_count", 5)` | AsyncPlaywrightCrawlerStrategy | Concurrent request limit |
31 | | async\_webcrawler.py | verbose | `kwargs.get("verbose", False)` | AsyncWebCrawler | Enable detailed logging |
32 | | async\_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request |
33 | | async\_webcrawler.py | session\_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse |
34 | | async\_webcrawler.py | only\_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content |
35 | | async\_webcrawler.py | bypass\_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
36 |
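All of these parameters are passed as plain keyword arguments, either to the `AsyncWebCrawler` constructor or to `arun()`. Here is a hedged sketch combining a few rows from the table (the URL and values are illustrative):

```hljs python
from crawl4ai import AsyncWebCrawler

async def crawl_with_options():
    # Constructor-level options: browser_type, headless, verbose
    async with AsyncWebCrawler(browser_type="chromium", headless=True, verbose=True) as crawler:
        # Per-request options: page_timeout, remove_overlay_elements, process_iframes, bypass_cache
        return await crawler.arun(
            url="https://example.com",
            page_timeout=60000,
            remove_overlay_elements=True,
            process_iframes=True,
            bypass_cache=True
        )
```
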
37 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/proxy security - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Proxy & Security
2 |
3 | Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction.
4 |
5 | ## Basic Proxy Setup
6 |
7 | Simple proxy configuration:
8 |
9 | ```hljs python
10 | # Using proxy URL
11 | async with AsyncWebCrawler(
12 | proxy="http://proxy.example.com:8080"
13 | ) as crawler:
14 | result = await crawler.arun(url="https://example.com")
15 |
16 | # Using SOCKS proxy
17 | async with AsyncWebCrawler(
18 | proxy="socks5://proxy.example.com:1080"
19 | ) as crawler:
20 | result = await crawler.arun(url="https://example.com")
21 |
22 | ```
23 |
24 | ## Authenticated Proxy
25 |
26 | Use proxy with authentication:
27 |
28 | ```hljs python
29 | proxy_config = {
30 | "server": "http://proxy.example.com:8080",
31 | "username": "user",
32 | "password": "pass"
33 | }
34 |
35 | async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
36 | result = await crawler.arun(url="https://example.com")
37 |
38 | ```
39 |
40 | ## Rotating Proxies
41 |
42 | Example using a proxy rotation service:
43 |
44 | ```hljs python
45 | async def get_next_proxy():
46 | # Your proxy rotation logic here
47 | return {"server": "http://next.proxy.com:8080"}
48 |
49 | async with AsyncWebCrawler() as crawler:
50 | # Update proxy for each request
51 | for url in urls:
52 | proxy = await get_next_proxy()
53 | crawler.update_proxy(proxy)
54 | result = await crawler.arun(url=url)
55 |
56 | ```
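
If you do not use a rotation service, one illustrative way to back the `get_next_proxy()` placeholder above is a simple round-robin over a static pool (the proxy URLs are placeholders):

```hljs python
from itertools import cycle

# Placeholder proxy pool; replace with your own endpoints
PROXIES = cycle([
    {"server": "http://proxy1.example.com:8080"},
    {"server": "http://proxy2.example.com:8080"},
])

async def get_next_proxy():
    # Round-robin selection from the static pool
    return next(PROXIES)
```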
57 |
58 | ## Custom Headers
59 |
60 | Add security-related headers:
61 |
62 | ```hljs python
63 | headers = {
64 | "X-Forwarded-For": "203.0.113.195",
65 | "Accept-Language": "en-US,en;q=0.9",
66 | "Cache-Control": "no-cache",
67 | "Pragma": "no-cache"
68 | }
69 |
70 | async with AsyncWebCrawler(headers=headers) as crawler:
71 | result = await crawler.arun(url="https://example.com")
72 |
73 | ```
74 |
75 | ## Combining with Magic Mode
76 |
77 | For maximum protection, combine proxy with Magic Mode:
78 |
79 | ```hljs python
80 | async with AsyncWebCrawler(
81 | proxy="http://proxy.example.com:8080",
82 | headers={"Accept-Language": "en-US"}
83 | ) as crawler:
84 | result = await crawler.arun(
85 | url="https://example.com",
86 | magic=True # Enable all anti-detection features
87 | )
88 |
89 | ```
90 |
91 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/session management - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Session Management
2 |
3 | Session management in Crawl4AI allows you to maintain state across multiple requests and handle complex multi-page crawling tasks, which is particularly useful for dynamic websites.
4 |
5 | ## Basic Session Usage
6 |
7 | Use `session_id` to maintain state between requests:
8 |
9 | ```hljs python
10 | async with AsyncWebCrawler() as crawler:
11 | session_id = "my_session"
12 |
13 | # First request
14 | result1 = await crawler.arun(
15 | url="https://example.com/page1",
16 | session_id=session_id
17 | )
18 |
19 | # Subsequent request using same session
20 | result2 = await crawler.arun(
21 | url="https://example.com/page2",
22 | session_id=session_id
23 | )
24 |
25 | # Clean up when done
26 | await crawler.crawler_strategy.kill_session(session_id)
27 |
28 | ```
29 |
30 | ## Dynamic Content with Sessions
31 |
32 | Here's a real-world example of crawling GitHub commits across multiple pages:
33 |
34 | ```hljs python
35 | async def crawl_dynamic_content():
36 | async with AsyncWebCrawler(verbose=True) as crawler:
37 | url = "https://github.com/microsoft/TypeScript/commits/main"
38 | session_id = "typescript_commits_session"
39 | all_commits = []
40 |
41 | # Define navigation JavaScript
42 | js_next_page = """
43 | const button = document.querySelector('a[data-testid="pagination-next-button"]');
44 | if (button) button.click();
45 | """
46 |
47 | # Define wait condition
48 | wait_for = """() => {
49 | const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
50 | if (commits.length === 0) return false;
51 | const firstCommit = commits[0].textContent.trim();
52 | return firstCommit !== window.firstCommit;
53 | }"""
54 |
55 | # Define extraction schema
56 | schema = {
57 | "name": "Commit Extractor",
58 | "baseSelector": "li.Box-sc-g0xbh4-0",
59 | "fields": [\
60 | {\
61 | "name": "title",\
62 | "selector": "h4.markdown-title",\
63 | "type": "text",\
64 | "transform": "strip",\
65 | },\
66 | ],
67 | }
68 | extraction_strategy = JsonCssExtractionStrategy(schema)
69 |
70 | # Crawl multiple pages
71 | for page in range(3):
72 | result = await crawler.arun(
73 | url=url,
74 | session_id=session_id,
75 | extraction_strategy=extraction_strategy,
76 | js_code=js_next_page if page > 0 else None,
77 | wait_for=wait_for if page > 0 else None,
78 | js_only=page > 0,
79 | bypass_cache=True
80 | )
81 |
82 | if result.success:
83 | commits = json.loads(result.extracted_content)
84 | all_commits.extend(commits)
85 | print(f"Page {page + 1}: Found {len(commits)} commits")
86 |
87 | # Clean up session
88 | await crawler.crawler_strategy.kill_session(session_id)
89 | return all_commits
90 |
91 | ```
92 |
93 | ## Session Best Practices
94 |
95 | 1. **Session Naming**:
96 |
97 |
98 |
99 |
100 | ```hljs python
101 | # Use descriptive session IDs
102 | session_id = "login_flow_session"
103 | session_id = "product_catalog_session"
104 |
105 | ```
106 |
107 | 2. **Resource Management**:
108 |
109 |
110 |
111 |
112 | ```hljs python
113 | try:
114 | # Your crawling code
115 | pass
116 | finally:
117 | # Always clean up sessions
118 | await crawler.crawler_strategy.kill_session(session_id)
119 |
120 | ```
121 |
122 | 3. **State Management**:
123 |
124 |
125 |
126 |
127 | ```hljs python
128 | # First page: login
129 | result = await crawler.arun(
130 | url="https://example.com/login",
131 | session_id=session_id,
132 | js_code="document.querySelector('form').submit();"
133 | )
134 |
135 | # Second page: verify login success
136 | result = await crawler.arun(
137 | url="https://example.com/dashboard",
138 | session_id=session_id,
139 | wait_for="css:.user-profile" # Wait for authenticated content
140 | )
141 |
142 | ```
143 |
144 |
145 | ## Common Use Cases
146 |
147 | 1. **Authentication Flows**
148 | 2. **Pagination Handling**
149 | 3. **Form Submissions**
150 | 4. **Multi-step Processes**
151 | 5. **Dynamic Content Navigation**
152 |
153 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/simple crawling - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Simple Crawling
2 |
3 | This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response.
4 |
5 | ## Basic Usage
6 |
7 | Here's the simplest way to crawl a webpage:
8 |
9 | ```hljs python
10 | import asyncio
11 | from crawl4ai import AsyncWebCrawler
12 |
13 | async def main():
14 | async with AsyncWebCrawler() as crawler:
15 | result = await crawler.arun(url="https://example.com")
16 | print(result.markdown) # Print clean markdown content
17 |
18 | if __name__ == "__main__":
19 | asyncio.run(main())
20 |
21 | ```
22 |
23 | ## Understanding the Response
24 |
25 | The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../../api/crawl-result/) for complete details):
26 |
27 | ```hljs python
28 | result = await crawler.arun(url="https://example.com")
29 |
30 | # Different content formats
31 | print(result.html) # Raw HTML
32 | print(result.cleaned_html) # Cleaned HTML
33 | print(result.markdown) # Markdown version
34 | print(result.fit_markdown) # Most relevant content in markdown
35 |
36 | # Check success status
37 | print(result.success) # True if crawl succeeded
38 | print(result.status_code) # HTTP status code (e.g., 200, 404)
39 |
40 | # Access extracted media and links
41 | print(result.media) # Dictionary of found media (images, videos, audio)
42 | print(result.links) # Dictionary of internal and external links
43 |
44 | ```
45 |
46 | ## Adding Basic Options
47 |
48 | Customize your crawl with these common options:
49 |
50 | ```hljs python
51 | result = await crawler.arun(
52 | url="https://example.com",
53 | word_count_threshold=10, # Minimum words per content block
54 | exclude_external_links=True, # Remove external links
55 | remove_overlay_elements=True, # Remove popups/modals
56 | process_iframes=True # Process iframe content
57 | )
58 |
59 | ```
60 |
61 | ## Handling Errors
62 |
63 | Always check if the crawl was successful:
64 |
65 | ```hljs python
66 | result = await crawler.arun(url="https://example.com")
67 | if not result.success:
68 | print(f"Crawl failed: {result.error_message}")
69 | print(f"Status code: {result.status_code}")
70 |
71 | ```
72 |
73 | ## Logging and Debugging
74 |
75 | Enable verbose mode for detailed logging:
76 |
77 | ```hljs python
78 | async with AsyncWebCrawler(verbose=True) as crawler:
79 | result = await crawler.arun(url="https://example.com")
80 |
81 | ```
82 |
83 | ## Complete Example
84 |
85 | Here's a more comprehensive example showing common usage patterns:
86 |
87 | ```hljs python
88 | import asyncio
89 | from crawl4ai import AsyncWebCrawler
90 |
91 | async def main():
92 | async with AsyncWebCrawler(verbose=True) as crawler:
93 | result = await crawler.arun(
94 | url="https://example.com",
95 | # Content filtering
96 | word_count_threshold=10,
97 | excluded_tags=['form', 'header'],
98 | exclude_external_links=True,
99 |
100 | # Content processing
101 | process_iframes=True,
102 | remove_overlay_elements=True,
103 |
104 | # Cache control
105 | bypass_cache=False # Use cache if available
106 | )
107 |
108 | if result.success:
109 | # Print clean content
110 | print("Content:", result.markdown[:500]) # First 500 chars
111 |
112 | # Process images
113 | for image in result.media["images"]:
114 | print(f"Found image: {image['src']}")
115 |
116 | # Process links
117 | for link in result.links["internal"]:
118 | print(f"Internal link: {link['href']}")
119 |
120 | else:
121 | print(f"Crawl failed: {result.error_message}")
122 |
123 | if __name__ == "__main__":
124 | asyncio.run(main())
125 |
126 | ```
127 |
128 | * * *
--------------------------------------------------------------------------------
/docs/crawl4ai/strategies - crawl4ai documentation.md:
--------------------------------------------------------------------------------
1 | # Extraction & Chunking Strategies API
2 |
3 | This documentation covers the API reference for extraction and chunking strategies in Crawl4AI.
4 |
5 | ## Extraction Strategies
6 |
7 | All extraction strategies inherit from the base `ExtractionStrategy` class and implement two key methods:
8 | - `extract(url: str, html: str) -> List[Dict[str, Any]]`
9 | - `run(url: str, sections: List[str]) -> List[Dict[str, Any]]`
10 |
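Below is a minimal sketch of a custom strategy, assuming the two methods above are the full required interface; the class name and the naive parsing are purely illustrative:

```hljs python
from typing import Any, Dict, List
from crawl4ai.extraction_strategy import ExtractionStrategy

class TitleOnlyStrategy(ExtractionStrategy):
    """Illustrative strategy that returns only the <title> text of each section."""

    def extract(self, url: str, html: str, *args, **kwargs) -> List[Dict[str, Any]]:
        # Naive string search, for illustration only
        start, end = html.find("<title>"), html.find("</title>")
        title = html[start + len("<title>"):end] if start != -1 and end != -1 else ""
        return [{"url": url, "title": title}]

    def run(self, url: str, sections: List[str], *args, **kwargs) -> List[Dict[str, Any]]:
        results: List[Dict[str, Any]] = []
        for section in sections:
            results.extend(self.extract(url, section))
        return results
```
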
11 | ### LLMExtractionStrategy
12 |
13 | Used for extracting structured data using Language Models.
14 |
15 | ```hljs python
16 | LLMExtractionStrategy(
17 | # Required Parameters
18 | provider: str = DEFAULT_PROVIDER, # LLM provider (e.g., "ollama/llama2")
19 | api_token: Optional[str] = None, # API token
20 |
21 | # Extraction Configuration
22 | instruction: str = None, # Custom extraction instruction
23 | schema: Dict = None, # Pydantic model schema for structured data
24 | extraction_type: str = "block", # "block" or "schema"
25 |
26 | # Chunking Parameters
27 | chunk_token_threshold: int = 4000, # Maximum tokens per chunk
28 | overlap_rate: float = 0.1, # Overlap between chunks
29 | word_token_rate: float = 0.75, # Word to token conversion rate
30 | apply_chunking: bool = True, # Enable/disable chunking
31 |
32 | # API Configuration
33 | base_url: str = None, # Base URL for API
34 | extra_args: Dict = {}, # Additional provider arguments
35 | verbose: bool = False # Enable verbose logging
36 | )
37 |
38 | ```
39 |
40 | ### CosineStrategy
41 |
42 | Used for content similarity-based extraction and clustering.
43 |
44 | ```hljs python
45 | CosineStrategy(
46 | # Content Filtering
47 | semantic_filter: str = None, # Topic/keyword filter
48 | word_count_threshold: int = 10, # Minimum words per cluster
49 | sim_threshold: float = 0.3, # Similarity threshold
50 |
51 | # Clustering Parameters
52 | max_dist: float = 0.2, # Maximum cluster distance
53 | linkage_method: str = 'ward', # Clustering method
54 | top_k: int = 3, # Top clusters to return
55 |
56 | # Model Configuration
57 | model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model
58 |
59 | verbose: bool = False # Enable verbose logging
60 | )
61 |
62 | ```
63 |
64 | ### JsonCssExtractionStrategy
65 |
66 | Used for CSS selector-based structured data extraction.
67 |
68 | ```hljs python
69 | JsonCssExtractionStrategy(
70 | schema: Dict[str, Any], # Extraction schema
71 | verbose: bool = False # Enable verbose logging
72 | )
73 |
74 | # Schema Structure
75 | schema = {
76 | "name": str, # Schema name
77 | "baseSelector": str, # Base CSS selector
78 | "fields": [ # List of fields to extract\
79 | {\
80 | "name": str, # Field name\
81 | "selector": str, # CSS selector\
82 | "type": str, # Field type: "text", "attribute", "html", "regex"\
83 | "attribute": str, # For type="attribute"\
84 | "pattern": str, # For type="regex"\
85 | "transform": str, # Optional: "lowercase", "uppercase", "strip"\
86 | "default": Any # Default value if extraction fails\
87 | }\
88 | ]
89 | }
90 |
91 | ```
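
For instance, a field of type `"regex"` pairs a `selector` with a `pattern`; a small sketch that aims to capture just the numeric part of a price (the schema name, selector, and pattern are illustrative):

```hljs python
price_schema = {
    "name": "Price Capture",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "price_value",
            "selector": ".price",
            "type": "regex",
            "pattern": r"(\d+\.\d{2})",
            "default": None
        }
    ]
}
```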
92 |
93 | ## Chunking Strategies
94 |
95 | All chunking strategies inherit from `ChunkingStrategy` and implement the `chunk(text: str) -> list` method.
96 |
97 | ### RegexChunking
98 |
99 | Splits text based on regex patterns.
100 |
101 | ```hljs python
102 | RegexChunking(
103 | patterns: List[str] = None # Regex patterns for splitting
104 | # Default: [r'\n\n']
105 | )
106 |
107 | ```
108 |
109 | ### SlidingWindowChunking
110 |
111 | Creates overlapping chunks with a sliding window approach.
112 |
113 | ```hljs python
114 | SlidingWindowChunking(
115 | window_size: int = 100, # Window size in words
116 | step: int = 50 # Step size between windows
117 | )
118 |
119 | ```
120 |
121 | ### OverlappingWindowChunking
122 |
123 | Creates chunks with specified overlap.
124 |
125 | ```hljs python
126 | OverlappingWindowChunking(
127 | window_size: int = 1000, # Chunk size in words
128 | overlap: int = 100 # Overlap size in words
129 | )
130 |
131 | ```
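
Chunkers can also be exercised on their own by calling `chunk()` directly. A small sketch, assuming the classes above are importable from `crawl4ai.chunking_strategy` as in the usage examples below (the sample text is illustrative):

```hljs python
from crawl4ai.chunking_strategy import RegexChunking, SlidingWindowChunking

text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."

# Default RegexChunking splits on blank lines (pattern r'\n\n')
paragraph_chunks = RegexChunking().chunk(text)

# Overlapping 4-word windows that advance 2 words at a time
window_chunks = SlidingWindowChunking(window_size=4, step=2).chunk(text)

print(paragraph_chunks)
print(window_chunks)
```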
132 |
133 | ## Usage Examples
134 |
135 | ### LLM Extraction
136 |
137 | ```hljs python
138 | from pydantic import BaseModel
139 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
140 |
141 | # Define schema
142 | class Article(BaseModel):
143 | title: str
144 | content: str
145 | author: str
146 |
147 | # Create strategy
148 | strategy = LLMExtractionStrategy(
149 | provider="ollama/llama2",
150 | schema=Article.schema(),
151 | instruction="Extract article details"
152 | )
153 |
154 | # Use with crawler
155 | result = await crawler.arun(
156 | url="https://example.com/article",
157 | extraction_strategy=strategy
158 | )
159 |
160 | # Access extracted data
161 | data = json.loads(result.extracted_content)
162 |
163 | ```
164 |
165 | ### CSS Extraction
166 |
167 | ```hljs python
168 | from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
169 |
170 | # Define schema
171 | schema = {
172 | "name": "Product List",
173 | "baseSelector": ".product-card",
174 | "fields": [\
175 | {\
176 | "name": "title",\
177 | "selector": "h2.title",\
178 | "type": "text"\
179 | },\
180 | {\
181 | "name": "price",\
182 | "selector": ".price",\
183 | "type": "text",\
184 | "transform": "strip"\
185 | },\
186 | {\
187 | "name": "image",\
188 | "selector": "img",\
189 | "type": "attribute",\
190 | "attribute": "src"\
191 | }\
192 | ]
193 | }
194 |
195 | # Create and use strategy
196 | strategy = JsonCssExtractionStrategy(schema)
197 | result = await crawler.arun(
198 | url="https://example.com/products",
199 | extraction_strategy=strategy
200 | )
201 |
202 | ```
203 |
204 | ### Content Chunking
205 |
206 | ```hljs python
207 | from crawl4ai.chunking_strategy import OverlappingWindowChunking
208 |
209 | # Create chunking strategy
210 | chunker = OverlappingWindowChunking(
211 | window_size=500, # 500 words per chunk
212 | overlap=50 # 50 words overlap
213 | )
214 |
215 | # Use with extraction strategy
216 | strategy = LLMExtractionStrategy(
217 | provider="ollama/llama2",
218 | chunking_strategy=chunker
219 | )
220 |
221 | result = await crawler.arun(
222 | url="https://example.com/long-article",
223 | extraction_strategy=strategy
224 | )
225 |
226 | ```
227 |
228 | ## Best Practices
229 |
230 | 1. **Choose the Right Strategy**
231 |    - Use `LLMExtractionStrategy` for complex, unstructured content
232 |    - Use `JsonCssExtractionStrategy` for well-structured HTML
233 |    - Use `CosineStrategy` for content similarity and clustering
234 |
235 | 2. **Optimize Chunking**
236 |
237 |
238 |
239 | ```hljs python
240 | # For long documents
241 | strategy = LLMExtractionStrategy(
242 | chunk_token_threshold=2000, # Smaller chunks
243 | overlap_rate=0.1 # 10% overlap
244 | )
245 |
246 | ```
247 |
248 | 3. **Handle Errors**
249 |
250 |
251 |
252 | ```hljs python
253 | try:
254 | result = await crawler.arun(
255 | url="https://example.com",
256 | extraction_strategy=strategy
257 | )
258 | if result.success:
259 | content = json.loads(result.extracted_content)
260 | except Exception as e:
261 | print(f"Extraction failed: {e}")
262 |
263 | ```
264 |
265 | 4. **Monitor Performance**
266 |
267 |
268 |
269 | ```hljs python
270 | strategy = CosineStrategy(
271 | verbose=True, # Enable logging
272 | word_count_threshold=20, # Filter short content
273 | top_k=5 # Limit results
274 | )
275 |
276 | ```
277 |
278 |
279 | * * *
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import shutil
4 | import re
5 | from fastapi import FastAPI, HTTPException
6 | from fastapi.staticfiles import StaticFiles
7 | from fastapi.responses import FileResponse
8 | from pydantic import BaseModel
9 | from typing import Optional, Dict
10 | from crawl4ai import AsyncWebCrawler
11 | import uuid
12 | import aiofiles
13 | from urllib.parse import urlparse, unquote
14 |
15 | app = FastAPI()
16 |
17 | # Serve static files
18 | app.mount("/static", StaticFiles(directory="static"), name="static")
19 |
20 | # Store crawl jobs status
21 | crawl_jobs: Dict[str, dict] = {}
22 |
23 | class CrawlRequest(BaseModel):
24 | url: str
25 | limit: int = 10
26 |
27 | class CrawlResponse(BaseModel):
28 | job_id: str
29 | status: str
30 | progress: int = 0
31 | total_pages: int = 0
32 | current_url: Optional[str] = None
33 |
34 | def clean_path(url: str, base_url: str) -> str:
35 | """Extract and clean the path from URL relative to base URL"""
36 | # URL decode both URLs to handle any encoded characters
37 | url = unquote(url)
38 | base_url = unquote(base_url)
39 |
40 | # Remove base URL to get the relative path
41 | path = url.replace(base_url, '')
42 |
43 | # If path starts with /, remove it
44 | path = path.lstrip('/')
45 |
46 | # Handle fragment identifiers (#)
47 | if '#' in path:
48 | path = path.split('#')[1] # Take the fragment part
49 | else:
50 | # Remove query parameters if no fragment
51 | path = path.split('?')[0]
52 |
53 | # If path is empty after cleaning, return empty string
54 | if not path:
55 | return ''
56 |
57 | # Clean special characters and convert spaces
58 | clean = re.sub(r'[^\w\s-]', '', path)
59 | clean = re.sub(r'\s+', '_', clean.strip())
60 | return clean.lower()
61 |
62 | async def process_url(url: str, output_dir: str, crawler: AsyncWebCrawler, job_id: str):
63 | """Process a single URL and save markdown"""
64 | try:
65 | result = await crawler.arun(
66 | url=url,
67 | remove_overlay_elements=True,
68 | bypass_cache=True
69 | )
70 |
71 | if result.success:
72 | # Get title from metadata
73 | metadata = result.metadata
74 | title = metadata['title']
75 | # Clean title for filename
76 | clean_title = re.sub(r'[^\w\s-]', '', title)
77 | clean_title = re.sub(r'\s+', '_', clean_title.strip())
78 |
79 | # Get and clean URL path
80 | path_suffix = clean_path(url, crawl_jobs[job_id]["base_url"])
81 |
82 | # Combine title and path for unique filename
83 | filename = f"{clean_title.lower()}"
84 | if path_suffix:
85 | filename += f"_{path_suffix}"
86 | filename += ".md"
87 |
88 | # Save markdown
89 | filepath = os.path.join(output_dir, filename)
90 | async with aiofiles.open(filepath, 'w') as f:
91 | await f.write(result.markdown)
92 |
93 | # Return internal links
94 | return result.links.get("internal", [])
95 | except Exception as e:
96 | print(f"Error processing {url}: {str(e)}")
97 | return []
98 |
99 | async def crawl_website(job_id: str, url: str, limit: int):
100 | """Recursively crawl website and update job status"""
101 | try:
102 | # Create output directory
103 | output_dir = f"output/output_{job_id}"
104 | os.makedirs(output_dir, exist_ok=True)
105 |
106 | # Store the base URL for this job
107 | crawl_jobs[job_id]["base_url"] = url
108 |
109 | # Initialize crawler
110 | async with AsyncWebCrawler(verbose=True) as crawler:
111 | processed_urls = set()
112 | urls_to_process = {url}
113 |
114 | while urls_to_process and len(processed_urls) < limit:
115 | current_url = urls_to_process.pop()
116 |
117 | if current_url in processed_urls:
118 | continue
119 |
120 | # Update job status
121 | crawl_jobs[job_id].update({
122 | "status": "processing",
123 | "progress": len(processed_urls),
124 | "current_url": current_url
125 | })
126 |
127 | # Process URL and get internal links
128 | internal_links = await process_url(current_url, output_dir, crawler, job_id)
129 | processed_urls.add(current_url)
130 |
131 | # Add new internal links that contain the base URL
132 | for link in internal_links:
133 | if isinstance(link, dict):
134 | link_url = link.get("href", "")
135 | else:
136 | link_url = link
137 |
138 | if link_url and link_url.startswith(url) and link_url not in processed_urls:
139 | urls_to_process.add(link_url)
140 |
141 | # Create zip file
142 | shutil.make_archive(output_dir, 'zip', output_dir)
143 |
144 | # Update final status
145 | crawl_jobs[job_id].update({
146 | "status": "completed",
147 | "progress": len(processed_urls),
148 | "total_pages": len(processed_urls)
149 | })
150 |
151 | # Cleanup output directory
152 | shutil.rmtree(output_dir)
153 |
154 | except Exception as e:
155 | crawl_jobs[job_id]["status"] = "failed"
156 | print(f"Crawl failed: {str(e)}")
157 |
158 | @app.post("/api/crawl", response_model=CrawlResponse)
159 | async def start_crawl(request: CrawlRequest):
160 | job_id = str(uuid.uuid4())
161 | crawl_jobs[job_id] = {
162 | "status": "starting",
163 | "progress": 0,
164 | "total_pages": 0,
165 | "base_url": request.url # Store the base URL
166 | }
167 |
168 | # Start crawl in background
169 | asyncio.create_task(crawl_website(job_id, request.url, request.limit))
170 |
171 | return CrawlResponse(
172 | job_id=job_id,
173 | status="starting",
174 | progress=0
175 | )
176 |
177 | @app.get("/api/status/{job_id}", response_model=CrawlResponse)
178 | async def get_status(job_id: str):
179 | if job_id not in crawl_jobs:
180 | raise HTTPException(status_code=404, detail="Job not found")
181 |
182 | job = crawl_jobs[job_id]
183 | return CrawlResponse(
184 | job_id=job_id,
185 | status=job["status"],
186 | progress=job["progress"],
187 | total_pages=job["total_pages"],
188 | current_url=job.get("current_url")
189 | )
190 |
191 | @app.get("/api/download/{job_id}")
192 | async def download_results(job_id: str):
193 | if job_id not in crawl_jobs:
194 | raise HTTPException(status_code=404, detail="Job not found")
195 |
196 | job = crawl_jobs[job_id]
197 | if job["status"] != "completed":
198 | raise HTTPException(status_code=400, detail="Job not completed")
199 |
200 | zip_path = f"output/output_{job_id}.zip"
201 | if not os.path.exists(zip_path):
202 | raise HTTPException(status_code=404, detail="Results not found")
203 |
204 | return FileResponse(
205 | zip_path,
206 | media_type="application/zip",
207 | filename="crawl_results.zip"
208 | )
209 |
210 | # Serve index.html
211 | @app.get("/")
212 | async def read_root():
213 | return FileResponse("static/index.html")
214 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "crawl4ai-frontend"
3 | version = "0.1.0"
4 | description = "A frontend for crawl4ai"
5 | authors = ["f4ww4z"]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.10"
10 | Crawl4AI = "^0.3.746"
11 | fastapi = {extras = ["standard"], version = "^0.115.5"}
12 | aiofiles = "^24.1.0"
13 |
14 | [tool.poetry.dev-dependencies]
15 | autopep8 = "^2.3.1"
16 | djlint = "^1.36.3"
17 |
18 | [tool.djlint]
19 | convert_errors_to_warnings = true
20 | indent = 2
21 |
22 | [tool.djlint.js]
23 | warn_on_js_errors = true
24 |
25 | [build-system]
26 | requires = ["poetry-core>=1.0.0"]
27 | build-backend = "poetry.core.masonry.api"
28 |
--------------------------------------------------------------------------------