├── .env.txt ├── .gitattributes ├── .github ├── DISCUSSION_TEMPLATE │ └── feature-requests.yml ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ └── config.yml ├── pull_request_template.md └── workflows │ └── main.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTORS.md ├── Dockerfile ├── JOURNAL.md ├── LICENSE ├── MANIFEST.in ├── MISSION.md ├── README.md ├── ROADMAP.md ├── cliff.toml ├── crawl4ai ├── __init__.py ├── __version__.py ├── async_configs.py ├── async_crawler_strategy.py ├── async_database.py ├── async_dispatcher.py ├── async_logger.py ├── async_webcrawler.py ├── browser_manager.py ├── browser_profiler.py ├── cache_context.py ├── chunking_strategy.py ├── cli.py ├── components │ └── crawler_monitor.py ├── config.py ├── content_filter_strategy.py ├── content_scraping_strategy.py ├── crawlers │ ├── __init__.py │ ├── amazon_product │ │ ├── __init__.py │ │ └── crawler.py │ └── google_search │ │ ├── __init__.py │ │ ├── crawler.py │ │ └── script.js ├── deep_crawling │ ├── __init__.py │ ├── base_strategy.py │ ├── bff_strategy.py │ ├── bfs_strategy.py │ ├── crazy.py │ ├── dfs_strategy.py │ ├── filters.py │ └── scorers.py ├── docker_client.py ├── extraction_strategy.py ├── html2text │ ├── __init__.py │ ├── __main__.py │ ├── _typing.py │ ├── cli.py │ ├── config.py │ ├── elements.py │ └── utils.py ├── hub.py ├── install.py ├── js_snippet │ ├── __init__.py │ ├── navigator_overrider.js │ ├── remove_overlay_elements.js │ └── update_image_dimensions.js ├── legacy │ ├── __init__.py │ ├── cli.py │ ├── crawler_strategy.py │ ├── database.py │ ├── docs_manager.py │ ├── llmtxt.py │ ├── version_manager.py │ └── web_crawler.py ├── markdown_generation_strategy.py ├── migrations.py ├── model_loader.py ├── models.py ├── processors │ └── pdf │ │ ├── __init__.py │ │ ├── processor.py │ │ └── utils.py ├── prompts.py ├── proxy_strategy.py ├── ssl_certificate.py ├── types.py ├── user_agent_generator.py └── utils.py ├── deploy └── docker │ ├── .dockerignore │ ├── .llm.env.example │ ├── README.md │ ├── api.py │ ├── auth.py │ ├── c4ai-code-context.md │ ├── c4ai-doc-context.md │ ├── config.yml │ ├── crawler_pool.py │ ├── job.py │ ├── mcp_bridge.py │ ├── requirements.txt │ ├── schemas.py │ ├── server.py │ ├── static │ └── playground │ │ └── index.html │ ├── supervisord.conf │ └── utils.py ├── docker-compose.yml ├── docs ├── apps │ └── linkdin │ │ ├── README.md │ │ ├── c4ai_discover.py │ │ ├── c4ai_insights.py │ │ ├── schemas │ │ ├── company_card.json │ │ └── people_card.json │ │ ├── snippets │ │ ├── company.html │ │ └── people.html │ │ └── templates │ │ ├── ai.js │ │ └── graph_view_template.html ├── assets │ ├── pitch-dark.png │ ├── pitch-dark.svg │ ├── powered-by-dark.svg │ ├── powered-by-disco.svg │ ├── powered-by-light.svg │ └── powered-by-night.svg ├── codebase │ ├── browser.md │ └── cli.md ├── deprecated │ └── docker-deployment.md ├── examples │ ├── README_BUILTIN_BROWSER.md │ ├── amazon_product_extraction_direct_url.py │ ├── amazon_product_extraction_using_hooks.py │ ├── amazon_product_extraction_using_use_javascript.py │ ├── arun_vs_arun_many.py │ ├── assets │ │ ├── audio.mp3 │ │ ├── basic.png │ │ ├── cosine_extraction.png │ │ ├── css_js.png │ │ ├── css_selector.png │ │ ├── exec_script.png │ │ ├── llm_extraction.png │ │ ├── semantic_extraction_cosine.png │ │ └── semantic_extraction_llm.png │ ├── async_webcrawler_multiple_urls_example.py │ ├── browser_optimization_example.py │ ├── builtin_browser_example.py │ ├── chainlit.md │ ├── cli │ │ ├── browser.yml │ │ ├── crawler.yml │ │ ├── 
css_schema.json │ │ ├── extract.yml │ │ ├── extract_css.yml │ │ └── llm_schema.json │ ├── crawlai_vs_firecrawl.py │ ├── crawler_monitor_example.py │ ├── crypto_analysis_example.py │ ├── deepcrawl_example.py │ ├── dispatcher_example.py │ ├── docker │ │ ├── demo_docker_api.py │ │ └── demo_docker_polling.py │ ├── docker_config_obj.py │ ├── docker_example.py │ ├── docker_python_rest_api.py │ ├── docker_python_sdk.py │ ├── extraction_strategies_examples.py │ ├── full_page_screenshot_and_pdf_export.md │ ├── hello_world.py │ ├── hooks_example.py │ ├── identity_based_browsing.py │ ├── language_support_example.py │ ├── llm_extraction_openai_pricing.py │ ├── llm_markdown_generator.py │ ├── markdown │ │ ├── content_source_example.py │ │ └── content_source_short_example.py │ ├── network_console_capture_example.py │ ├── proxy_rotation_demo.py │ ├── quickstart.ipynb │ ├── quickstart.py │ ├── quickstart_examples_set_1.py │ ├── quickstart_examples_set_2.py │ ├── regex_extraction_quickstart.py │ ├── research_assistant.py │ ├── rest_call.py │ ├── sample_ecommerce.html │ ├── scraping_strategies_performance.py │ ├── serp_api_project_11_feb.py │ ├── session_id_example.py │ ├── ssl_example.py │ ├── storage_state_tutorial.md │ ├── summarize_page.py │ ├── tutorial_dynamic_clicks.md │ ├── tutorial_v0.5.py │ └── use_geo_location.py ├── md_v2 │ ├── advanced │ │ ├── advanced-features.md │ │ ├── crawl-dispatcher.md │ │ ├── file-downloading.md │ │ ├── hooks-auth.md │ │ ├── identity-based-crawling.md │ │ ├── lazy-loading.md │ │ ├── multi-url-crawling.md │ │ ├── network-console-capture.md │ │ ├── proxy-security.md │ │ ├── session-management.md │ │ └── ssl-certificate.md │ ├── api │ │ ├── arun.md │ │ ├── arun_many.md │ │ ├── async-webcrawler.md │ │ ├── crawl-result.md │ │ ├── parameters.md │ │ └── strategies.md │ ├── ask_ai │ │ ├── ask-ai.css │ │ ├── ask-ai.js │ │ └── index.html │ ├── assets │ │ ├── DankMono-Bold.woff2 │ │ ├── DankMono-Italic.woff2 │ │ ├── DankMono-Regular.woff2 │ │ ├── Monaco.woff │ │ ├── copy_code.js │ │ ├── dmvendor.css │ │ ├── docs.zip │ │ ├── floating_ask_ai_button.js │ │ ├── github_stats.js │ │ ├── highlight.css │ │ ├── highlight.min.js │ │ ├── highlight_init.js │ │ ├── images │ │ │ └── dispatcher.png │ │ ├── layout.css │ │ ├── mobile_menu.js │ │ ├── selection_ask_ai.js │ │ ├── styles.css │ │ └── toc.js │ ├── basic │ │ └── installation.md │ ├── blog │ │ ├── articles │ │ │ └── dockerize_hooks.md │ │ ├── index.md │ │ └── releases │ │ │ ├── 0.4.0.md │ │ │ ├── 0.4.1.md │ │ │ ├── 0.4.2.md │ │ │ ├── 0.5.0.md │ │ │ ├── 0.6.0.md │ │ │ └── v0.4.3b1.md │ ├── core │ │ ├── ask-ai.md │ │ ├── browser-crawler-config.md │ │ ├── cache-modes.md │ │ ├── cli.md │ │ ├── content-selection.md │ │ ├── crawler-result.md │ │ ├── deep-crawling.md │ │ ├── docker-deployment.md │ │ ├── examples.md │ │ ├── fit-markdown.md │ │ ├── installation.md │ │ ├── link-media.md │ │ ├── local-files.md │ │ ├── markdown-generation.md │ │ ├── page-interaction.md │ │ ├── quickstart.md │ │ └── simple-crawling.md │ ├── extraction │ │ ├── chunking.md │ │ ├── clustring-strategies.md │ │ ├── llm-strategies.md │ │ └── no-llm-strategies.md │ └── index.md ├── releases_review │ ├── Crawl4AI_v0.3.72_Release_Announcement.ipynb │ ├── v0.3.74.overview.py │ ├── v0_4_24_walkthrough.py │ └── v0_4_3b2_features_demo.py ├── snippets │ └── deep_crawl │ │ ├── 1.intro.py │ │ └── 2.filters.py └── tutorials │ └── coming_soon.md ├── mkdocs.yml ├── prompts └── prompt_net_requests.md ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── 
__init__.py ├── async ├── sample_wikipedia.html ├── test_0.4.2_browser_manager.py ├── test_0.4.2_config_params.py ├── test_async_doanloader.py ├── test_basic_crawling.py ├── test_caching.py ├── test_chunking_and_extraction_strategies.py ├── test_content_extraction.py ├── test_content_filter_bm25.py ├── test_content_filter_prune.py ├── test_content_scraper_strategy.py ├── test_crawler_strategy.py ├── test_database_operations.py ├── test_dispatchers.py ├── test_edge_cases.py ├── test_error_handling.py ├── test_evaluation_scraping_methods_performance.configs.py ├── test_markdown_genertor.py ├── test_parameters_and_options.py ├── test_performance.py └── test_screenshot.py ├── browser ├── docker │ ├── __init__.py │ └── test_docker_browser.py ├── manager │ └── demo_browser_manager.py ├── test_browser_manager.py ├── test_builtin_browser.py ├── test_builtin_strategy.py ├── test_cdp_strategy.py ├── test_combined.py ├── test_launch_standalone.py ├── test_parallel_crawling.py ├── test_playwright_strategy.py └── test_profiles.py ├── cli └── test_cli.py ├── docker ├── test_config_object.py ├── test_docker.py ├── test_dockerclient.py ├── test_rest_api_deep_crawl.py ├── test_serialization.py ├── test_server.py ├── test_server_requests.py └── test_server_token.py ├── docker_example.py ├── general ├── generate_dummy_site.py ├── test_acyn_crawl_wuth_http_crawler_strategy.py ├── test_advanced_deep_crawl.py ├── test_async_crawler_strategy.py ├── test_async_markdown_generator.py ├── test_async_webcrawler.py ├── test_cache_context.py ├── test_content_source_parameter.py ├── test_crawlers.py ├── test_deep_crawl.py ├── test_deep_crawl_filters.py ├── test_deep_crawl_scorers.py ├── test_http_crawler_strategy.py ├── test_llm_filter.py ├── test_mhtml.py ├── test_network_console_capture.py ├── test_robot_parser.py ├── test_schema_builder.py ├── test_stream.py ├── test_stream_dispatch.py └── tets_robot.py ├── hub └── test_simple.py ├── loggers └── test_logger.py ├── mcp ├── test_mcp_socket.py └── test_mcp_sse.py ├── memory ├── README.md ├── benchmark_report.py ├── cap_test.py ├── requirements.txt ├── run_benchmark.py ├── test_crawler_monitor.py ├── test_dispatcher_stress.py ├── test_docker_config_gen.py ├── test_stress_api.py ├── test_stress_api_xs.py ├── test_stress_docker_api.py └── test_stress_sdk.py ├── profiler └── test_crteate_profile.py ├── test_cli_docs.py ├── test_docker.py ├── test_llmtxt.py ├── test_main.py ├── test_scraping_strategy.py └── test_web_crawler.py /.env.txt: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY = "YOUR_GROQ_API" 2 | OPENAI_API_KEY = "YOUR_OPENAI_API" 3 | ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API" 4 | # You can add more API keys here -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Documentation 2 | *.html linguist-documentation 3 | docs/* linguist-documentation 4 | docs/examples/* linguist-documentation 5 | docs/md_v2/* linguist-documentation 6 | 7 | # Explicitly mark Python as the main language 8 | *.py linguist-detectable=true 9 | *.py linguist-language=Python 10 | 11 | # Exclude HTML from language statistics 12 | *.html linguist-detectable=false 13 | -------------------------------------------------------------------------------- /.github/DISCUSSION_TEMPLATE/feature-requests.yml: -------------------------------------------------------------------------------- 1 | title: "[Feature Request]: " 2 | 
labels: ["⚙️ New"] 3 | body: 4 | - type: markdown 5 | attributes: 6 | value: | 7 | Thank you for your interest in suggesting a new feature! Before you submit, please take a moment to check if already exists in 8 | this discussions category to avoid duplicates. 😊 9 | 10 | - type: textarea 11 | id: needs_to_be_done 12 | attributes: 13 | label: What needs to be done? 14 | description: Please describe the feature or functionality you'd like to see. 15 | placeholder: "e.g., Return alt text along with images scraped from a webpages in Result" 16 | validations: 17 | required: true 18 | 19 | - type: textarea 20 | id: problem_to_solve 21 | attributes: 22 | label: What problem does this solve? 23 | description: Explain the pain point or issue this feature will help address. 24 | placeholder: "e.g., Bypass Captchas added by cloudflare" 25 | validations: 26 | required: true 27 | 28 | - type: textarea 29 | id: target_users 30 | attributes: 31 | label: Target users/beneficiaries 32 | description: Who would benefit from this feature? (e.g., specific teams, developers, users, etc.) 33 | placeholder: "e.g., Marketing teams, developers" 34 | validations: 35 | required: false 36 | 37 | - type: textarea 38 | id: current_workarounds 39 | attributes: 40 | label: Current alternatives/workarounds 41 | description: Are there any existing solutions or workarounds? How does this feature improve upon them? 42 | placeholder: "e.g., Users manually select the css classes mapped to data fields to extract them" 43 | validations: 44 | required: false 45 | 46 | - type: markdown 47 | attributes: 48 | value: | 49 | ### 💡 Implementation Ideas 50 | 51 | - type: textarea 52 | id: proposed_approach 53 | attributes: 54 | label: Proposed approach 55 | description: Share any ideas you have for how this feature could be implemented. Point out any challenges your foresee 56 | and the success metrics for this feature 57 | placeholder: "e.g., Implement a breadth first traversal algorithm for scraper" 58 | validations: 59 | required: false 60 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Feature Requests 4 | url: https://github.com/unclecode/crawl4ai/discussions/categories/feature-requests 5 | about: "Suggest new features or enhancements for Crawl4AI" 6 | - name: Forums - Q&A 7 | url: https://github.com/unclecode/crawl4ai/discussions/categories/forums-q-a 8 | about: "Ask questions or engage in general discussions about Crawl4AI" 9 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | Please include a summary of the change and/or which issues are fixed. 3 | 4 | eg: `Fixes #123` (Tag GitHub issue numbers in this format, so it automatically links the issues with your PR) 5 | 6 | ## List of files changed and why 7 | eg: quickstart.py - To update the example as per new changes 8 | 9 | ## How Has This Been Tested? 10 | Please describe the tests that you ran to verify your changes. 
11 | 12 | ## Checklist: 13 | 14 | - [ ] My code follows the style guidelines of this project 15 | - [ ] I have performed a self-review of my own code 16 | - [ ] I have commented my code, particularly in hard-to-understand areas 17 | - [ ] I have made corresponding changes to the documentation 18 | - [ ] I have added/updated unit tests that prove my fix is effective or that my feature works 19 | - [ ] New and existing unit tests pass locally with my changes 20 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Discord GitHub Notifications 2 | 3 | on: 4 | issues: 5 | types: [opened] 6 | issue_comment: 7 | types: [created] 8 | pull_request: 9 | types: [opened] 10 | discussion: 11 | types: [created] 12 | 13 | jobs: 14 | notify-discord: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Set webhook based on event type 18 | id: set-webhook 19 | run: | 20 | if [ "${{ github.event_name }}" == "discussion" ]; then 21 | echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT 22 | else 23 | echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT 24 | fi 25 | 26 | - name: Discord Notification 27 | uses: Ilshidur/action-discord@master 28 | env: 29 | DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }} 30 | with: 31 | args: | 32 | ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) || 33 | github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) || 34 | github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) || 35 | format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }} 36 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors to Crawl4AI 2 | 3 | We would like to thank the following people for their contributions to Crawl4AI: 4 | 5 | ## Core Team 6 | 7 | - [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer 8 | - [Nasrin](https://github.com/ntohidi) - Project Manager and Developer 9 | - [Aravind Karnam](https://github.com/aravindkarnam) - Head of Community and Product 10 | 11 | ## Community Contributors 12 | 13 | - [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
14 | - [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors 15 | - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies 16 | - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for 17 | - [datehoer](https://github.com/datehoer) - Add browser prxy support 18 | 19 | ## Pull Requests 20 | 21 | - [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304) 22 | - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) 23 | - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) 24 | - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) 25 | - [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298) 26 | 27 | #### Feb-Alpha-1 28 | - [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651) 29 | - [tautikAg](https://github.com/tautikAg) - fix: [Markdown output has incorect spacing](https://github.com/unclecode/crawl4ai/issues/599) 30 | - [cardit1](https://github.com/cardit1) - fix: ['AsyncPlaywrightCrawlerStrategy' object has no attribute 'downloads_path'](https://github.com/unclecode/crawl4ai/issues/585) 31 | - [dmurat](https://github.com/dmurat) - fix: [ Incorrect rendering of inline code inside of links ](https://github.com/unclecode/crawl4ai/issues/583) 32 | - [Sparshsing](https://github.com/Sparshsing) - fix: [Relative Urls in the webpage not extracted properly ](https://github.com/unclecode/crawl4ai/issues/570) 33 | 34 | 35 | 36 | ## Other Contributors 37 | 38 | - [Gokhan](https://github.com/gkhngyk) 39 | - [Shiv Kumar](https://github.com/shivkumar0757) 40 | - [QIN2DIM](https://github.com/QIN2DIM) 41 | 42 | #### Typo fixes 43 | - [ssoydan](https://github.com/ssoydan) 44 | - [Darshan](https://github.com/Darshan2104) 45 | - [tuhinmallick](https://github.com/tuhinmallick) 46 | 47 | ## Acknowledgements 48 | 49 | We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better. 50 | 51 | --- 52 | 53 | If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly. 54 | 55 | Thank you all for your contributions! 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | recursive-include crawl4ai/js_snippet *.js -------------------------------------------------------------------------------- /cliff.toml: -------------------------------------------------------------------------------- 1 | [changelog] 2 | # Template format 3 | header = """ 4 | # Changelog\n 5 | All notable changes to this project will be documented in this file.\n 6 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 7 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n 8 | """ 9 | 10 | # Organize commits by type 11 | [git] 12 | conventional_commits = true 13 | filter_unconventional = true 14 | commit_parsers = [ 15 | { message = "^feat", group = "Added"}, 16 | { message = "^fix", group = "Fixed"}, 17 | { message = "^doc", group = "Documentation"}, 18 | { message = "^perf", group = "Performance"}, 19 | { message = "^refactor", group = "Changed"}, 20 | { message = "^style", group = "Changed"}, 21 | { message = "^test", group = "Testing"}, 22 | { message = "^chore\\(release\\): prepare for", skip = true}, 23 | { message = "^chore", group = "Miscellaneous Tasks"}, 24 | ] -------------------------------------------------------------------------------- /crawl4ai/__version__.py: -------------------------------------------------------------------------------- 1 | # crawl4ai/_version.py 2 | __version__ = "0.6.3" 3 | 4 | -------------------------------------------------------------------------------- /crawl4ai/crawlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/crawl4ai/crawlers/__init__.py -------------------------------------------------------------------------------- /crawl4ai/crawlers/amazon_product/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/crawl4ai/crawlers/amazon_product/__init__.py -------------------------------------------------------------------------------- /crawl4ai/crawlers/amazon_product/crawler.py: -------------------------------------------------------------------------------- 1 | from crawl4ai.hub import BaseCrawler 2 | import json 3 | __meta__ = { 4 | "version": "1.2.0", 5 | "tested_on": ["amazon.com"], 6 | "rate_limit": "50 RPM", 7 | "schema": {"product": ["name", "price"]} 8 | } 9 | 10 | class AmazonProductCrawler(BaseCrawler): 11 | async def run(self, url: str, **kwargs) -> str: 12 | try: 13 | self.logger.info(f"Crawling {url}") 14 | return '{"product": {"name": "Test Amazon Product"}}' 15 | except Exception as e: 16 | self.logger.error(f"Crawl failed: {str(e)}") 17 | return json.dumps({ 18 | "error": str(e), 19 | "metadata": self.meta # Include meta in error response 20 | }) -------------------------------------------------------------------------------- /crawl4ai/crawlers/google_search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/crawl4ai/crawlers/google_search/__init__.py --------------------------------------------------------------------------------
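The `amazon_product` crawler above is picked up by the `CrawlerHub` defined in `crawl4ai/hub.py` (shown later in this listing), which keys crawlers by their folder name and attaches each module's `__meta__` dict as `.meta`. The following is a minimal usage sketch, not a file in the repository; the product URL is a placeholder and the hub API is assumed to be exactly as listed below.

```python
# Hypothetical usage sketch (not part of the repository tree above).
# CrawlerHub discovers crawler packages under crawl4ai/crawlers/, so the
# folder name "amazon_product" doubles as the registry key.
import asyncio
from crawl4ai.hub import CrawlerHub

async def main() -> None:
    crawler_cls = CrawlerHub.get("amazon_product")
    if crawler_cls is None:
        raise RuntimeError("amazon_product crawler was not discovered")
    # __meta__ from the crawler module is exposed on the class as .meta
    print("crawler version:", crawler_cls.meta.get("version"))
    crawler = crawler_cls()
    # Placeholder URL; the stub crawler above returns canned JSON regardless.
    result_json = await crawler.run(url="https://www.amazon.com/dp/EXAMPLE")
    print(result_json)

asyncio.run(main())
```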
/crawl4ai/crawlers/google_search/script.js: -------------------------------------------------------------------------------- 1 | (() => { 2 | // Function to extract image data from Google Images page 3 | function extractImageData() { 4 | const keys = Object.keys(window.W_jd); 5 | let allImageData = []; 6 | let currentPosition = 0; 7 | 8 | // Get the symbol we'll use (from first valid entry) 9 | let targetSymbol; 10 | for (let key of keys) { 11 | try { 12 | const symbols = Object.getOwnPropertySymbols(window.W_jd[key]); 13 | if (symbols.length > 0) { 14 | targetSymbol = symbols[0]; 15 | break; 16 | } 17 | } catch (e) { 18 | continue; 19 | } 20 | } 21 | 22 | if (!targetSymbol) return []; 23 | 24 | // Iterate through ALL keys 25 | for (let key of keys) { 26 | try { 27 | const o1 = window.W_jd[key][targetSymbol] 28 | if (!o1) continue; 29 | const data = Object.values(o1)[0] 30 | // const data = window.W_jd[key][targetSymbol]?.Ws; 31 | // Check if this is a valid image data entry 32 | if (data && Array.isArray(data[1])) { 33 | const processedData = processImageEntry(data, currentPosition); 34 | if (processedData) { 35 | allImageData.push(processedData); 36 | currentPosition++; 37 | } 38 | } 39 | } catch (e) { 40 | continue; 41 | } 42 | } 43 | 44 | return allImageData; 45 | } 46 | 47 | function processImageEntry(entry, position) { 48 | const imageData = entry[1]; 49 | if (!Array.isArray(imageData)) return null; 50 | 51 | // Extract the image ID 52 | const imageId = imageData[1]; 53 | if (!imageId) return null; 54 | 55 | // Find the corresponding DOM element 56 | const domElement = document.querySelector(`[data-docid="${imageId}"]`); 57 | if (!domElement) return null; 58 | 59 | // Extract data from the array structure 60 | const [ 61 | _, 62 | id, 63 | thumbnailInfo, 64 | imageInfo, 65 | __, 66 | ___, 67 | rgb, 68 | ____, 69 | _____, 70 | metadata 71 | ] = imageData; 72 | 73 | // Ensure we have the required data 74 | if (!thumbnailInfo || !imageInfo) return null; 75 | 76 | // Extract metadata from DOM 77 | const title = domElement?.querySelector('.toI8Rb')?.textContent?.trim(); 78 | const source = domElement?.querySelector('.guK3rf')?.textContent?.trim(); 79 | const link = domElement?.querySelector('a.EZAeBe')?.href; 80 | 81 | if (!link) return null; 82 | 83 | // Build Google Image URL 84 | const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]); 85 | 86 | return { 87 | title, 88 | imageUrl: imageInfo[0], 89 | imageWidth: imageInfo[2], 90 | imageHeight: imageInfo[1], 91 | thumbnailUrl: thumbnailInfo[0], 92 | thumbnailWidth: thumbnailInfo[2], 93 | thumbnailHeight: thumbnailInfo[1], 94 | source, 95 | domain: metadata['2000']?.[1] || new URL(link).hostname, 96 | link, 97 | googleUrl, 98 | position: position + 1 99 | }; 100 | } 101 | 102 | function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) { 103 | const params = new URLSearchParams({ 104 | imgurl: imgUrl, 105 | tbnid: tbnid, 106 | imgrefurl: refUrl, 107 | docid: tbnid, 108 | w: width.toString(), 109 | h: height.toString(), 110 | }); 111 | 112 | return `https://www.google.com/imgres?${params.toString()}`; 113 | } 114 | return extractImageData(); 115 | })(); -------------------------------------------------------------------------------- /crawl4ai/deep_crawling/__init__.py: -------------------------------------------------------------------------------- 1 | # deep_crawling/__init__.py 2 | from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy 3 | from .bfs_strategy import 
BFSDeepCrawlStrategy 4 | from .bff_strategy import BestFirstCrawlingStrategy 5 | from .dfs_strategy import DFSDeepCrawlStrategy 6 | from .filters import ( 7 | FilterChain, 8 | ContentTypeFilter, 9 | DomainFilter, 10 | URLFilter, 11 | URLPatternFilter, 12 | FilterStats, 13 | ContentRelevanceFilter, 14 | SEOFilter 15 | ) 16 | from .scorers import ( 17 | KeywordRelevanceScorer, 18 | URLScorer, 19 | CompositeScorer, 20 | DomainAuthorityScorer, 21 | FreshnessScorer, 22 | PathDepthScorer, 23 | ContentTypeScorer 24 | ) 25 | 26 | __all__ = [ 27 | "DeepCrawlDecorator", 28 | "DeepCrawlStrategy", 29 | "BFSDeepCrawlStrategy", 30 | "BestFirstCrawlingStrategy", 31 | "DFSDeepCrawlStrategy", 32 | "FilterChain", 33 | "ContentTypeFilter", 34 | "DomainFilter", 35 | "URLFilter", 36 | "URLPatternFilter", 37 | "FilterStats", 38 | "ContentRelevanceFilter", 39 | "SEOFilter", 40 | "KeywordRelevanceScorer", 41 | "URLScorer", 42 | "CompositeScorer", 43 | "DomainAuthorityScorer", 44 | "FreshnessScorer", 45 | "PathDepthScorer", 46 | "ContentTypeScorer", 47 | ] 48 | -------------------------------------------------------------------------------- /crawl4ai/html2text/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /crawl4ai/html2text/_typing.py: -------------------------------------------------------------------------------- 1 | class OutCallback: 2 | def __call__(self, s: str) -> None: 3 | ... 4 | -------------------------------------------------------------------------------- /crawl4ai/html2text/elements.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | 4 | class AnchorElement: 5 | __slots__ = ["attrs", "count", "outcount"] 6 | 7 | def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int): 8 | self.attrs = attrs 9 | self.count = count 10 | self.outcount = outcount 11 | 12 | 13 | class ListElement: 14 | __slots__ = ["name", "num"] 15 | 16 | def __init__(self, name: str, num: int): 17 | self.name = name 18 | self.num = num 19 | -------------------------------------------------------------------------------- /crawl4ai/hub.py: -------------------------------------------------------------------------------- 1 | # crawl4ai/hub.py 2 | from abc import ABC, abstractmethod 3 | from typing import Dict, Type, Union 4 | import logging 5 | import importlib 6 | from pathlib import Path 7 | import inspect 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class BaseCrawler(ABC): 13 | def __init__(self): 14 | self.logger = logging.getLogger(self.__class__.__name__) 15 | 16 | @abstractmethod 17 | async def run(self, url: str = "", **kwargs) -> str: 18 | """ 19 | Implement this method to return JSON string. 20 | Must accept URL + arbitrary kwargs for flexibility. 
21 | """ 22 | pass 23 | 24 | def __init_subclass__(cls, **kwargs): 25 | """Enforce interface validation on subclassing""" 26 | super().__init_subclass__(**kwargs) 27 | 28 | # Verify run method signature 29 | run_method = cls.run 30 | if not run_method.__code__.co_argcount >= 2: # self + url 31 | raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'") 32 | 33 | # Verify async nature 34 | if not inspect.iscoroutinefunction(run_method): 35 | raise TypeError(f"{cls.__name__}.run must be async") 36 | 37 | class CrawlerHub: 38 | _crawlers: Dict[str, Type[BaseCrawler]] = {} 39 | 40 | @classmethod 41 | def _discover_crawlers(cls): 42 | """Dynamically load crawlers from /crawlers in 3 lines""" 43 | base_path = Path(__file__).parent / "crawlers" 44 | for crawler_dir in base_path.iterdir(): 45 | if crawler_dir.is_dir(): 46 | try: 47 | module = importlib.import_module( 48 | f"crawl4ai.crawlers.{crawler_dir.name}.crawler" 49 | ) 50 | for attr in dir(module): 51 | cls._maybe_register_crawler( 52 | getattr(module, attr), crawler_dir.name 53 | ) 54 | except Exception as e: 55 | logger.warning(f"Failed {crawler_dir.name}: {str(e)}") 56 | 57 | @classmethod 58 | def _maybe_register_crawler(cls, obj, name: str): 59 | """Brilliant one-liner registration""" 60 | if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler: 61 | module = importlib.import_module(obj.__module__) 62 | obj.meta = getattr(module, "__meta__", {}) 63 | cls._crawlers[name] = obj 64 | 65 | @classmethod 66 | def get(cls, name: str) -> Union[Type[BaseCrawler], None]: 67 | if not cls._crawlers: 68 | cls._discover_crawlers() 69 | return cls._crawlers.get(name) -------------------------------------------------------------------------------- /crawl4ai/js_snippet/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # Create a function get name of a js script, then load from the CURRENT folder of this script and return its content as string, make sure its error free 5 | def load_js_script(script_name): 6 | # Get the path of the current script 7 | current_script_path = os.path.dirname(os.path.realpath(__file__)) 8 | # Get the path of the script to load 9 | script_path = os.path.join(current_script_path, script_name + ".js") 10 | # Check if the script exists 11 | if not os.path.exists(script_path): 12 | raise ValueError( 13 | f"Script {script_name} not found in the folder {current_script_path}" 14 | ) 15 | # Load the content of the script 16 | with open(script_path, "r") as f: 17 | script_content = f.read() 18 | return script_content 19 | -------------------------------------------------------------------------------- /crawl4ai/js_snippet/navigator_overrider.js: -------------------------------------------------------------------------------- 1 | // Pass the Permissions Test. 2 | const originalQuery = window.navigator.permissions.query; 3 | window.navigator.permissions.query = (parameters) => 4 | parameters.name === "notifications" 5 | ? 
Promise.resolve({ state: Notification.permission }) 6 | : originalQuery(parameters); 7 | Object.defineProperty(navigator, "webdriver", { 8 | get: () => undefined, 9 | }); 10 | window.navigator.chrome = { 11 | runtime: {}, 12 | // Add other properties if necessary 13 | }; 14 | Object.defineProperty(navigator, "plugins", { 15 | get: () => [1, 2, 3, 4, 5], 16 | }); 17 | Object.defineProperty(navigator, "languages", { 18 | get: () => ["en-US", "en"], 19 | }); 20 | Object.defineProperty(document, "hidden", { 21 | get: () => false, 22 | }); 23 | Object.defineProperty(document, "visibilityState", { 24 | get: () => "visible", 25 | }); 26 | -------------------------------------------------------------------------------- /crawl4ai/js_snippet/update_image_dimensions.js: -------------------------------------------------------------------------------- 1 | () => { 2 | return new Promise((resolve) => { 3 | const filterImage = (img) => { 4 | // Filter out images that are too small 5 | if (img.width < 100 && img.height < 100) return false; 6 | 7 | // Filter out images that are not visible 8 | const rect = img.getBoundingClientRect(); 9 | if (rect.width === 0 || rect.height === 0) return false; 10 | 11 | // Filter out images with certain class names (e.g., icons, thumbnails) 12 | if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false; 13 | 14 | // Filter out images with certain patterns in their src (e.g., placeholder images) 15 | if (img.src.includes("placeholder") || img.src.includes("icon")) return false; 16 | 17 | return true; 18 | }; 19 | 20 | const images = Array.from(document.querySelectorAll("img")).filter(filterImage); 21 | let imagesLeft = images.length; 22 | 23 | if (imagesLeft === 0) { 24 | resolve(); 25 | return; 26 | } 27 | 28 | const checkImage = (img) => { 29 | if (img.complete && img.naturalWidth !== 0) { 30 | img.setAttribute("width", img.naturalWidth); 31 | img.setAttribute("height", img.naturalHeight); 32 | imagesLeft--; 33 | if (imagesLeft === 0) resolve(); 34 | } 35 | }; 36 | 37 | images.forEach((img) => { 38 | checkImage(img); 39 | if (!img.complete) { 40 | img.onload = () => { 41 | checkImage(img); 42 | }; 43 | img.onerror = () => { 44 | imagesLeft--; 45 | if (imagesLeft === 0) resolve(); 46 | }; 47 | } 48 | }); 49 | 50 | // Fallback timeout of 5 seconds 51 | // setTimeout(() => resolve(), 5000); 52 | resolve(); 53 | }); 54 | }; 55 | -------------------------------------------------------------------------------- /crawl4ai/legacy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/crawl4ai/legacy/__init__.py -------------------------------------------------------------------------------- /crawl4ai/legacy/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import sys 3 | import asyncio 4 | from typing import List 5 | from .docs_manager import DocsManager 6 | from .async_logger import AsyncLogger 7 | 8 | logger = AsyncLogger(verbose=True) 9 | docs_manager = DocsManager(logger) 10 | 11 | 12 | def print_table(headers: List[str], rows: List[List[str]], padding: int = 2): 13 | """Print formatted table with headers and rows""" 14 | widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] 15 | border = "+" + "+".join("-" * (w + 2 * padding) for w in widths) + "+" 16 | 17 | def format_row(row): 18 | return ( 19 | "|" 20 | + "|".join( 21 | f"{' ' * 
padding}{str(cell):<{w}}{' ' * padding}" 22 | for cell, w in zip(row, widths) 23 | ) 24 | + "|" 25 | ) 26 | 27 | click.echo(border) 28 | click.echo(format_row(headers)) 29 | click.echo(border) 30 | for row in rows: 31 | click.echo(format_row(row)) 32 | click.echo(border) 33 | 34 | 35 | @click.group() 36 | def cli(): 37 | """Crawl4AI Command Line Interface""" 38 | pass 39 | 40 | 41 | @cli.group() 42 | def docs(): 43 | """Documentation operations""" 44 | pass 45 | 46 | 47 | @docs.command() 48 | @click.argument("sections", nargs=-1) 49 | @click.option( 50 | "--mode", type=click.Choice(["extended", "condensed"]), default="extended" 51 | ) 52 | def combine(sections: tuple, mode: str): 53 | """Combine documentation sections""" 54 | try: 55 | asyncio.run(docs_manager.ensure_docs_exist()) 56 | click.echo(docs_manager.generate(sections, mode)) 57 | except Exception as e: 58 | logger.error(str(e), tag="ERROR") 59 | sys.exit(1) 60 | 61 | 62 | @docs.command() 63 | @click.argument("query") 64 | @click.option("--top-k", "-k", default=5) 65 | @click.option("--build-index", is_flag=True, help="Build index if missing") 66 | def search(query: str, top_k: int, build_index: bool): 67 | """Search documentation""" 68 | try: 69 | result = docs_manager.search(query, top_k) 70 | if result == "No search index available. Call build_search_index() first.": 71 | if build_index or click.confirm("No search index found. Build it now?"): 72 | asyncio.run(docs_manager.llm_text.generate_index_files()) 73 | result = docs_manager.search(query, top_k) 74 | click.echo(result) 75 | except Exception as e: 76 | click.echo(f"Error: {str(e)}", err=True) 77 | sys.exit(1) 78 | 79 | 80 | @docs.command() 81 | def update(): 82 | """Update docs from GitHub""" 83 | try: 84 | asyncio.run(docs_manager.fetch_docs()) 85 | click.echo("Documentation updated successfully") 86 | except Exception as e: 87 | click.echo(f"Error: {str(e)}", err=True) 88 | sys.exit(1) 89 | 90 | 91 | @docs.command() 92 | @click.option("--force-facts", is_flag=True, help="Force regenerate fact files") 93 | @click.option("--clear-cache", is_flag=True, help="Clear BM25 cache") 94 | def index(force_facts: bool, clear_cache: bool): 95 | """Build or rebuild search indexes""" 96 | try: 97 | asyncio.run(docs_manager.ensure_docs_exist()) 98 | asyncio.run( 99 | docs_manager.llm_text.generate_index_files( 100 | force_generate_facts=force_facts, clear_bm25_cache=clear_cache 101 | ) 102 | ) 103 | click.echo("Search indexes built successfully") 104 | except Exception as e: 105 | click.echo(f"Error: {str(e)}", err=True) 106 | sys.exit(1) 107 | 108 | 109 | # Add docs list command 110 | @docs.command() 111 | def list(): 112 | """List available documentation sections""" 113 | try: 114 | sections = docs_manager.list() 115 | print_table(["Sections"], [[section] for section in sections]) 116 | 117 | except Exception as e: 118 | click.echo(f"Error: {str(e)}", err=True) 119 | sys.exit(1) 120 | 121 | 122 | if __name__ == "__main__": 123 | cli() 124 | -------------------------------------------------------------------------------- /crawl4ai/legacy/docs_manager.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import shutil 3 | from pathlib import Path 4 | from crawl4ai.async_logger import AsyncLogger 5 | from crawl4ai.llmtxt import AsyncLLMTextManager 6 | 7 | 8 | class DocsManager: 9 | def __init__(self, logger=None): 10 | self.docs_dir = Path.home() / ".crawl4ai" / "docs" 11 | self.local_docs = Path(__file__).parent.parent / "docs" / 
"llm.txt" 12 | self.docs_dir.mkdir(parents=True, exist_ok=True) 13 | self.logger = logger or AsyncLogger(verbose=True) 14 | self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger) 15 | 16 | async def ensure_docs_exist(self): 17 | """Fetch docs if not present""" 18 | if not any(self.docs_dir.iterdir()): 19 | await self.fetch_docs() 20 | 21 | async def fetch_docs(self) -> bool: 22 | """Copy from local docs or download from GitHub""" 23 | try: 24 | # Try local first 25 | if self.local_docs.exists() and ( 26 | any(self.local_docs.glob("*.md")) 27 | or any(self.local_docs.glob("*.tokens")) 28 | ): 29 | # Empty the local docs directory 30 | for file_path in self.docs_dir.glob("*.md"): 31 | file_path.unlink() 32 | # for file_path in self.docs_dir.glob("*.tokens"): 33 | # file_path.unlink() 34 | for file_path in self.local_docs.glob("*.md"): 35 | shutil.copy2(file_path, self.docs_dir / file_path.name) 36 | # for file_path in self.local_docs.glob("*.tokens"): 37 | # shutil.copy2(file_path, self.docs_dir / file_path.name) 38 | return True 39 | 40 | # Fallback to GitHub 41 | response = requests.get( 42 | "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt", 43 | headers={"Accept": "application/vnd.github.v3+json"}, 44 | ) 45 | response.raise_for_status() 46 | 47 | for item in response.json(): 48 | if item["type"] == "file" and item["name"].endswith(".md"): 49 | content = requests.get(item["download_url"]).text 50 | with open(self.docs_dir / item["name"], "w", encoding="utf-8") as f: 51 | f.write(content) 52 | return True 53 | 54 | except Exception as e: 55 | self.logger.error(f"Failed to fetch docs: {str(e)}") 56 | raise 57 | 58 | def list(self) -> list[str]: 59 | """List available topics""" 60 | names = [file_path.stem for file_path in self.docs_dir.glob("*.md")] 61 | # Remove [0-9]+_ prefix 62 | names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names] 63 | # Exclude those end with .xs.md and .q.md 64 | names = [ 65 | name 66 | for name in names 67 | if not name.endswith(".xs") and not name.endswith(".q") 68 | ] 69 | return names 70 | 71 | def generate(self, sections, mode="extended"): 72 | return self.llm_text.generate(sections, mode) 73 | 74 | def search(self, query: str, top_k: int = 5): 75 | return self.llm_text.search(query, top_k) 76 | -------------------------------------------------------------------------------- /crawl4ai/legacy/version_manager.py: -------------------------------------------------------------------------------- 1 | # version_manager.py 2 | from pathlib import Path 3 | from packaging import version 4 | from . 
import __version__ 5 | 6 | 7 | class VersionManager: 8 | def __init__(self): 9 | self.home_dir = Path.home() / ".crawl4ai" 10 | self.version_file = self.home_dir / "version.txt" 11 | 12 | def get_installed_version(self): 13 | """Get the version recorded in home directory""" 14 | if not self.version_file.exists(): 15 | return None 16 | try: 17 | return version.parse(self.version_file.read_text().strip()) 18 | except: 19 | return None 20 | 21 | def update_version(self): 22 | """Update the version file to current library version""" 23 | self.version_file.write_text(__version__.__version__) 24 | 25 | def needs_update(self): 26 | """Check if database needs update based on version""" 27 | installed = self.get_installed_version() 28 | current = version.parse(__version__.__version__) 29 | return installed is None or installed < current 30 | -------------------------------------------------------------------------------- /deploy/docker/.dockerignore: -------------------------------------------------------------------------------- 1 | # .dockerignore 2 | * 3 | 4 | # Allow specific files and directories when using local installation 5 | !crawl4ai/ 6 | !docs/ 7 | !deploy/docker/ 8 | !setup.py 9 | !pyproject.toml 10 | !README.md 11 | !LICENSE 12 | !MANIFEST.in 13 | !setup.cfg 14 | !mkdocs.yml 15 | 16 | .git/ 17 | __pycache__/ 18 | *.pyc 19 | *.pyo 20 | *.pyd 21 | .DS_Store 22 | .env 23 | .venv 24 | venv/ 25 | tests/ 26 | coverage.xml 27 | *.log 28 | *.swp 29 | *.egg-info/ 30 | dist/ 31 | build/ -------------------------------------------------------------------------------- /deploy/docker/.llm.env.example: -------------------------------------------------------------------------------- 1 | # LLM Provider Keys 2 | OPENAI_API_KEY=your_openai_key_here 3 | DEEPSEEK_API_KEY=your_deepseek_key_here 4 | ANTHROPIC_API_KEY=your_anthropic_key_here 5 | GROQ_API_KEY=your_groq_key_here 6 | TOGETHER_API_KEY=your_together_key_here 7 | MISTRAL_API_KEY=your_mistral_key_here 8 | GEMINI_API_TOKEN=your_gemini_key_here -------------------------------------------------------------------------------- /deploy/docker/auth.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta, timezone 3 | from typing import Dict, Optional 4 | from jwt import JWT, jwk_from_dict 5 | from jwt.utils import get_int_from_datetime 6 | from fastapi import Depends, HTTPException 7 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials 8 | from pydantic import EmailStr 9 | from pydantic.main import BaseModel 10 | import base64 11 | 12 | instance = JWT() 13 | security = HTTPBearer(auto_error=False) 14 | SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret") 15 | ACCESS_TOKEN_EXPIRE_MINUTES = 60 16 | 17 | def get_jwk_from_secret(secret: str): 18 | """Convert a secret string into a JWK object.""" 19 | secret_bytes = secret.encode('utf-8') 20 | b64_secret = base64.urlsafe_b64encode(secret_bytes).rstrip(b'=').decode('utf-8') 21 | return jwk_from_dict({"kty": "oct", "k": b64_secret}) 22 | 23 | def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str: 24 | """Create a JWT access token with an expiration.""" 25 | to_encode = data.copy() 26 | expire = datetime.now(timezone.utc) + (expires_delta or timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)) 27 | to_encode.update({"exp": get_int_from_datetime(expire)}) 28 | signing_key = get_jwk_from_secret(SECRET_KEY) 29 | return instance.encode(to_encode, signing_key, alg='HS256') 30 | 31 | 
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict: 32 | """Verify the JWT token from the Authorization header.""" 33 | 34 | if credentials is None: 35 | return None 36 | token = credentials.credentials 37 | verifying_key = get_jwk_from_secret(SECRET_KEY) 38 | try: 39 | payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256') 40 | return payload 41 | except Exception: 42 | raise HTTPException(status_code=401, detail="Invalid or expired token") 43 | 44 | 45 | def get_token_dependency(config: Dict): 46 | """Return the token dependency if JWT is enabled, else a function that returns None.""" 47 | 48 | if config.get("security", {}).get("jwt_enabled", False): 49 | return verify_token 50 | else: 51 | return lambda: None 52 | 53 | 54 | class TokenRequest(BaseModel): 55 | email: EmailStr -------------------------------------------------------------------------------- /deploy/docker/config.yml: -------------------------------------------------------------------------------- 1 | # Application Configuration 2 | app: 3 | title: "Crawl4AI API" 4 | version: "1.0.0" 5 | host: "0.0.0.0" 6 | port: 11234 7 | reload: False 8 | workers: 1 9 | timeout_keep_alive: 300 10 | 11 | # Default LLM Configuration 12 | llm: 13 | provider: "openai/gpt-4o-mini" 14 | api_key_env: "OPENAI_API_KEY" 15 | # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored 16 | 17 | # Redis Configuration 18 | redis: 19 | host: "localhost" 20 | port: 6379 21 | db: 0 22 | password: "" 23 | ssl: False 24 | ssl_cert_reqs: None 25 | ssl_ca_certs: None 26 | ssl_certfile: None 27 | ssl_keyfile: None 28 | ssl_cert_reqs: None 29 | ssl_ca_certs: None 30 | ssl_certfile: None 31 | ssl_keyfile: None 32 | 33 | # Rate Limiting Configuration 34 | rate_limiting: 35 | enabled: True 36 | default_limit: "1000/minute" 37 | trusted_proxies: [] 38 | storage_uri: "memory://" # Use "redis://localhost:6379" for production 39 | 40 | # Security Configuration 41 | security: 42 | enabled: false 43 | jwt_enabled: false 44 | https_redirect: false 45 | trusted_hosts: ["*"] 46 | headers: 47 | x_content_type_options: "nosniff" 48 | x_frame_options: "DENY" 49 | content_security_policy: "default-src 'self'" 50 | strict_transport_security: "max-age=63072000; includeSubDomains" 51 | 52 | # Crawler Configuration 53 | crawler: 54 | base_config: 55 | simulate_user: true 56 | memory_threshold_percent: 95.0 57 | rate_limiter: 58 | enabled: true 59 | base_delay: [1.0, 2.0] 60 | timeouts: 61 | stream_init: 30.0 # Timeout for stream initialization 62 | batch_process: 300.0 # Timeout for batch processing 63 | pool: 64 | max_pages: 40 # ← GLOBAL_SEM permits 65 | idle_ttl_sec: 1800 # ← 30 min janitor cutoff 66 | browser: 67 | kwargs: 68 | headless: true 69 | text_mode: true 70 | extra_args: 71 | # - "--single-process" 72 | - "--no-sandbox" 73 | - "--disable-dev-shm-usage" 74 | - "--disable-gpu" 75 | - "--disable-software-rasterizer" 76 | - "--disable-web-security" 77 | - "--allow-insecure-localhost" 78 | - "--ignore-certificate-errors" 79 | 80 | # Logging Configuration 81 | logging: 82 | level: "INFO" 83 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 84 | 85 | # Observability Configuration 86 | observability: 87 | prometheus: 88 | enabled: True 89 | endpoint: "/metrics" 90 | health_check: 91 | endpoint: "/health" -------------------------------------------------------------------------------- /deploy/docker/crawler_pool.py: 
-------------------------------------------------------------------------------- 1 | # crawler_pool.py (new file) 2 | import asyncio, json, hashlib, time, psutil 3 | from contextlib import suppress 4 | from typing import Dict 5 | from crawl4ai import AsyncWebCrawler, BrowserConfig 6 | 7 | from utils import load_config 8 | 9 | CONFIG = load_config() 10 | 11 | POOL: Dict[str, AsyncWebCrawler] = {} 12 | LAST_USED: Dict[str, float] = {} 13 | LOCK = asyncio.Lock() 14 | 15 | MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this 16 | IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min 17 | 18 | def _sig(cfg: BrowserConfig) -> str: 19 | payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) 20 | return hashlib.sha1(payload.encode()).hexdigest() 21 | 22 | async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: 23 | try: 24 | sig = _sig(cfg) 25 | async with LOCK: 26 | if sig in POOL: 27 | LAST_USED[sig] = time.time() 28 | return POOL[sig] 29 | if psutil.virtual_memory().percent >= MEM_LIMIT: 30 | raise MemoryError("RAM pressure – new browser denied") 31 | crawler = AsyncWebCrawler(config=cfg, thread_safe=False) 32 | await crawler.start() 33 | POOL[sig] = crawler; LAST_USED[sig] = time.time() 34 | return crawler 35 | except MemoryError as e: 36 | raise MemoryError(f"RAM pressure – new browser denied: {e}") 37 | except Exception as e: 38 | raise RuntimeError(f"Failed to start browser: {e}") 39 | finally: 40 | if sig in POOL: 41 | LAST_USED[sig] = time.time() 42 | else: 43 | # If we failed to start the browser, we should remove it from the pool 44 | POOL.pop(sig, None) 45 | LAST_USED.pop(sig, None) 46 | 47 | async def close_all(): 48 | async with LOCK: 49 | await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True) 50 | POOL.clear(); LAST_USED.clear() 51 | 52 | async def janitor(): 53 | while True: 54 | await asyncio.sleep(60) 55 | now = time.time() 56 | async with LOCK: 57 | for sig, crawler in list(POOL.items()): 58 | if now - LAST_USED[sig] > IDLE_TTL: 59 | with suppress(Exception): await crawler.close() 60 | POOL.pop(sig, None); LAST_USED.pop(sig, None) 61 | -------------------------------------------------------------------------------- /deploy/docker/job.py: -------------------------------------------------------------------------------- 1 | """ 2 | Job endpoints (enqueue + poll) for long-running LLM extraction and raw crawl.
3 | Relies on the existing Redis task helpers in api.py 4 | """ 5 | 6 | from typing import Dict, Optional, Callable 7 | from fastapi import APIRouter, BackgroundTasks, Depends, Request 8 | from pydantic import BaseModel, HttpUrl 9 | 10 | from api import ( 11 | handle_llm_request, 12 | handle_crawl_job, 13 | handle_task_status, 14 | ) 15 | 16 | # ------------- dependency placeholders ------------- 17 | _redis = None # will be injected from server.py 18 | _config = None 19 | _token_dep: Callable = lambda: None # dummy until injected 20 | 21 | # public router 22 | router = APIRouter() 23 | 24 | 25 | # === init hook called by server.py ========================================= 26 | def init_job_router(redis, config, token_dep) -> APIRouter: 27 | """Inject shared singletons and return the router for mounting.""" 28 | global _redis, _config, _token_dep 29 | _redis, _config, _token_dep = redis, config, token_dep 30 | return router 31 | 32 | 33 | # ---------- payload models -------------------------------------------------- 34 | class LlmJobPayload(BaseModel): 35 | url: HttpUrl 36 | q: str 37 | schema: Optional[str] = None 38 | cache: bool = False 39 | 40 | 41 | class CrawlJobPayload(BaseModel): 42 | urls: list[HttpUrl] 43 | browser_config: Dict = {} 44 | crawler_config: Dict = {} 45 | 46 | 47 | # ---------- LLM job --------------------------------------------------------- 48 | @router.post("/llm/job", status_code=202) 49 | async def llm_job_enqueue( 50 | payload: LlmJobPayload, 51 | background_tasks: BackgroundTasks, 52 | request: Request, 53 | _td: Dict = Depends(lambda: _token_dep()), # late-bound dep 54 | ): 55 | return await handle_llm_request( 56 | _redis, 57 | background_tasks, 58 | request, 59 | str(payload.url), 60 | query=payload.q, 61 | schema=payload.schema, 62 | cache=payload.cache, 63 | config=_config, 64 | ) 65 | 66 | 67 | @router.get("/llm/job/{task_id}") 68 | async def llm_job_status( 69 | request: Request, 70 | task_id: str, 71 | _td: Dict = Depends(lambda: _token_dep()) 72 | ): 73 | return await handle_task_status(_redis, task_id) 74 | 75 | 76 | # ---------- CRAWL job ------------------------------------------------------- 77 | @router.post("/crawl/job", status_code=202) 78 | async def crawl_job_enqueue( 79 | payload: CrawlJobPayload, 80 | background_tasks: BackgroundTasks, 81 | _td: Dict = Depends(lambda: _token_dep()), 82 | ): 83 | return await handle_crawl_job( 84 | _redis, 85 | background_tasks, 86 | [str(u) for u in payload.urls], 87 | payload.browser_config, 88 | payload.crawler_config, 89 | config=_config, 90 | ) 91 | 92 | 93 | @router.get("/crawl/job/{task_id}") 94 | async def crawl_job_status( 95 | request: Request, 96 | task_id: str, 97 | _td: Dict = Depends(lambda: _token_dep()) 98 | ): 99 | return await handle_task_status(_redis, task_id, base_url=str(request.base_url)) 100 | -------------------------------------------------------------------------------- /deploy/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.115.12 2 | uvicorn>=0.34.2 3 | gunicorn>=23.0.0 4 | slowapi==0.1.9 5 | prometheus-fastapi-instrumentator>=7.1.0 6 | redis>=5.2.1 7 | jwt>=1.3.1 8 | dnspython>=2.7.0 9 | email-validator==2.2.0 10 | sse-starlette==2.2.1 11 | pydantic>=2.11 12 | rank-bm25==0.2.2 13 | anyio==4.9.0 14 | PyJWT==2.10.1 15 | mcp>=1.6.0 16 | websockets>=15.0.1 17 | -------------------------------------------------------------------------------- /deploy/docker/schemas.py: 
-------------------------------------------------------------------------------- 1 | from typing import List, Optional, Dict 2 | from enum import Enum 3 | from pydantic import BaseModel, Field 4 | from utils import FilterType 5 | 6 | 7 | class CrawlRequest(BaseModel): 8 | urls: List[str] = Field(min_length=1, max_length=100) 9 | browser_config: Optional[Dict] = Field(default_factory=dict) 10 | crawler_config: Optional[Dict] = Field(default_factory=dict) 11 | 12 | class MarkdownRequest(BaseModel): 13 | """Request body for the /md endpoint.""" 14 | url: str = Field(..., description="Absolute http/https URL to fetch") 15 | f: FilterType = Field(FilterType.FIT, 16 | description="Content‑filter strategy: FIT, RAW, BM25, or LLM") 17 | q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") 18 | c: Optional[str] = Field("0", description="Cache‑bust / revision counter") 19 | 20 | 21 | class RawCode(BaseModel): 22 | code: str 23 | 24 | class HTMLRequest(BaseModel): 25 | url: str 26 | 27 | class ScreenshotRequest(BaseModel): 28 | url: str 29 | screenshot_wait_for: Optional[float] = 2 30 | output_path: Optional[str] = None 31 | 32 | class PDFRequest(BaseModel): 33 | url: str 34 | output_path: Optional[str] = None 35 | 36 | 37 | class JSEndpointRequest(BaseModel): 38 | url: str 39 | scripts: List[str] = Field( 40 | ..., 41 | description="List of separated JavaScript snippets to execute" 42 | ) -------------------------------------------------------------------------------- /deploy/docker/supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | nodaemon=true ; Run supervisord in the foreground 3 | logfile=/dev/null ; Log supervisord output to stdout/stderr 4 | logfile_maxbytes=0 5 | 6 | [program:redis] 7 | command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine 8 | user=appuser ; Run redis as our non-root user 9 | autorestart=true 10 | priority=10 11 | stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout 12 | stdout_logfile_maxbytes=0 13 | stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr 14 | stderr_logfile_maxbytes=0 15 | 16 | [program:gunicorn] 17 | command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app 18 | directory=/app ; Working directory for the app 19 | user=appuser ; Run gunicorn as our non-root user 20 | autorestart=true 21 | priority=20 22 | environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs 23 | stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout 24 | stdout_logfile_maxbytes=0 25 | stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr 26 | stderr_logfile_maxbytes=0 27 | 28 | # Optional: Add filebeat or other logging agents here if needed -------------------------------------------------------------------------------- /deploy/docker/utils.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import logging 3 | import yaml 4 | from datetime import datetime 5 | from enum import Enum 6 | from pathlib import Path 7 | from fastapi import Request 8 | from typing import Dict, Optional 9 | 10 | class TaskStatus(str, Enum): 11 | PROCESSING = "processing" 12 | FAILED = "failed" 13 | COMPLETED = "completed" 14 | 15 | class FilterType(str, Enum): 16 | RAW = "raw" 17 | FIT = 
"fit" 18 | BM25 = "bm25" 19 | LLM = "llm" 20 | 21 | def load_config() -> Dict: 22 | """Load and return application configuration.""" 23 | config_path = Path(__file__).parent / "config.yml" 24 | with open(config_path, "r") as config_file: 25 | return yaml.safe_load(config_file) 26 | 27 | def setup_logging(config: Dict) -> None: 28 | """Configure application logging.""" 29 | logging.basicConfig( 30 | level=config["logging"]["level"], 31 | format=config["logging"]["format"] 32 | ) 33 | 34 | def get_base_url(request: Request) -> str: 35 | """Get base URL including scheme and host.""" 36 | return f"{request.url.scheme}://{request.url.netloc}" 37 | 38 | def is_task_id(value: str) -> bool: 39 | """Check if the value matches task ID pattern.""" 40 | return value.startswith("llm_") and "_" in value 41 | 42 | def datetime_handler(obj: any) -> Optional[str]: 43 | """Handle datetime serialization for JSON.""" 44 | if hasattr(obj, 'isoformat'): 45 | return obj.isoformat() 46 | raise TypeError(f"Object of type {type(obj)} is not JSON serializable") 47 | 48 | def should_cleanup_task(created_at: str, ttl_seconds: int = 3600) -> bool: 49 | """Check if task should be cleaned up based on creation time.""" 50 | created = datetime.fromisoformat(created_at) 51 | return (datetime.now() - created).total_seconds() > ttl_seconds 52 | 53 | def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]: 54 | """Decode Redis hash data from bytes to strings.""" 55 | return {k.decode('utf-8'): v.decode('utf-8') for k, v in hash_data.items()} 56 | 57 | 58 | 59 | def verify_email_domain(email: str) -> bool: 60 | try: 61 | domain = email.split('@')[1] 62 | # Try to resolve MX records for the domain. 63 | records = dns.resolver.resolve(domain, 'MX') 64 | return True if records else False 65 | except Exception as e: 66 | return False -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | # Shared configuration for all environments 4 | x-base-config: &base-config 5 | ports: 6 | - "11235:11235" # Gunicorn port 7 | env_file: 8 | - .llm.env # API keys (create from .llm.env.example) 9 | environment: 10 | - OPENAI_API_KEY=${OPENAI_API_KEY:-} 11 | - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} 12 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} 13 | - GROQ_API_KEY=${GROQ_API_KEY:-} 14 | - TOGETHER_API_KEY=${TOGETHER_API_KEY:-} 15 | - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} 16 | - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-} 17 | volumes: 18 | - /dev/shm:/dev/shm # Chromium performance 19 | deploy: 20 | resources: 21 | limits: 22 | memory: 4G 23 | reservations: 24 | memory: 1G 25 | restart: unless-stopped 26 | healthcheck: 27 | test: ["CMD", "curl", "-f", "http://localhost:11235/health"] 28 | interval: 30s 29 | timeout: 10s 30 | retries: 3 31 | start_period: 40s 32 | user: "appuser" 33 | 34 | services: 35 | crawl4ai: 36 | # 1. Default: Pull multi-platform test image from Docker Hub 37 | # 2. Override with local image via: IMAGE=local-test docker compose up 38 | image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}} 39 | 40 | # Local build config (used with --build) 41 | build: 42 | context: . 
43 | dockerfile: Dockerfile 44 | args: 45 | INSTALL_TYPE: ${INSTALL_TYPE:-default} 46 | ENABLE_GPU: ${ENABLE_GPU:-false} 47 | 48 | # Inherit shared config 49 | <<: *base-config -------------------------------------------------------------------------------- /docs/apps/linkdin/schemas/company_card.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "LinkedIn Company Card", 3 | "baseSelector": "div.search-results-container ul[role='list'] > li", 4 | "fields": [ 5 | { 6 | "name": "handle", 7 | "selector": "a[href*='/company/']", 8 | "type": "attribute", 9 | "attribute": "href" 10 | }, 11 | { 12 | "name": "profile_image", 13 | "selector": "a[href*='/company/'] img", 14 | "type": "attribute", 15 | "attribute": "src" 16 | }, 17 | { 18 | "name": "name", 19 | "selector": "span[class*='t-16'] a", 20 | "type": "text" 21 | }, 22 | { 23 | "name": "descriptor", 24 | "selector": "div[class*='t-black t-normal']", 25 | "type": "text" 26 | }, 27 | { 28 | "name": "about", 29 | "selector": "p[class*='entity-result__summary--2-lines']", 30 | "type": "text" 31 | }, 32 | { 33 | "name": "followers", 34 | "selector": "div:contains('followers')", 35 | "type": "regex", 36 | "pattern": "(\\d+)\\s*followers" 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /docs/apps/linkdin/schemas/people_card.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "LinkedIn People Card", 3 | "baseSelector": "li.org-people-profile-card__profile-card-spacing", 4 | "fields": [ 5 | { 6 | "name": "profile_url", 7 | "selector": "a.eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo", 8 | "type": "attribute", 9 | "attribute": "href" 10 | }, 11 | { 12 | "name": "name", 13 | "selector": ".artdeco-entity-lockup__title .lt-line-clamp--single-line", 14 | "type": "text" 15 | }, 16 | { 17 | "name": "headline", 18 | "selector": ".artdeco-entity-lockup__subtitle .lt-line-clamp--multi-line", 19 | "type": "text" 20 | }, 21 | { 22 | "name": "followers", 23 | "selector": ".lt-line-clamp--multi-line.t-12", 24 | "type": "text" 25 | }, 26 | { 27 | "name": "connection_degree", 28 | "selector": ".artdeco-entity-lockup__badge .artdeco-entity-lockup__degree", 29 | "type": "text" 30 | }, 31 | { 32 | "name": "avatar_url", 33 | "selector": ".artdeco-entity-lockup__image img", 34 | "type": "attribute", 35 | "attribute": "src" 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /docs/apps/linkdin/templates/ai.js: -------------------------------------------------------------------------------- 1 | // ==== File: ai.js ==== 2 | 3 | class ApiHandler { 4 | constructor(apiKey = null) { 5 | this.apiKey = apiKey || localStorage.getItem("openai_api_key") || ""; 6 | console.log("ApiHandler ready"); 7 | } 8 | 9 | setApiKey(k) { 10 | this.apiKey = k.trim(); 11 | if (this.apiKey) localStorage.setItem("openai_api_key", this.apiKey); 12 | } 13 | 14 | async *chatStream(messages, {model = "gpt-4o", temperature = 0.7} = {}) { 15 | if (!this.apiKey) throw new Error("OpenAI API key missing"); 16 | const payload = {model, messages, stream: true, max_tokens: 1024}; 17 | const controller = new AbortController(); 18 | 19 | const res = await fetch("https://api.openai.com/v1/chat/completions", { 20 | method: "POST", 21 | headers: { 22 | "Content-Type": "application/json", 23 | Authorization: `Bearer ${this.apiKey}`, 24 | }, 25 | body: JSON.stringify(payload), 26 | signal: controller.signal, 27 | 
}); 28 | if (!res.ok) throw new Error(`OpenAI: ${res.statusText}`); 29 | const reader = res.body.getReader(); 30 | const dec = new TextDecoder(); 31 | 32 | let buf = ""; 33 | while (true) { 34 | const {done, value} = await reader.read(); 35 | if (done) break; 36 | buf += dec.decode(value, {stream: true}); 37 | for (const line of buf.split("\n")) { 38 | if (!line.startsWith("data: ")) continue; 39 | if (line.includes("[DONE]")) return; 40 | const json = JSON.parse(line.slice(6)); 41 | const delta = json.choices?.[0]?.delta?.content; 42 | if (delta) yield delta; 43 | } 44 | buf = buf.endsWith("\n") ? "" : buf; // keep partial line 45 | } 46 | } 47 | } 48 | 49 | window.API = new ApiHandler(); 50 | -------------------------------------------------------------------------------- /docs/assets/pitch-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/assets/pitch-dark.png -------------------------------------------------------------------------------- /docs/assets/pitch-dark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/assets/powered-by-dark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/assets/powered-by-disco.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/assets/powered-by-light.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/assets/powered-by-night.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/examples/README_BUILTIN_BROWSER.md: -------------------------------------------------------------------------------- 1 | # Builtin Browser in Crawl4AI 2 | 3 | This document explains the builtin browser feature in Crawl4AI and how to use it effectively. 4 | 5 | ## What is the Builtin Browser? 6 | 7 | The builtin browser is a persistent Chrome instance that Crawl4AI manages for you. It runs in the background and can be used by multiple crawling operations, eliminating the need to start and stop browsers for each crawl. 8 | 9 | Benefits include: 10 | - **Faster startup times** - The browser is already running, so your scripts start faster 11 | - **Shared resources** - All your crawling scripts can use the same browser instance 12 | - **Simplified management** - No need to worry about CDP URLs or browser processes 13 | - **Persistent cookies and sessions** - Browser state persists between script runs 14 | - **Less resource usage** - Only one browser instance for multiple scripts 15 | 16 | ## Using the Builtin Browser 17 | 18 | ### In Python Code 19 | 20 | Using the builtin browser in your code is simple: 21 | 22 | ```python 23 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig 24 | 25 | # Create browser config with builtin mode 26 | browser_config = BrowserConfig( 27 | browser_mode="builtin", # This is the key setting! 
28 | headless=True # Can be headless or not 29 | ) 30 | 31 | # Create the crawler 32 | crawler = AsyncWebCrawler(config=browser_config) 33 | 34 | # Use it - no need to explicitly start() 35 | result = await crawler.arun("https://example.com") 36 | ``` 37 | 38 | Key points: 39 | 1. Set `browser_mode="builtin"` in your BrowserConfig 40 | 2. No need for explicit `start()` call - the crawler will automatically connect to the builtin browser 41 | 3. No need to use a context manager or call `close()` - the browser stays running 42 | 43 | ### Via CLI 44 | 45 | The CLI provides commands to manage the builtin browser: 46 | 47 | ```bash 48 | # Start the builtin browser 49 | crwl browser start 50 | 51 | # Check its status 52 | crwl browser status 53 | 54 | # Open a visible window to see what the browser is doing 55 | crwl browser view --url https://example.com 56 | 57 | # Stop it when no longer needed 58 | crwl browser stop 59 | 60 | # Restart with different settings 61 | crwl browser restart --no-headless 62 | ``` 63 | 64 | When crawling via CLI, simply add the builtin browser mode: 65 | 66 | ```bash 67 | crwl https://example.com -b "browser_mode=builtin" 68 | ``` 69 | 70 | ## How It Works 71 | 72 | 1. When a crawler with `browser_mode="builtin"` is created: 73 | - It checks if a builtin browser is already running 74 | - If not, it automatically launches one 75 | - It connects to the browser via CDP (Chrome DevTools Protocol) 76 | 77 | 2. The browser process continues running after your script exits 78 | - This means it's ready for the next crawl 79 | - You can manage it via the CLI commands 80 | 81 | 3. During installation, Crawl4AI attempts to create a builtin browser automatically 82 | 83 | ## Example 84 | 85 | See the [builtin_browser_example.py](builtin_browser_example.py) file for a complete example. 86 | 87 | Run it with: 88 | 89 | ```bash 90 | python builtin_browser_example.py 91 | ``` 92 | 93 | ## When to Use 94 | 95 | The builtin browser is ideal for: 96 | - Scripts that run frequently 97 | - Development and testing workflows 98 | - Applications that need to minimize startup time 99 | - Systems where you want to manage browser instances centrally 100 | 101 | You might not want to use it when: 102 | - Running one-off scripts 103 | - When you need different browser configurations for different tasks 104 | - In environments where persistent processes are not allowed 105 | 106 | ## Troubleshooting 107 | 108 | If you encounter issues: 109 | 110 | 1. Check the browser status: 111 | ``` 112 | crwl browser status 113 | ``` 114 | 115 | 2. Try restarting it: 116 | ``` 117 | crwl browser restart 118 | ``` 119 | 120 | 3. 
If problems persist, stop it and let Crawl4AI start a fresh one: 121 | ``` 122 | crwl browser stop 123 | ``` -------------------------------------------------------------------------------- /docs/examples/arun_vs_arun_many.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode 4 | from crawl4ai.async_configs import CrawlerRunConfig 5 | from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter 6 | 7 | VERBOSE = False 8 | 9 | async def crawl_sequential(urls): 10 | config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE) 11 | results = [] 12 | start_time = time.perf_counter() 13 | async with AsyncWebCrawler() as crawler: 14 | for url in urls: 15 | result_container = await crawler.arun(url=url, config=config) 16 | results.append(result_container[0]) 17 | total_time = time.perf_counter() - start_time 18 | return total_time, results 19 | 20 | async def crawl_parallel_dispatcher(urls): 21 | config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE) 22 | # Dispatcher with rate limiter enabled (default behavior) 23 | dispatcher = MemoryAdaptiveDispatcher( 24 | rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3), 25 | max_session_permit=50, 26 | ) 27 | start_time = time.perf_counter() 28 | async with AsyncWebCrawler() as crawler: 29 | result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher) 30 | results = [] 31 | if isinstance(result_container, list): 32 | results = result_container 33 | else: 34 | async for res in result_container: 35 | results.append(res) 36 | total_time = time.perf_counter() - start_time 37 | return total_time, results 38 | 39 | async def crawl_parallel_no_rate_limit(urls): 40 | config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE) 41 | # Dispatcher with no rate limiter and a high session permit to avoid queuing 42 | dispatcher = MemoryAdaptiveDispatcher( 43 | rate_limiter=None, 44 | max_session_permit=len(urls) # allow all URLs concurrently 45 | ) 46 | start_time = time.perf_counter() 47 | async with AsyncWebCrawler() as crawler: 48 | result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher) 49 | results = [] 50 | if isinstance(result_container, list): 51 | results = result_container 52 | else: 53 | async for res in result_container: 54 | results.append(res) 55 | total_time = time.perf_counter() - start_time 56 | return total_time, results 57 | 58 | async def main(): 59 | urls = ["https://example.com"] * 100 60 | print(f"Crawling {len(urls)} URLs sequentially...") 61 | seq_time, seq_results = await crawl_sequential(urls) 62 | print(f"Sequential crawling took: {seq_time:.2f} seconds\n") 63 | 64 | print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...") 65 | disp_time, disp_results = await crawl_parallel_dispatcher(urls) 66 | print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n") 67 | 68 | print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...") 69 | no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls) 70 | print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n") 71 | 72 | print("Crawl4ai - Crawling Comparison") 73 | print("--------------------------------------------------------") 74 | print(f"Sequential crawling took: {seq_time:.2f} seconds") 75 
| print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds") 76 | print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds") 77 | 78 | if __name__ == "__main__": 79 | asyncio.run(main()) 80 | -------------------------------------------------------------------------------- /docs/examples/assets/audio.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/audio.mp3 -------------------------------------------------------------------------------- /docs/examples/assets/basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/basic.png -------------------------------------------------------------------------------- /docs/examples/assets/cosine_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/cosine_extraction.png -------------------------------------------------------------------------------- /docs/examples/assets/css_js.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/css_js.png -------------------------------------------------------------------------------- /docs/examples/assets/css_selector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/css_selector.png -------------------------------------------------------------------------------- /docs/examples/assets/exec_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/exec_script.png -------------------------------------------------------------------------------- /docs/examples/assets/llm_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/llm_extraction.png -------------------------------------------------------------------------------- /docs/examples/assets/semantic_extraction_cosine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/semantic_extraction_cosine.png -------------------------------------------------------------------------------- /docs/examples/assets/semantic_extraction_llm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/examples/assets/semantic_extraction_llm.png -------------------------------------------------------------------------------- /docs/examples/async_webcrawler_multiple_urls_example.py: -------------------------------------------------------------------------------- 1 | # File: async_webcrawler_multiple_urls_example.py 2 | import os, 
sys 3 | 4 | # append 2 parent directories to sys.path to import crawl4ai 5 | parent_dir = os.path.dirname( 6 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | ) 8 | sys.path.append(parent_dir) 9 | 10 | import asyncio 11 | from crawl4ai import AsyncWebCrawler 12 | 13 | 14 | async def main(): 15 | # Initialize the AsyncWebCrawler 16 | async with AsyncWebCrawler(verbose=True) as crawler: 17 | # List of URLs to crawl 18 | urls = [ 19 | "https://example.com", 20 | "https://python.org", 21 | "https://github.com", 22 | "https://stackoverflow.com", 23 | "https://news.ycombinator.com", 24 | ] 25 | 26 | # Set up crawling parameters 27 | word_count_threshold = 100 28 | 29 | # Run the crawling process for multiple URLs 30 | results = await crawler.arun_many( 31 | urls=urls, 32 | word_count_threshold=word_count_threshold, 33 | bypass_cache=True, 34 | verbose=True, 35 | ) 36 | 37 | # Process the results 38 | for result in results: 39 | if result.success: 40 | print(f"Successfully crawled: {result.url}") 41 | print(f"Title: {result.metadata.get('title', 'N/A')}") 42 | print(f"Word count: {len(result.markdown.split())}") 43 | print( 44 | f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}" 45 | ) 46 | print(f"Number of images: {len(result.media.get('images', []))}") 47 | print("---") 48 | else: 49 | print(f"Failed to crawl: {result.url}") 50 | print(f"Error: {result.error_message}") 51 | print("---") 52 | 53 | 54 | if __name__ == "__main__": 55 | asyncio.run(main()) 56 | -------------------------------------------------------------------------------- /docs/examples/builtin_browser_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Builtin Browser Example 4 | 5 | This example demonstrates how to use Crawl4AI's builtin browser feature, 6 | which simplifies the browser management process. With builtin mode: 7 | 8 | - No need to manually start or connect to a browser 9 | - No need to manage CDP URLs or browser processes 10 | - Automatically connects to an existing browser or launches one if needed 11 | - Browser persists between script runs, reducing startup time 12 | - No explicit cleanup or close() calls needed 13 | 14 | The example also demonstrates "auto-starting" where you don't need to explicitly 15 | call start() method on the crawler. 16 | """ 17 | 18 | import asyncio 19 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 20 | import time 21 | 22 | async def crawl_with_builtin_browser(): 23 | """ 24 | Simple example of crawling with the builtin browser. 25 | 26 | Key features: 27 | 1. browser_mode="builtin" in BrowserConfig 28 | 2. No explicit start() call needed 29 | 3. No explicit close() needed 30 | """ 31 | print("\n=== Crawl4AI Builtin Browser Example ===\n") 32 | 33 | # Create a browser configuration with builtin mode 34 | browser_config = BrowserConfig( 35 | browser_mode="builtin", # This is the key setting! 
36 | headless=True # Can run headless for background operation 37 | ) 38 | 39 | # Create crawler run configuration 40 | crawler_config = CrawlerRunConfig( 41 | cache_mode=CacheMode.BYPASS, # Skip cache for this demo 42 | screenshot=True, # Take a screenshot 43 | verbose=True # Show verbose logging 44 | ) 45 | 46 | # Create the crawler instance 47 | # Note: We don't need to use "async with" context manager 48 | crawler = AsyncWebCrawler(config=browser_config) 49 | 50 | # Start crawling several URLs - no explicit start() needed! 51 | # The crawler will automatically connect to the builtin browser 52 | print("\n➡️ Crawling first URL...") 53 | t0 = time.time() 54 | result1 = await crawler.arun( 55 | url="https://crawl4ai.com", 56 | config=crawler_config 57 | ) 58 | t1 = time.time() 59 | print(f"✅ First URL crawled in {t1-t0:.2f} seconds") 60 | print(f" Got {len(result1.markdown.raw_markdown)} characters of content") 61 | print(f" Title: {result1.metadata.get('title', 'No title')}") 62 | 63 | # Try another URL - the browser is already running, so this should be faster 64 | print("\n➡️ Crawling second URL...") 65 | t0 = time.time() 66 | result2 = await crawler.arun( 67 | url="https://example.com", 68 | config=crawler_config 69 | ) 70 | t1 = time.time() 71 | print(f"✅ Second URL crawled in {t1-t0:.2f} seconds") 72 | print(f" Got {len(result2.markdown.raw_markdown)} characters of content") 73 | print(f" Title: {result2.metadata.get('title', 'No title')}") 74 | 75 | # The builtin browser continues running in the background 76 | # No need to explicitly close it 77 | print("\n🔄 The builtin browser remains running for future use") 78 | print(" You can use 'crwl browser status' to check its status") 79 | print(" or 'crwl browser stop' to stop it when completely done") 80 | 81 | async def main(): 82 | """Run the example""" 83 | await crawl_with_builtin_browser() 84 | 85 | if __name__ == "__main__": 86 | asyncio.run(main()) -------------------------------------------------------------------------------- /docs/examples/chainlit.md: -------------------------------------------------------------------------------- 1 | # Welcome to Crawl4AI! 🚀🤖 2 | 3 | Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context. 
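The flow can be sketched in a few lines: crawl the URL shared in the conversation, then hand the resulting markdown to the model as context. The snippet below is a minimal sketch using the `AsyncWebCrawler` API from the surrounding examples; the `build_prompt` helper and the prompt wording are illustrative assumptions rather than part of the Chainlit app itself.

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def build_prompt(url: str, question: str) -> str:
    # Crawl the shared URL and keep its markdown as context
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
    context = result.markdown.raw_markdown if result.success else ""
    # Assumed prompt assembly; pass this string to whichever LLM client you use
    return f"Answer using this page as context:\n\n{context}\n\nQuestion: {question}"

if __name__ == "__main__":
    print(asyncio.run(build_prompt("https://example.com", "What is this page about?"))[:500])
```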
-------------------------------------------------------------------------------- /docs/examples/cli/browser.yml: -------------------------------------------------------------------------------- 1 | browser_type: "chromium" 2 | headless: true 3 | viewport_width: 1280 4 | viewport_height: 800 5 | user_agent_mode: "random" 6 | verbose: true 7 | text_mode: false 8 | light_mode: false 9 | ignore_https_errors: true 10 | java_script_enabled: true 11 | extra_args: 12 | - "--disable-gpu" 13 | - "--no-sandbox" -------------------------------------------------------------------------------- /docs/examples/cli/crawler.yml: -------------------------------------------------------------------------------- 1 | cache_mode: "bypass" 2 | wait_until: "networkidle" 3 | page_timeout: 30000 4 | delay_before_return_html: 0.5 5 | word_count_threshold: 100 6 | scan_full_page: true 7 | scroll_delay: 0.3 8 | process_iframes: false 9 | remove_overlay_elements: true 10 | magic: true 11 | verbose: true 12 | exclude_external_links: true 13 | exclude_social_media_links: true -------------------------------------------------------------------------------- /docs/examples/cli/css_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ArticleExtractor", 3 | "baseSelector": ".cards[data-tax=news] .card__data", 4 | "fields": [ 5 | { 6 | "name": "title", 7 | "selector": "h4.card__title", 8 | "type": "text" 9 | }, 10 | { 11 | "name": "link", 12 | "selector": "h4.card__title a", 13 | "type": "attribute", 14 | "attribute": "href" 15 | }, 16 | { 17 | "name": "details", 18 | "selector": ".card__details", 19 | "type": "text" 20 | }, 21 | { 22 | "name": "topics", 23 | "selector": ".card__topics.topics", 24 | "type": "text" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /docs/examples/cli/extract.yml: -------------------------------------------------------------------------------- 1 | type: "llm" 2 | provider: "openai/gpt-4o-mini" 3 | api_token: "env:OPENAI_API_KEY" 4 | instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format" 5 | params: 6 | chunk_token_threshold: 4096 7 | overlap_rate: 0.1 8 | word_token_rate: 0.75 9 | temperature: 0.3 10 | max_tokens: 1000 11 | verbose: true -------------------------------------------------------------------------------- /docs/examples/cli/extract_css.yml: -------------------------------------------------------------------------------- 1 | type: "json-css" 2 | params: 3 | verbose: true -------------------------------------------------------------------------------- /docs/examples/cli/llm_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "NewsArticle", 3 | "type": "object", 4 | "properties": { 5 | "title": { 6 | "type": "string", 7 | "description": "The title/headline of the news article" 8 | }, 9 | "link": { 10 | "type": "string", 11 | "description": "The URL or link to the full article" 12 | }, 13 | "details": { 14 | "type": "string", 15 | "description": "Brief summary or details about the article content" 16 | }, 17 | "topics": { 18 | "type": "array", 19 | "items": { 20 | "type": "string" 21 | }, 22 | "description": "List of topics or categories associated with the article" 23 | } 24 | }, 25 | "required": ["title", "details"] 26 | } -------------------------------------------------------------------------------- /docs/examples/crawlai_vs_firecrawl.py: 
-------------------------------------------------------------------------------- 1 | import os, time 2 | 3 | # append the path to the root of the project 4 | import sys 5 | import asyncio 6 | 7 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) 8 | from firecrawl import FirecrawlApp 9 | from crawl4ai import AsyncWebCrawler 10 | 11 | __data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data" 12 | 13 | 14 | async def compare(): 15 | app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) 16 | 17 | # Tet Firecrawl with a simple crawl 18 | start = time.time() 19 | scrape_status = app.scrape_url( 20 | "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} 21 | ) 22 | end = time.time() 23 | print(f"Time taken: {end - start} seconds") 24 | print(len(scrape_status["markdown"])) 25 | # save the markdown content with provider name 26 | with open(f"{__data__}/firecrawl_simple.md", "w") as f: 27 | f.write(scrape_status["markdown"]) 28 | # Count how many "cldnry.s-nbcnews.com" are in the markdown 29 | print(scrape_status["markdown"].count("cldnry.s-nbcnews.com")) 30 | 31 | async with AsyncWebCrawler() as crawler: 32 | start = time.time() 33 | result = await crawler.arun( 34 | url="https://www.nbcnews.com/business", 35 | # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], 36 | word_count_threshold=0, 37 | bypass_cache=True, 38 | verbose=False, 39 | ) 40 | end = time.time() 41 | print(f"Time taken: {end - start} seconds") 42 | print(len(result.markdown)) 43 | # save the markdown content with provider name 44 | with open(f"{__data__}/crawl4ai_simple.md", "w") as f: 45 | f.write(result.markdown) 46 | # count how many "cldnry.s-nbcnews.com" are in the markdown 47 | print(result.markdown.count("cldnry.s-nbcnews.com")) 48 | 49 | start = time.time() 50 | result = await crawler.arun( 51 | url="https://www.nbcnews.com/business", 52 | js_code=[ 53 | "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" 54 | ], 55 | word_count_threshold=0, 56 | bypass_cache=True, 57 | verbose=False, 58 | ) 59 | end = time.time() 60 | print(f"Time taken: {end - start} seconds") 61 | print(len(result.markdown)) 62 | # save the markdown content with provider name 63 | with open(f"{__data__}/crawl4ai_js.md", "w") as f: 64 | f.write(result.markdown) 65 | # count how many "cldnry.s-nbcnews.com" are in the markdown 66 | print(result.markdown.count("cldnry.s-nbcnews.com")) 67 | 68 | 69 | if __name__ == "__main__": 70 | asyncio.run(compare()) 71 | -------------------------------------------------------------------------------- /docs/examples/docker_python_sdk.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai.docker_client import Crawl4aiDockerClient 3 | from crawl4ai import ( 4 | BrowserConfig, 5 | CrawlerRunConfig 6 | ) 7 | 8 | async def main(): 9 | async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: 10 | # If jwt is enabled, authenticate first 11 | # await client.authenticate("test@example.com") 12 | 13 | # Non-streaming crawl 14 | results = await client.crawl( 15 | ["https://example.com", "https://python.org"], 16 | browser_config=BrowserConfig(headless=True), 17 | crawler_config=CrawlerRunConfig() 18 | ) 19 | 
print(f"Non-streaming results: {results}") 20 | 21 | # Streaming crawl 22 | crawler_config = CrawlerRunConfig(stream=True) 23 | async for result in await client.crawl( 24 | ["https://example.com", "https://python.org"], 25 | browser_config=BrowserConfig(headless=True), 26 | crawler_config=crawler_config 27 | ): 28 | print(f"Streamed result: {result}") 29 | 30 | # Get schema 31 | schema = await client.get_schema() 32 | print(f"Schema: {schema}") 33 | 34 | if __name__ == "__main__": 35 | asyncio.run(main()) -------------------------------------------------------------------------------- /docs/examples/full_page_screenshot_and_pdf_export.md: -------------------------------------------------------------------------------- 1 | # Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI 2 | 3 | When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences. 4 | 5 | **The New Approach:** 6 | We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic. 7 | 8 | **Key Benefits:** 9 | - **Reliability:** The PDF export never times out and works regardless of page length. 10 | - **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing. 11 | - **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime. 12 | 13 | **Simple Example:** 14 | ```python 15 | import os 16 | import sys 17 | import asyncio 18 | from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig 19 | 20 | # Adjust paths as needed 21 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | sys.path.append(parent_dir) 23 | __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) 24 | 25 | async def main(): 26 | async with AsyncWebCrawler() as crawler: 27 | # Request both PDF and screenshot 28 | result = await crawler.arun( 29 | url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', 30 | config=CrawlerRunConfig( 31 | cache_mode=CacheMode.BYPASS, 32 | pdf=True, 33 | screenshot=True 34 | ) 35 | ) 36 | 37 | if result.success: 38 | # Save screenshot 39 | if result.screenshot: 40 | from base64 import b64decode 41 | with open(os.path.join(__location__, "screenshot.png"), "wb") as f: 42 | f.write(b64decode(result.screenshot)) 43 | 44 | # Save PDF 45 | if result.pdf: 46 | with open(os.path.join(__location__, "page.pdf"), "wb") as f: 47 | f.write(result.pdf) 48 | 49 | if __name__ == "__main__": 50 | asyncio.run(main()) 51 | ``` 52 | 53 | **What Happens Under the Hood:** 54 | - Crawl4AI navigates to the target page. 55 | - If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length. 56 | - If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling. 57 | - Finally, you get your PDF and/or screenshot ready to use. 
58 | 59 | **Conclusion:** 60 | With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages. -------------------------------------------------------------------------------- /docs/examples/hello_world.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import ( 3 | AsyncWebCrawler, 4 | BrowserConfig, 5 | CrawlerRunConfig, 6 | DefaultMarkdownGenerator, 7 | PruningContentFilter, 8 | CrawlResult 9 | ) 10 | 11 | 12 | async def main(): 13 | browser_config = BrowserConfig( 14 | headless=False, 15 | verbose=True, 16 | ) 17 | async with AsyncWebCrawler(config=browser_config) as crawler: 18 | crawler_config = CrawlerRunConfig( 19 | markdown_generator=DefaultMarkdownGenerator( 20 | content_filter=PruningContentFilter() 21 | ), 22 | ) 23 | result: CrawlResult = await crawler.arun( 24 | url="https://www.helloworld.org", config=crawler_config 25 | ) 26 | print(result.markdown.raw_markdown[:500]) 27 | 28 | if __name__ == "__main__": 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/examples/language_support_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy 3 | 4 | 5 | async def main(): 6 | # Example 1: Setting language when creating the crawler 7 | crawler1 = AsyncWebCrawler( 8 | crawler_strategy=AsyncPlaywrightCrawlerStrategy( 9 | headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"} 10 | ) 11 | ) 12 | result1 = await crawler1.arun("https://www.example.com") 13 | print( 14 | "Example 1 result:", result1.extracted_content[:100] 15 | ) # Print first 100 characters 16 | 17 | # Example 2: Setting language before crawling 18 | crawler2 = AsyncWebCrawler() 19 | crawler2.crawler_strategy.headers[ 20 | "Accept-Language" 21 | ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7" 22 | result2 = await crawler2.arun("https://www.example.com") 23 | print("Example 2 result:", result2.extracted_content[:100]) 24 | 25 | # Example 3: Setting language when calling arun method 26 | crawler3 = AsyncWebCrawler() 27 | result3 = await crawler3.arun( 28 | "https://www.example.com", 29 | headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}, 30 | ) 31 | print("Example 3 result:", result3.extracted_content[:100]) 32 | 33 | # Example 4: Crawling multiple pages with different languages 34 | urls = [ 35 | ("https://www.example.com", "fr-FR,fr;q=0.9"), 36 | ("https://www.example.org", "es-ES,es;q=0.9"), 37 | ("https://www.example.net", "de-DE,de;q=0.9"), 38 | ] 39 | 40 | crawler4 = AsyncWebCrawler() 41 | results = await asyncio.gather( 42 | *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls] 43 | ) 44 | 45 | for url, result in zip([u for u, _ in urls], results): 46 | print(f"Result for {url}:", result.extracted_content[:100]) 47 | 48 | 49 | if __name__ == "__main__": 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /docs/examples/llm_extraction_openai_pricing.py: -------------------------------------------------------------------------------- 1 | from crawl4ai import LLMConfig 2 | from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy 3 | import asyncio 4 | import os 5 | import json 6 | from 
pydantic import BaseModel, Field 7 | 8 | url = "https://openai.com/api/pricing/" 9 | 10 | 11 | class OpenAIModelFee(BaseModel): 12 | model_name: str = Field(..., description="Name of the OpenAI model.") 13 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") 14 | output_fee: str = Field( 15 | ..., description="Fee for output token for the OpenAI model." 16 | ) 17 | 18 | async def main(): 19 | # Use AsyncWebCrawler 20 | async with AsyncWebCrawler() as crawler: 21 | result = await crawler.arun( 22 | url=url, 23 | word_count_threshold=1, 24 | extraction_strategy=LLMExtractionStrategy( 25 | # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 26 | llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")), 27 | schema=OpenAIModelFee.model_json_schema(), 28 | extraction_type="schema", 29 | instruction="From the crawled content, extract all mentioned model names along with their " 30 | "fees for input and output tokens. Make sure not to miss anything in the entire content. " 31 | "One extracted model JSON format should look like this: " 32 | '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }', 33 | ), 34 | ) 35 | print("Success:", result.success) 36 | model_fees = json.loads(result.extracted_content) 37 | print(len(model_fees)) 38 | 39 | with open(".data/data.json", "w", encoding="utf-8") as f: 40 | f.write(result.extracted_content) 41 | 42 | 43 | asyncio.run(main()) 44 | -------------------------------------------------------------------------------- /docs/examples/llm_markdown_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 4 | from crawl4ai import LLMConfig 5 | from crawl4ai.content_filter_strategy import LLMContentFilter 6 | 7 | async def test_llm_filter(): 8 | # Create an HTML source that needs intelligent filtering 9 | url = "https://docs.python.org/3/tutorial/classes.html" 10 | 11 | browser_config = BrowserConfig( 12 | headless=True, 13 | verbose=True 14 | ) 15 | 16 | # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) 17 | run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) 18 | 19 | async with AsyncWebCrawler(config=browser_config) as crawler: 20 | # First get the raw HTML 21 | result = await crawler.arun(url, config=run_config) 22 | html = result.cleaned_html 23 | 24 | # Initialize LLM filter with focused instruction 25 | filter = LLMContentFilter( 26 | llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), 27 | instruction=""" 28 | Focus on extracting the core educational content about Python classes. 29 | Include: 30 | - Key concepts and their explanations 31 | - Important code examples 32 | - Essential technical details 33 | Exclude: 34 | - Navigation elements 35 | - Sidebars 36 | - Footer content 37 | - Version information 38 | - Any non-essential UI elements 39 | 40 | Format the output as clean markdown with proper code blocks and headers. 41 | """, 42 | verbose=True 43 | ) 44 | 45 | filter = LLMContentFilter( 46 | llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), 47 | chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 48 | ignore_cache = True, 49 | instruction=""" 50 | Extract the main educational content while preserving its original wording and substance completely. Your task is to: 51 | 52 | 1. 
Maintain the exact language and terminology used in the main content 53 | 2. Keep all technical explanations, examples, and educational content intact 54 | 3. Preserve the original flow and structure of the core content 55 | 4. Remove only clearly irrelevant elements like: 56 | - Navigation menus 57 | - Advertisement sections 58 | - Cookie notices 59 | - Footers with site information 60 | - Sidebars with external links 61 | - Any UI elements that don't contribute to learning 62 | 63 | The goal is to create a clean markdown version that reads exactly like the original article, 64 | keeping all valuable content but free from distracting elements. Imagine you're creating 65 | a perfect reading experience where nothing valuable is lost, but all noise is removed. 66 | """, 67 | verbose=True 68 | ) 69 | 70 | # Apply filtering 71 | filtered_content = filter.filter_content(html) 72 | 73 | # Show results 74 | print("\nFiltered Content Length:", len(filtered_content)) 75 | print("\nFirst 500 chars of filtered content:") 76 | if filtered_content: 77 | print(filtered_content[0][:500]) 78 | 79 | # Save on disc the markdown version 80 | with open("filtered_content.md", "w", encoding="utf-8") as f: 81 | f.write("\n".join(filtered_content)) 82 | 83 | # Show token usage 84 | filter.show_usage() 85 | 86 | if __name__ == "__main__": 87 | asyncio.run(test_llm_filter()) -------------------------------------------------------------------------------- /docs/examples/markdown/content_source_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example showing how to use the content_source parameter to control HTML input for markdown generation. 3 | """ 4 | import asyncio 5 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator 6 | 7 | async def demo_content_source(): 8 | """Demonstrates different content_source options for markdown generation.""" 9 | url = "https://example.com" # Simple demo site 10 | 11 | print("Crawling with different content_source options...") 12 | 13 | # --- Example 1: Default Behavior (cleaned_html) --- 14 | # This uses the HTML after it has been processed by the scraping strategy 15 | # The HTML is cleaned, simplified, and optimized for readability 16 | default_generator = DefaultMarkdownGenerator() # content_source="cleaned_html" is default 17 | default_config = CrawlerRunConfig(markdown_generator=default_generator) 18 | 19 | # --- Example 2: Raw HTML --- 20 | # This uses the original HTML directly from the webpage 21 | # Preserves more original content but may include navigation, ads, etc. 
22 | raw_generator = DefaultMarkdownGenerator(content_source="raw_html") 23 | raw_config = CrawlerRunConfig(markdown_generator=raw_generator) 24 | 25 | # --- Example 3: Fit HTML --- 26 | # This uses preprocessed HTML optimized for schema extraction 27 | # Better for structured data extraction but may lose some formatting 28 | fit_generator = DefaultMarkdownGenerator(content_source="fit_html") 29 | fit_config = CrawlerRunConfig(markdown_generator=fit_generator) 30 | 31 | # Execute all three crawlers in sequence 32 | async with AsyncWebCrawler() as crawler: 33 | # Default (cleaned_html) 34 | result_default = await crawler.arun(url=url, config=default_config) 35 | 36 | # Raw HTML 37 | result_raw = await crawler.arun(url=url, config=raw_config) 38 | 39 | # Fit HTML 40 | result_fit = await crawler.arun(url=url, config=fit_config) 41 | 42 | # Print a summary of the results 43 | print("\nMarkdown Generation Results:\n") 44 | 45 | print("1. Default (cleaned_html):") 46 | print(f" Length: {len(result_default.markdown.raw_markdown)} chars") 47 | print(f" First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n") 48 | 49 | print("2. Raw HTML:") 50 | print(f" Length: {len(result_raw.markdown.raw_markdown)} chars") 51 | print(f" First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n") 52 | 53 | print("3. Fit HTML:") 54 | print(f" Length: {len(result_fit.markdown.raw_markdown)} chars") 55 | print(f" First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n") 56 | 57 | # Demonstrate differences in output 58 | print("\nKey Takeaways:") 59 | print("- cleaned_html: Best for readable, focused content") 60 | print("- raw_html: Preserves more original content, but may include noise") 61 | print("- fit_html: Optimized for schema extraction and structured data") 62 | 63 | if __name__ == "__main__": 64 | asyncio.run(demo_content_source()) -------------------------------------------------------------------------------- /docs/examples/markdown/content_source_short_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy 3 | """ 4 | 5 | import asyncio 6 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator 7 | 8 | async def demo_markdown_source_config(): 9 | print("\n=== Demo: Configuring Markdown Source ===") 10 | 11 | # Example 1: Generate markdown from cleaned HTML (default behavior) 12 | cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html") 13 | config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator) 14 | 15 | async with AsyncWebCrawler() as crawler: 16 | result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned) 17 | print("Markdown from Cleaned HTML (default):") 18 | print(f" Length: {len(result_cleaned.markdown.raw_markdown)}") 19 | print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...") 20 | 21 | # Example 2: Generate markdown directly from raw HTML 22 | raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html") 23 | config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator) 24 | 25 | async with AsyncWebCrawler() as crawler: 26 | result_raw = await crawler.arun(url="https://example.com", config=config_raw) 27 | print("\nMarkdown from Raw HTML:") 28 | print(f" Length: {len(result_raw.markdown.raw_markdown)}") 29 | print(f" Start: {result_raw.markdown.raw_markdown[:100]}...") 30 | 31 | # Example 3: Generate markdown from 
preprocessed 'fit' HTML 32 | fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html") 33 | config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator) 34 | 35 | async with AsyncWebCrawler() as crawler: 36 | result_fit = await crawler.arun(url="https://example.com", config=config_fit) 37 | print("\nMarkdown from Fit HTML:") 38 | print(f" Length: {len(result_fit.markdown.raw_markdown)}") 39 | print(f" Start: {result_fit.markdown.raw_markdown[:100]}...") 40 | 41 | if __name__ == "__main__": 42 | asyncio.run(demo_markdown_source_config()) -------------------------------------------------------------------------------- /docs/examples/rest_call.py: -------------------------------------------------------------------------------- 1 | import requests, base64, os 2 | 3 | data = { 4 | "urls": ["https://www.nbcnews.com/business"], 5 | "screenshot": True, 6 | } 7 | 8 | response = requests.post("https://crawl4ai.com/crawl", json=data) 9 | result = response.json()["results"][0] 10 | print(result.keys()) 11 | # dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 12 | # 'links', 'screenshot', 'markdown', 'extracted_content', 13 | # 'metadata', 'error_message']) 14 | with open("screenshot.png", "wb") as f: 15 | f.write(base64.b64decode(result["screenshot"])) 16 | 17 | # Example of filtering the content using CSS selectors 18 | data = { 19 | "urls": ["https://www.nbcnews.com/business"], 20 | "css_selector": "article", 21 | "screenshot": True, 22 | } 23 | 24 | # Example of executing a JS script on the page before extracting the content 25 | data = { 26 | "urls": ["https://www.nbcnews.com/business"], 27 | "screenshot": True, 28 | "js": [ 29 | """ 30 | const loadMoreButton = Array.from(document.querySelectorAll('button')). 31 | find(button => button.textContent.includes('Load More')); 32 | loadMoreButton && loadMoreButton.click(); 33 | """ 34 | ], 35 | } 36 | 37 | # Example of using a custom extraction strategy 38 | data = { 39 | "urls": ["https://www.nbcnews.com/business"], 40 | "extraction_strategy": "CosineStrategy", 41 | "extraction_strategy_args": {"semantic_filter": "inflation rent prices"}, 42 | } 43 | 44 | # Example of using LLM to extract content 45 | data = { 46 | "urls": ["https://www.nbcnews.com/business"], 47 | "extraction_strategy": "LLMExtractionStrategy", 48 | "extraction_strategy_args": { 49 | "provider": "groq/llama3-8b-8192", 50 | "api_token": os.environ.get("GROQ_API_KEY"), 51 | "instruction": """I am interested in only financial news, 52 | and translate them in French.""", 53 | }, 54 | } 55 | -------------------------------------------------------------------------------- /docs/examples/session_id_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import ( 3 | AsyncWebCrawler, 4 | BrowserConfig, 5 | CrawlerRunConfig, 6 | DefaultMarkdownGenerator, 7 | PruningContentFilter, 8 | CrawlResult 9 | ) 10 | 11 | 12 | 13 | async def main(): 14 | browser_config = BrowserConfig( 15 | headless=False, 16 | verbose=True, 17 | ) 18 | async with AsyncWebCrawler(config=browser_config) as crawler: 19 | crawler_config = CrawlerRunConfig( 20 | session_id= "hello_world", # This help us to use the same page 21 | ) 22 | result : CrawlResult = await crawler.arun( 23 | url="https://www.helloworld.org", config=crawler_config 24 | ) 25 | # Add a breakpoint here, then you will the page is open and browser is not closed 26 | print(result.markdown.raw_markdown[:500]) 27 | 28 | new_config = 
crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True) 29 | result : CrawlResult = await crawler.arun( # This time there is no fetch; it only executes JS in the same opened page 30 | url="https://www.helloworld.org", config=new_config 31 | ) 32 | print(result.js_execution_result) # You should see {'data':'hello'} in the console 33 | 34 | # Get direct access to the Playwright page object. This works only if you use the same session_id and pass the same config 35 | page, context = crawler.crawler_strategy.get_page(new_config) 36 | 37 | if __name__ == "__main__": 38 | asyncio.run(main()) 39 | -------------------------------------------------------------------------------- /docs/examples/ssl_example.py: -------------------------------------------------------------------------------- 1 | """Example showing how to work with SSL certificates in Crawl4AI.""" 2 | 3 | import asyncio 4 | import os 5 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode 6 | 7 | # Create tmp directory if it doesn't exist 8 | parent_dir = os.path.dirname( 9 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 10 | ) 11 | tmp_dir = os.path.join(parent_dir, "tmp") 12 | os.makedirs(tmp_dir, exist_ok=True) 13 | 14 | 15 | async def main(): 16 | # Configure crawler to fetch SSL certificate 17 | config = CrawlerRunConfig( 18 | fetch_ssl_certificate=True, 19 | cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates 20 | ) 21 | 22 | async with AsyncWebCrawler() as crawler: 23 | result = await crawler.arun(url="https://example.com", config=config) 24 | 25 | if result.success and result.ssl_certificate: 26 | cert = result.ssl_certificate 27 | 28 | # 1. Access certificate properties directly 29 | print("\nCertificate Information:") 30 | print(f"Issuer: {cert.issuer.get('CN', '')}") 31 | print(f"Valid until: {cert.valid_until}") 32 | print(f"Fingerprint: {cert.fingerprint}") 33 | 34 | # 2. 
Export certificate in different formats 35 | cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis 36 | print("\nCertificate exported to:") 37 | print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") 38 | 39 | pem_data = cert.to_pem( 40 | os.path.join(tmp_dir, "certificate.pem") 41 | ) # For web servers 42 | print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") 43 | 44 | der_data = cert.to_der( 45 | os.path.join(tmp_dir, "certificate.der") 46 | ) # For Java apps 47 | print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") 48 | 49 | 50 | if __name__ == "__main__": 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /docs/examples/summarize_page.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from crawl4ai.web_crawler import WebCrawler 4 | from crawl4ai.chunking_strategy import * 5 | from crawl4ai.extraction_strategy import * 6 | from crawl4ai.crawler_strategy import * 7 | 8 | url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot" 9 | 10 | crawler = WebCrawler() 11 | crawler.warmup() 12 | 13 | from pydantic import BaseModel, Field 14 | 15 | 16 | class PageSummary(BaseModel): 17 | title: str = Field(..., description="Title of the page.") 18 | summary: str = Field(..., description="Summary of the page.") 19 | brief_summary: str = Field(..., description="Brief summary of the page.") 20 | keywords: list = Field(..., description="Keywords assigned to the page.") 21 | 22 | 23 | result = crawler.run( 24 | url=url, 25 | word_count_threshold=1, 26 | extraction_strategy=LLMExtractionStrategy( 27 | provider="openai/gpt-4o", 28 | api_token=os.getenv("OPENAI_API_KEY"), 29 | schema=PageSummary.model_json_schema(), 30 | extraction_type="schema", 31 | apply_chunking=False, 32 | instruction="From the crawled content, extract the following details: " 33 | "1. Title of the page " 34 | "2. Summary of the page, which is a detailed summary " 35 | "3. Brief summary of the page, which is a paragraph text " 36 | "4. Keywords assigned to the page, which is a list of keywords. " 37 | "The extracted JSON format should look like this: " 38 | '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }', 39 | ), 40 | bypass_cache=True, 41 | ) 42 | 43 | page_summary = json.loads(result.extracted_content) 44 | 45 | print(page_summary) 46 | 47 | with open(".data/page_summary.json", "w", encoding="utf-8") as f: 48 | f.write(result.extracted_content) 49 | -------------------------------------------------------------------------------- /docs/examples/use_geo_location.py: -------------------------------------------------------------------------------- 1 | # use_geo_location.py 2 | """ 3 | Example: override locale, timezone, and geolocation using Crawl4ai patterns. 4 | 5 | This demo uses `AsyncWebCrawler.arun()` to fetch a page with 6 | browser context primed for specific locale, timezone, and GPS, 7 | and saves a screenshot for visual verification. 
8 | """ 9 | 10 | import asyncio 11 | import base64 12 | from pathlib import Path 13 | from typing import List 14 | from crawl4ai import ( 15 | AsyncWebCrawler, 16 | CrawlerRunConfig, 17 | BrowserConfig, 18 | GeolocationConfig, 19 | CrawlResult, 20 | ) 21 | 22 | async def demo_geo_override(): 23 | """Demo: Crawl a geolocation-test page with overrides and screenshot.""" 24 | print("\n=== Geo-Override Crawl ===") 25 | 26 | # 1) Browser setup: use Playwright-managed contexts 27 | browser_cfg = BrowserConfig( 28 | headless=False, 29 | viewport_width=1280, 30 | viewport_height=720, 31 | use_managed_browser=False, 32 | ) 33 | 34 | # 2) Run config: include locale, timezone_id, geolocation, and screenshot 35 | run_cfg = CrawlerRunConfig( 36 | url="https://browserleaks.com/geo", # test page that shows your location 37 | locale="en-US", # Accept-Language & UI locale 38 | timezone_id="America/Los_Angeles", # JS Date()/Intl timezone 39 | geolocation=GeolocationConfig( # override GPS coords 40 | latitude=34.0522, 41 | longitude=-118.2437, 42 | accuracy=10.0, 43 | ), 44 | screenshot=True, # capture screenshot after load 45 | session_id="geo_test", # reuse context if rerunning 46 | delay_before_return_html=5 47 | ) 48 | 49 | async with AsyncWebCrawler(config=browser_cfg) as crawler: 50 | # 3) Run crawl (returns list even for single URL) 51 | results: List[CrawlResult] = await crawler.arun( 52 | url=run_cfg.url, 53 | config=run_cfg, 54 | ) 55 | result = results[0] 56 | 57 | # 4) Save screenshot and report path 58 | if result.screenshot: 59 | __current_dir = Path(__file__).parent 60 | out_dir = __current_dir / "tmp" 61 | out_dir.mkdir(exist_ok=True) 62 | shot_path = out_dir / "geo_test.png" 63 | with open(shot_path, "wb") as f: 64 | f.write(base64.b64decode(result.screenshot)) 65 | print(f"Saved screenshot to {shot_path}") 66 | else: 67 | print("No screenshot captured, check configuration.") 68 | 69 | if __name__ == "__main__": 70 | asyncio.run(demo_geo_override()) 71 | -------------------------------------------------------------------------------- /docs/md_v2/advanced/crawl-dispatcher.md: -------------------------------------------------------------------------------- 1 | # Crawl Dispatcher 2 | 3 | We’re excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler’s status, memory usage, and overall progress. 4 | 5 | Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X. 6 | 7 | Below is a **sample** of how the dispatcher’s performance monitor might look in action: 8 | 9 |  10 | 11 | 12 | We can’t wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates! 
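While the dispatcher is still in development, here is a rough sketch of how it is expected to plug into `arun_many()`. Treat it as a preview rather than a final API: the `MemoryAdaptiveDispatcher` name and its parameters below are assumptions drawn from the current `async_dispatcher` module and may change before release.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Assumed import path; the dispatcher code currently lives in crawl4ai/async_dispatcher.py
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher


async def main():
    urls = [f"https://example.com/page/{i}" for i in range(100)]  # placeholder URLs

    # Assumed knobs: back off when system memory is high, cap concurrent browser sessions
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,
        max_session_permit=20,
    )

    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        # arun_many() is expected to hand the URL list to the dispatcher, which schedules
        # crawls against available memory/CPU and reports progress while they run
        results = await crawler.arun_many(urls=urls, config=run_config, dispatcher=dispatcher)
        print(f"Crawled {sum(r.success for r in results)}/{len(results)} pages")


if __name__ == "__main__":
    asyncio.run(main())
```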
-------------------------------------------------------------------------------- /docs/md_v2/advanced/proxy-security.md: -------------------------------------------------------------------------------- 1 | # Proxy 2 | 3 | ## Basic Proxy Setup 4 | 5 | Simple proxy configuration with `BrowserConfig`: 6 | 7 | ```python 8 | from crawl4ai.async_configs import BrowserConfig 9 | 10 | # Using proxy URL 11 | browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") 12 | async with AsyncWebCrawler(config=browser_config) as crawler: 13 | result = await crawler.arun(url="https://example.com") 14 | 15 | # Using SOCKS proxy 16 | browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080") 17 | async with AsyncWebCrawler(config=browser_config) as crawler: 18 | result = await crawler.arun(url="https://example.com") 19 | ``` 20 | 21 | ## Authenticated Proxy 22 | 23 | Use an authenticated proxy with `BrowserConfig`: 24 | 25 | ```python 26 | from crawl4ai.async_configs import BrowserConfig 27 | 28 | proxy_config = { 29 | "server": "http://proxy.example.com:8080", 30 | "username": "user", 31 | "password": "pass" 32 | } 33 | 34 | browser_config = BrowserConfig(proxy_config=proxy_config) 35 | async with AsyncWebCrawler(config=browser_config) as crawler: 36 | result = await crawler.arun(url="https://example.com") 37 | ``` 38 | 39 | 40 | 41 | ## Rotating Proxies 42 | 43 | Example using a proxy rotation service dynamically: 44 | 45 | ```python 46 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig 47 | 48 | async def get_next_proxy(): 49 | # Your proxy rotation logic here 50 | return {"server": "http://next.proxy.com:8080"} 51 | 52 | async def main(): 53 | browser_config = BrowserConfig() 54 | run_config = CrawlerRunConfig() 55 | urls = ["https://example.com/page1", "https://example.com/page2"]  # example URLs to crawl through rotating proxies 56 | async with AsyncWebCrawler(config=browser_config) as crawler: 57 | # For each URL, create a new run config with different proxy 58 | for url in urls: 59 | proxy = await get_next_proxy() 60 | # Clone the config and update proxy - this creates a new browser context 61 | current_config = run_config.clone(proxy_config=proxy) 62 | result = await crawler.arun(url=url, config=current_config) 63 | 64 | if __name__ == "__main__": 65 | import asyncio 66 | asyncio.run(main()) 67 | ``` 68 | 69 | -------------------------------------------------------------------------------- /docs/md_v2/ask_ai/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 |tag 9 | 10 | // Ensure the <pre> tag can contain a positioned button 11 | if (window.getComputedStyle(preElement).position === 'static') { 12 | preElement.style.position = 'relative'; 13 | } 14 | 15 | // Create the button 16 | const copyButton = document.createElement('button'); 17 | copyButton.className = 'copy-code-button'; 18 | copyButton.type = 'button'; 19 | copyButton.setAttribute('aria-label', 'Copy code to clipboard'); 20 | copyButton.title = 'Copy code to clipboard'; 21 | copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class 22 | 23 | // Append the button to the <pre> element 24 | preElement.appendChild(copyButton); 25 | 26 | // Add click event listener 27 | copyButton.addEventListener('click', () => { 28 | copyCodeToClipboard(codeElement, copyButton); 29 | }); 30 | }); 31 | 32 | async function copyCodeToClipboard(codeElement, button) { 33 | // Use innerText to get the rendered text content, preserving line breaks 34 | const textToCopy = codeElement.innerText; 35 | 36 | try { 37 | await navigator.clipboard.writeText(textToCopy); 38 | 39 | // Visual feedback 40 | button.innerHTML = 'Copied!'; 41 | button.classList.add('copied'); 42 | button.disabled = true; // Temporarily disable 43 | 44 | // Revert button state after a short delay 45 | setTimeout(() => { 46 | button.innerHTML = 'Copy'; 47 | button.classList.remove('copied'); 48 | button.disabled = false; 49 | }, 2000); // Show "Copied!" for 2 seconds 50 | 51 | } catch (err) { 52 | console.error('Failed to copy code: ', err); 53 | // Optional: Provide error feedback on the button 54 | button.innerHTML = 'Error'; 55 | setTimeout(() => { 56 | button.innerHTML = 'Copy'; 57 | }, 2000); 58 | } 59 | } 60 | 61 | console.log("Copy Code Button script loaded."); 62 | }); -------------------------------------------------------------------------------- /docs/md_v2/assets/docs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/md_v2/assets/docs.zip -------------------------------------------------------------------------------- /docs/md_v2/assets/floating_ask_ai_button.js: -------------------------------------------------------------------------------- 1 | // ==== File: docs/assets/floating_ask_ai_button.js ==== 2 | 3 | document.addEventListener('DOMContentLoaded', () => { 4 | const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed! 5 | const currentPath = window.location.pathname; 6 | 7 | // Determine the base URL for constructing the link correctly, 8 | // especially if deployed in a sub-directory. 9 | // This assumes a simple structure; adjust if needed. 10 | const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..'
: ''); 11 | 12 | 13 | // Check if the current page IS the Ask AI page 14 | // Use includes() for flexibility (handles trailing slash or .html) 15 | if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check 16 | console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself."); 17 | return; // Don't add the button on the target page 18 | } 19 | 20 | // --- Create the button --- 21 | const fabLink = document.createElement('a'); 22 | fabLink.className = 'floating-ask-ai-button'; 23 | fabLink.href = askAiPagePath; // Construct the correct URL 24 | fabLink.title = 'Ask Crawl4AI Assistant'; 25 | fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant'); 26 | 27 | // Add content (using SVG icon for better visuals) 28 | fabLink.innerHTML = ` 29 | 32 | Ask AI 33 | `; 34 | 35 | // Append to body 36 | document.body.appendChild(fabLink); 37 | 38 | console.log("Floating Ask AI Button added."); 39 | }); -------------------------------------------------------------------------------- /docs/md_v2/assets/highlight.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/md_v2/assets/highlight.css -------------------------------------------------------------------------------- /docs/md_v2/assets/highlight_init.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', (event) => { 2 | document.querySelectorAll('pre code').forEach((block) => { 3 | hljs.highlightBlock(block); 4 | }); 5 | }); 6 | -------------------------------------------------------------------------------- /docs/md_v2/assets/images/dispatcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/md_v2/assets/images/dispatcher.png -------------------------------------------------------------------------------- /docs/md_v2/blog/articles/dockerize_hooks.md: -------------------------------------------------------------------------------- 1 | ## Introducing Event Streams and Interactive Hooks in Crawl4AI 2 | 3 |  4 | 5 | In the near future, I’m planning to enhance Crawl4AI’s capabilities by introducing an event stream mechanism that will give clients deeper, real-time insights into the crawling process. Today, hooks are a powerful feature at the code level—they let developers define custom logic at key points in the crawl. However, when using Crawl4AI as a service (e.g., through a Dockerized API), there isn’t an easy way to interact with these hooks at runtime. 6 | 7 | **What’s Changing?** 8 | 9 | I’m working on a solution that will allow the crawler to emit a continuous stream of events, updating clients on the current crawling stage, encountered pages, and any decision points. This event stream could be exposed over a standardized protocol like Server-Sent Events (SSE) or WebSockets, enabling clients to “subscribe” and listen as the crawler works. 10 | 11 | **Interactivity Through Process IDs** 12 | 13 | A key part of this new design is the concept of a unique process ID for each crawl session. 
Imagine you’re listening to an event stream that informs you: 14 | - The crawler just hit a certain page 15 | - It triggered a hook and is now pausing for instructions 16 | 17 | With the event stream in place, you can send a follow-up request back to the server—referencing the unique process ID—to provide extra data, instructions, or parameters. This might include selecting which links to follow next, adjusting extraction strategies, or providing authentication tokens for a protected API. Once the crawler receives these instructions, it resumes execution with the updated context. 18 | 19 | ```mermaid 20 | sequenceDiagram 21 | participant Client 22 | participant Server 23 | participant Crawler 24 | 25 | Client->>Server: Start crawl request 26 | Server->>Crawler: Initiate crawl with Process ID 27 | Crawler-->>Server: Event: Page hit 28 | Server-->>Client: Stream: Page hit event 29 | Client->>Server: Instruction for Process ID 30 | Server->>Crawler: Update crawl with new instructions 31 | Crawler-->>Server: Event: Crawl completed 32 | Server-->>Client: Stream: Crawl completed 33 | ``` 34 | 35 | **Benefits for Developers and Users** 36 | 37 | 1. **Fine-Grained Control**: Instead of predefining all logic upfront, you can dynamically guide the crawler in response to actual data and conditions encountered mid-crawl. 38 | 2. **Real-Time Insights**: Monitor progress, errors, or network bottlenecks as they happen, without waiting for the entire crawl to finish. 39 | 3. **Enhanced Collaboration**: Different team members or automated systems can watch the same crawl events and provide input, making the crawling process more adaptive and intelligent. 40 | 41 | **Next Steps** 42 | 43 | I’m currently exploring the best APIs, technologies, and patterns to make this vision a reality. My goal is to deliver a seamless developer experience—one that integrates with existing Crawl4AI workflows while offering new flexibility and power. 44 | 45 | Stay tuned for more updates as I continue building this feature out. In the meantime, I’d love to hear any feedback or suggestions you might have to help shape this interactive, event-driven future of web crawling with Crawl4AI. 46 | 47 | -------------------------------------------------------------------------------- /docs/md_v2/blog/releases/0.4.2.md: -------------------------------------------------------------------------------- 1 | ## 🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier (Dec 12, 2024) 2 | 3 | ### Hey Developers, 4 | 5 | I’m excited to share Crawl4AI 0.4.2—a major upgrade that makes crawling smarter, faster, and a whole lot more intuitive. I’ve packed in a bunch of new features to simplify your workflows and improve your experience. Let’s cut to the chase! 6 | 7 | --- 8 | 9 | ### 🔧 **Configurable Browser and Crawler Behavior** 10 | 11 | You’ve asked for better control over how browsers and crawlers are configured, and now you’ve got it. With the new `BrowserConfig` and `CrawlerRunConfig` objects, you can set up your browser and crawling behavior exactly how you want. No more cluttering `arun` with a dozen arguments—just pass in your configs and go. 
12 | 13 | **Example:** 14 | ```python 15 | from crawl4ai import BrowserConfig, CrawlerRunConfig, AsyncWebCrawler 16 | 17 | browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) 18 | crawler_config = CrawlerRunConfig(cache_mode="BYPASS") 19 | 20 | async with AsyncWebCrawler(config=browser_config) as crawler: 21 | result = await crawler.arun(url="https://example.com", config=crawler_config) 22 | print(result.markdown[:500]) 23 | ``` 24 | 25 | This setup is a game-changer for scalability, keeping your code clean and flexible as we add more parameters in the future. 26 | 27 | Remember: If you like to use the old way, you can still pass arguments directly to `arun` as before, no worries! 28 | 29 | --- 30 | 31 | ### 🔐 **Streamlined Session Management** 32 | 33 | Here’s the big one: You can now pass local storage and cookies directly. Whether it’s setting values programmatically or importing a saved JSON state, managing sessions has never been easier. This is a must-have for authenticated crawls—just export your storage state once and reuse it effortlessly across runs. 34 | 35 | **Example:** 36 | 1. Open a browser, log in manually, and export the storage state. 37 | 2. Import the JSON file for seamless authenticated crawling: 38 | 39 | ```python 40 | result = await crawler.arun( 41 | url="https://example.com/protected", 42 | storage_state="my_storage_state.json" 43 | ) 44 | ``` 45 | 46 | --- 47 | 48 | ### 🔢 **Handling Large Pages: Supercharged Screenshots and PDF Conversion** 49 | 50 | Two big upgrades here: 51 | 52 | - **Blazing-fast long-page screenshots**: Turn extremely long web pages into clean, high-quality screenshots—without breaking a sweat. It’s optimized to handle large content without lag. 53 | 54 | - **Full-page PDF exports**: Now, you can also convert any page into a PDF with all the details intact. Perfect for archiving or sharing complex layouts. 55 | 56 | --- 57 | 58 | ### 🔧 **Other Cool Stuff** 59 | 60 | - **Anti-bot enhancements**: Magic mode now handles overlays, user simulation, and anti-detection features like a pro. 61 | - **JavaScript execution**: Execute custom JS snippets to handle dynamic content. No more wrestling with endless page interactions. 62 | 63 | --- 64 | 65 | ### 📊 **Performance Boosts and Dev-friendly Updates** 66 | 67 | - Faster rendering and viewport adjustments for better performance. 68 | - Improved cookie and local storage handling for seamless authentication. 69 | - Better debugging with detailed logs and actionable error messages. 70 | 71 | --- 72 | 73 | ### 🔠 **Use Cases You’ll Love** 74 | 75 | 1. **Authenticated Crawls**: Login once, export your storage state, and reuse it across multiple requests without the headache. 76 | 2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website. 77 | 3. **PDF Export**: Create professional-looking page PDFs in seconds. 78 | 79 | --- 80 | 81 | ### Let’s Get Crawling 82 | 83 | Crawl4AI 0.4.2 is ready for you to download and try. I’m always looking for ways to improve, so don’t hold back—share your thoughts and feedback. 84 | 85 | Happy Crawling! 
🚀 86 | 87 | -------------------------------------------------------------------------------- /docs/md_v2/core/ask-ai.md: -------------------------------------------------------------------------------- 1 |2 | 3 |4 | 5 | 46 | 47 | 75 | -------------------------------------------------------------------------------- /docs/md_v2/core/cache-modes.md: -------------------------------------------------------------------------------- 1 | # Crawl4AI Cache System and Migration Guide 2 | 3 | ## Overview 4 | Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. 5 | 6 | ## Old vs New Approach 7 | 8 | ### Old Way (Deprecated) 9 | The old system used multiple boolean flags: 10 | - `bypass_cache`: Skip cache entirely 11 | - `disable_cache`: Disable all caching 12 | - `no_cache_read`: Don't read from cache 13 | - `no_cache_write`: Don't write to cache 14 | 15 | ### New Way (Recommended) 16 | The new system uses a single `CacheMode` enum: 17 | - `CacheMode.ENABLED`: Normal caching (read/write) 18 | - `CacheMode.DISABLED`: No caching at all 19 | - `CacheMode.READ_ONLY`: Only read from cache 20 | - `CacheMode.WRITE_ONLY`: Only write to cache 21 | - `CacheMode.BYPASS`: Skip cache for this operation 22 | 23 | ## Migration Example 24 | 25 | ### Old Code (Deprecated) 26 | ```python 27 | import asyncio 28 | from crawl4ai import AsyncWebCrawler 29 | 30 | async def use_proxy(): 31 | async with AsyncWebCrawler(verbose=True) as crawler: 32 | result = await crawler.arun( 33 | url="https://www.nbcnews.com/business", 34 | bypass_cache=True # Old way 35 | ) 36 | print(len(result.markdown)) 37 | 38 | async def main(): 39 | await use_proxy() 40 | 41 | if __name__ == "__main__": 42 | asyncio.run(main()) 43 | ``` 44 | 45 | ### New Code (Recommended) 46 | ```python 47 | import asyncio 48 | from crawl4ai import AsyncWebCrawler, CacheMode 49 | from crawl4ai.async_configs import CrawlerRunConfig 50 | 51 | async def use_proxy(): 52 | # Use CacheMode in CrawlerRunConfig 53 | config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) 54 | async with AsyncWebCrawler(verbose=True) as crawler: 55 | result = await crawler.arun( 56 | url="https://www.nbcnews.com/business", 57 | config=config # Pass the configuration object 58 | ) 59 | print(len(result.markdown)) 60 | 61 | async def main(): 62 | await use_proxy() 63 | 64 | if __name__ == "__main__": 65 | asyncio.run(main()) 66 | ``` 67 | 68 | ## Common Migration Patterns 69 | 70 | | Old Flag | New Mode | 71 | |-----------------------|---------------------------------| 72 | | `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | 73 | | `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| 74 | | `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | 75 | | `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | -------------------------------------------------------------------------------- /docs/snippets/deep_crawl/1.intro.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List 3 | 4 | from crawl4ai import ( 5 | AsyncWebCrawler, 6 | CrawlerRunConfig, 7 | BFSDeepCrawlStrategy, 8 | CrawlResult, 9 | FilterChain, 10 | DomainFilter, 11 | URLPatternFilter, 12 | ) 13 | 14 | # Import necessary classes from crawl4ai library: 15 | # - AsyncWebCrawler: The main class for web crawling. 16 | # - CrawlerRunConfig: Configuration class for crawler behavior. 
17 | # - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy. 18 | # - CrawlResult: Data model for individual crawl results. 19 | # - FilterChain: Used to chain multiple URL filters. 20 | # - URLPatternFilter: Filter URLs based on patterns. 21 | # You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct, 22 | # but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py 23 | 24 | async def basic_deep_crawl(): 25 | """ 26 | Performs a basic deep crawl starting from a seed URL, demonstrating: 27 | - Breadth-First Search (BFS) deep crawling strategy. 28 | - Filtering URLs based on URL patterns. 29 | - Accessing crawl results and metadata. 30 | """ 31 | 32 | # 1. Define URL Filters: 33 | # Create a URLPatternFilter to include only URLs containing "text". 34 | # This filter will be used to restrict crawling to URLs that are likely to contain textual content. 35 | url_filter = URLPatternFilter( 36 | patterns=[ 37 | "*text*", # Include URLs that contain "text" in their path or URL 38 | ] 39 | ) 40 | 41 | # Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain. 42 | # This filter will be used to restrict crawling to URLs within the "groq.com" domain. 43 | domain_filter = DomainFilter( 44 | allowed_domains=["groq.com"], 45 | blocked_domains=["example.com"], 46 | ) 47 | 48 | # 2. Configure CrawlerRunConfig for Deep Crawling: 49 | # Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling. 50 | config = CrawlerRunConfig( 51 | deep_crawl_strategy=BFSDeepCrawlStrategy( 52 | max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL 53 | max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling 54 | include_external=False, # Set to False to only crawl URLs within the same domain as the start URL 55 | filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl 56 | ), 57 | verbose=True, # Enable verbose logging to see detailed output during crawling 58 | ) 59 | 60 | # 3. Initialize and Run AsyncWebCrawler: 61 | # Use AsyncWebCrawler as a context manager for automatic start and close. 62 | async with AsyncWebCrawler() as crawler: 63 | results: List[CrawlResult] = await crawler.arun( 64 | # url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL 65 | url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation 66 | config=config, # Pass the configured CrawlerRunConfig to arun method 67 | ) 68 | 69 | # 4. Process and Print Crawl Results: 70 | # Iterate through the list of CrawlResult objects returned by the deep crawl. 71 | for result in results: 72 | # Print the URL and its crawl depth from the metadata for each crawled URL. 
73 | print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}") 74 | 75 | 76 | if __name__ == "__main__": 77 | import asyncio 78 | asyncio.run(basic_deep_crawl()) 79 | -------------------------------------------------------------------------------- /docs/tutorials/coming_soon.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/docs/tutorials/coming_soon.md -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Crawl4AI Documentation (v0.6.x) 2 | site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper 3 | site_url: https://docs.crawl4ai.com 4 | repo_url: https://github.com/unclecode/crawl4ai 5 | repo_name: unclecode/crawl4ai 6 | docs_dir: docs/md_v2 7 | 8 | nav: 9 | - Home: 'index.md' 10 | - "Ask AI": "core/ask-ai.md" 11 | - "Quick Start": "core/quickstart.md" 12 | - "Code Examples": "core/examples.md" 13 | - Setup & Installation: 14 | - "Installation": "core/installation.md" 15 | - "Docker Deployment": "core/docker-deployment.md" 16 | - "Blog & Changelog": 17 | - "Blog Home": "blog/index.md" 18 | - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md" 19 | - Core: 20 | - "Command Line Interface": "core/cli.md" 21 | - "Simple Crawling": "core/simple-crawling.md" 22 | - "Deep Crawling": "core/deep-crawling.md" 23 | - "Crawler Result": "core/crawler-result.md" 24 | - "Browser, Crawler & LLM Config": "core/browser-crawler-config.md" 25 | - "Markdown Generation": "core/markdown-generation.md" 26 | - "Fit Markdown": "core/fit-markdown.md" 27 | - "Page Interaction": "core/page-interaction.md" 28 | - "Content Selection": "core/content-selection.md" 29 | - "Cache Modes": "core/cache-modes.md" 30 | - "Local Files & Raw HTML": "core/local-files.md" 31 | - "Link & Media": "core/link-media.md" 32 | - Advanced: 33 | - "Overview": "advanced/advanced-features.md" 34 | - "File Downloading": "advanced/file-downloading.md" 35 | - "Lazy Loading": "advanced/lazy-loading.md" 36 | - "Hooks & Auth": "advanced/hooks-auth.md" 37 | - "Proxy & Security": "advanced/proxy-security.md" 38 | - "Session Management": "advanced/session-management.md" 39 | - "Multi-URL Crawling": "advanced/multi-url-crawling.md" 40 | - "Crawl Dispatcher": "advanced/crawl-dispatcher.md" 41 | - "Identity Based Crawling": "advanced/identity-based-crawling.md" 42 | - "SSL Certificate": "advanced/ssl-certificate.md" 43 | - "Network & Console Capture": "advanced/network-console-capture.md" 44 | - Extraction: 45 | - "LLM-Free Strategies": "extraction/no-llm-strategies.md" 46 | - "LLM Strategies": "extraction/llm-strategies.md" 47 | - "Clustering Strategies": "extraction/clustring-strategies.md" 48 | - "Chunking": "extraction/chunking.md" 49 | - API Reference: 50 | - "AsyncWebCrawler": "api/async-webcrawler.md" 51 | - "arun()": "api/arun.md" 52 | - "arun_many()": "api/arun_many.md" 53 | - "Browser, Crawler & LLM Config": "api/parameters.md" 54 | - "CrawlResult": "api/crawl-result.md" 55 | - "Strategies": "api/strategies.md" 56 | 57 | theme: 58 | name: 'terminal' 59 | palette: 'dark' 60 | icon: 61 | repo: fontawesome/brands/github 62 | 63 | plugins: 64 | - search 65 | 66 | markdown_extensions: 67 | - pymdownx.highlight: 68 | anchor_linenums: true 69 | - pymdownx.inlinehilite 70 | - pymdownx.snippets 71 | - pymdownx.superfences 72 | - 
admonition 73 | - pymdownx.details 74 | - attr_list 75 | - tables 76 | 77 | extra: 78 | version: !ENV [CRAWL4AI_VERSION, 'development'] 79 | 80 | extra_css: 81 | - assets/layout.css 82 | - assets/styles.css 83 | - assets/highlight.css 84 | - assets/dmvendor.css 85 | 86 | extra_javascript: 87 | - assets/highlight.min.js 88 | - assets/highlight_init.js 89 | - https://buttons.github.io/buttons.js 90 | - assets/toc.js 91 | - assets/github_stats.js 92 | - assets/selection_ask_ai.js 93 | - assets/copy_code.js 94 | - assets/floating_ask_ai_button.js 95 | - assets/mobile_menu.js -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "Crawl4AI" 7 | dynamic = ["version"] 8 | description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" 9 | readme = "README.md" 10 | requires-python = ">=3.9" 11 | license = "Apache-2.0" 12 | authors = [ 13 | {name = "Unclecode", email = "unclecode@kidocode.com"} 14 | ] 15 | dependencies = [ 16 | "aiosqlite~=0.20", 17 | "lxml~=5.3", 18 | "litellm>=1.53.1", 19 | "numpy>=1.26.0,<3", 20 | "pillow~=10.4", 21 | "playwright>=1.49.0", 22 | "python-dotenv~=1.0", 23 | "requests~=2.26", 24 | "beautifulsoup4~=4.12", 25 | "tf-playwright-stealth>=1.1.0", 26 | "xxhash~=3.4", 27 | "rank-bm25~=0.2", 28 | "aiofiles>=24.1.0", 29 | "colorama~=0.4", 30 | "snowballstemmer~=2.2", 31 | "pydantic>=2.10", 32 | "pyOpenSSL>=24.3.0", 33 | "psutil>=6.1.1", 34 | "nltk>=3.9.1", 35 | "playwright", 36 | "aiofiles", 37 | "rich>=13.9.4", 38 | "cssselect>=1.2.0", 39 | "httpx>=0.27.2", 40 | "fake-useragent>=2.0.3", 41 | "click>=8.1.7", 42 | "pyperclip>=1.8.2", 43 | "chardet>=5.2.0", 44 | "aiohttp>=3.11.11", 45 | "brotli>=1.1.0", 46 | "humanize>=4.10.0", 47 | ] 48 | classifiers = [ 49 | "Development Status :: 4 - Beta", 50 | "Intended Audience :: Developers", 51 | "Programming Language :: Python :: 3", 52 | "Programming Language :: Python :: 3.9", 53 | "Programming Language :: Python :: 3.10", 54 | "Programming Language :: Python :: 3.11", 55 | "Programming Language :: Python :: 3.12", 56 | "Programming Language :: Python :: 3.13", 57 | ] 58 | 59 | [project.optional-dependencies] 60 | pdf = ["PyPDF2"] 61 | torch = ["torch", "nltk", "scikit-learn"] 62 | transformer = ["transformers", "tokenizers"] 63 | cosine = ["torch", "transformers", "nltk"] 64 | sync = ["selenium"] 65 | all = [ 66 | "PyPDF2", 67 | "torch", 68 | "nltk", 69 | "scikit-learn", 70 | "transformers", 71 | "tokenizers", 72 | "selenium", 73 | "PyPDF2" 74 | ] 75 | 76 | [project.scripts] 77 | crawl4ai-download-models = "crawl4ai.model_loader:main" 78 | crawl4ai-migrate = "crawl4ai.migrations:main" 79 | crawl4ai-setup = "crawl4ai.install:post_install" 80 | crawl4ai-doctor = "crawl4ai.install:doctor" 81 | crwl = "crawl4ai.cli:main" 82 | 83 | [tool.setuptools] 84 | packages = {find = {where = ["."], include = ["crawl4ai*"]}} 85 | 86 | [tool.setuptools.package-data] 87 | crawl4ai = ["js_snippet/*.js"] 88 | 89 | [tool.setuptools.dynamic] 90 | version = {attr = "crawl4ai.__version__.__version__"} 91 | 92 | [tool.uv.sources] 93 | crawl4ai = { workspace = true } 94 | 95 | [dependency-groups] 96 | dev = [ 97 | "crawl4ai", 98 | ] 99 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | # Note: These requirements are also specified in pyproject.toml 2 | # This file is kept for development environment setup and compatibility 3 | aiosqlite~=0.20 4 | lxml~=5.3 5 | litellm>=1.53.1 6 | numpy>=1.26.0,<3 7 | pillow~=10.4 8 | playwright>=1.49.0 9 | python-dotenv~=1.0 10 | requests~=2.26 11 | beautifulsoup4~=4.12 12 | tf-playwright-stealth>=1.1.0 13 | xxhash~=3.4 14 | rank-bm25~=0.2 15 | aiofiles>=24.1.0 16 | colorama~=0.4 17 | snowballstemmer~=2.2 18 | pydantic>=2.10 19 | pyOpenSSL>=24.3.0 20 | psutil>=6.1.1 21 | nltk>=3.9.1 22 | rich>=13.9.4 23 | cssselect>=1.2.0 24 | chardet>=5.2.0 25 | brotli>=1.1.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | include_package_data = True -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | from pathlib import Path 4 | import shutil 5 | 6 | # Note: Most configuration is now in pyproject.toml 7 | # This setup.py is kept for backwards compatibility 8 | 9 | # Create the .crawl4ai folder in the user's home directory if it doesn't exist 10 | # If the folder already exists, remove the cache folder 11 | base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") 12 | crawl4ai_folder = Path(base_dir) if base_dir else Path.home() 13 | crawl4ai_folder = crawl4ai_folder / ".crawl4ai" 14 | cache_folder = crawl4ai_folder / "cache" 15 | content_folders = [ 16 | "html_content", 17 | "cleaned_html", 18 | "markdown_content", 19 | "extracted_content", 20 | "screenshots", 21 | ] 22 | 23 | # Clean up old cache if exists 24 | if cache_folder.exists(): 25 | shutil.rmtree(cache_folder) 26 | 27 | # Create new folder structure 28 | crawl4ai_folder.mkdir(exist_ok=True) 29 | cache_folder.mkdir(exist_ok=True) 30 | for folder in content_folders: 31 | (crawl4ai_folder / folder).mkdir(exist_ok=True) 32 | 33 | version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version 34 | try: 35 | with open("crawl4ai/__version__.py") as f: 36 | for line in f: 37 | if line.startswith("__version__"): 38 | version = line.split("=")[1].strip().strip('"') 39 | break 40 | except Exception: 41 | pass # Let pyproject.toml handle version 42 | 43 | setup( 44 | name="Crawl4AI", 45 | version=version, 46 | description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", 47 | long_description=open("README.md", encoding="utf-8").read(), 48 | long_description_content_type="text/markdown", 49 | url="https://github.com/unclecode/crawl4ai", 50 | author="Unclecode", 51 | author_email="unclecode@kidocode.com", 52 | license="Apache-2.0", 53 | packages=find_packages(), 54 | package_data={"crawl4ai": ["js_snippet/*.js"]}, 55 | classifiers=[ 56 | "Development Status :: 3 - Alpha", 57 | "Intended Audience :: Developers", 58 | "Programming Language :: Python :: 3", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Programming Language :: Python :: 3.12", 63 | "Programming Language :: Python :: 3.13", 64 | ], 65 | python_requires=">=3.9", 66 | ) 67 | -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unclecode/crawl4ai/897e0173618d20fea5d8952ccdbcdad0febc0fee/tests/__init__.py -------------------------------------------------------------------------------- /tests/async/test_basic_crawling.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import time 5 | 6 | # Add the parent directory to the Python path 7 | parent_dir = os.path.dirname( 8 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | ) 10 | sys.path.append(parent_dir) 11 | 12 | from crawl4ai.async_webcrawler import AsyncWebCrawler 13 | 14 | 15 | @pytest.mark.asyncio 16 | async def test_successful_crawl(): 17 | async with AsyncWebCrawler(verbose=True) as crawler: 18 | url = "https://www.nbcnews.com/business" 19 | result = await crawler.arun(url=url, bypass_cache=True) 20 | assert result.success 21 | assert result.url == url 22 | assert result.html 23 | assert result.markdown 24 | assert result.cleaned_html 25 | 26 | 27 | @pytest.mark.asyncio 28 | async def test_invalid_url(): 29 | async with AsyncWebCrawler(verbose=True) as crawler: 30 | url = "https://www.invalidurl12345.com" 31 | result = await crawler.arun(url=url, bypass_cache=True) 32 | assert not result.success 33 | assert result.error_message 34 | 35 | 36 | @pytest.mark.asyncio 37 | async def test_multiple_urls(): 38 | async with AsyncWebCrawler(verbose=True) as crawler: 39 | urls = [ 40 | "https://www.nbcnews.com/business", 41 | "https://www.example.com", 42 | "https://www.python.org", 43 | ] 44 | results = await crawler.arun_many(urls=urls, bypass_cache=True) 45 | assert len(results) == len(urls) 46 | assert all(result.success for result in results) 47 | assert all(result.html for result in results) 48 | 49 | 50 | @pytest.mark.asyncio 51 | async def test_javascript_execution(): 52 | async with AsyncWebCrawler(verbose=True) as crawler: 53 | js_code = "document.body.innerHTML = 'Modified by JS
';" 54 | url = "https://www.example.com" 55 | result = await crawler.arun(url=url, bypass_cache=True, js_code=js_code) 56 | assert result.success 57 | assert "Modified by JS
" in result.html 58 | 59 | 60 | @pytest.mark.asyncio 61 | async def test_concurrent_crawling_performance(): 62 | async with AsyncWebCrawler(verbose=True) as crawler: 63 | urls = [ 64 | "https://www.nbcnews.com/business", 65 | "https://www.example.com", 66 | "https://www.python.org", 67 | "https://www.github.com", 68 | "https://www.stackoverflow.com", 69 | ] 70 | 71 | start_time = time.time() 72 | results = await crawler.arun_many(urls=urls, bypass_cache=True) 73 | end_time = time.time() 74 | 75 | total_time = end_time - start_time 76 | print(f"Total time for concurrent crawling: {total_time:.2f} seconds") 77 | 78 | assert all(result.success for result in results) 79 | assert len(results) == len(urls) 80 | 81 | # Assert that concurrent crawling is faster than sequential 82 | # This multiplier may need adjustment based on the number of URLs and their complexity 83 | assert ( 84 | total_time < len(urls) * 5 85 | ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds" 86 | 87 | 88 | # Entry point for debugging 89 | if __name__ == "__main__": 90 | pytest.main([__file__, "-v"]) 91 | -------------------------------------------------------------------------------- /tests/async/test_caching.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import asyncio 5 | 6 | # Add the parent directory to the Python path 7 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(parent_dir) 9 | 10 | from crawl4ai.async_webcrawler import AsyncWebCrawler 11 | 12 | 13 | @pytest.mark.asyncio 14 | async def test_caching(): 15 | async with AsyncWebCrawler(verbose=True) as crawler: 16 | url = "https://www.nbcnews.com/business" 17 | 18 | # First crawl (should not use cache) 19 | start_time = asyncio.get_event_loop().time() 20 | result1 = await crawler.arun(url=url, bypass_cache=True) 21 | end_time = asyncio.get_event_loop().time() 22 | time_taken1 = end_time - start_time 23 | 24 | assert result1.success 25 | 26 | # Second crawl (should use cache) 27 | start_time = asyncio.get_event_loop().time() 28 | result2 = await crawler.arun(url=url, bypass_cache=False) 29 | end_time = asyncio.get_event_loop().time() 30 | time_taken2 = end_time - start_time 31 | 32 | assert result2.success 33 | assert time_taken2 < time_taken1 # Cached result should be faster 34 | 35 | 36 | @pytest.mark.asyncio 37 | async def test_bypass_cache(): 38 | async with AsyncWebCrawler(verbose=True) as crawler: 39 | url = "https://www.nbcnews.com/business" 40 | 41 | # First crawl 42 | result1 = await crawler.arun(url=url, bypass_cache=False) 43 | assert result1.success 44 | 45 | # Second crawl with bypass_cache=True 46 | result2 = await crawler.arun(url=url, bypass_cache=True) 47 | assert result2.success 48 | 49 | # Content should be different (or at least, not guaranteed to be the same) 50 | assert result1.html != result2.html or result1.markdown != result2.markdown 51 | 52 | 53 | @pytest.mark.asyncio 54 | async def test_clear_cache(): 55 | async with AsyncWebCrawler(verbose=True) as crawler: 56 | url = "https://www.nbcnews.com/business" 57 | 58 | # Crawl and cache 59 | await crawler.arun(url=url, bypass_cache=False) 60 | 61 | # Clear cache 62 | await crawler.aclear_cache() 63 | 64 | # Check cache size 65 | cache_size = await crawler.aget_cache_size() 66 | assert cache_size == 0 67 | 68 | 69 | @pytest.mark.asyncio 70 | async def test_flush_cache(): 71 | async with AsyncWebCrawler(verbose=True) as crawler: 72 | url 
= "https://www.nbcnews.com/business" 73 | 74 | # Crawl and cache 75 | await crawler.arun(url=url, bypass_cache=False) 76 | 77 | # Flush cache 78 | await crawler.aflush_cache() 79 | 80 | # Check cache size 81 | cache_size = await crawler.aget_cache_size() 82 | assert cache_size == 0 83 | 84 | 85 | # Entry point for debugging 86 | if __name__ == "__main__": 87 | pytest.main([__file__, "-v"]) 88 | -------------------------------------------------------------------------------- /tests/async/test_chunking_and_extraction_strategies.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import json 5 | 6 | # Add the parent directory to the Python path 7 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(parent_dir) 9 | 10 | from crawl4ai import LLMConfig 11 | from crawl4ai.async_webcrawler import AsyncWebCrawler 12 | from crawl4ai.chunking_strategy import RegexChunking 13 | from crawl4ai.extraction_strategy import LLMExtractionStrategy 14 | 15 | 16 | @pytest.mark.asyncio 17 | async def test_regex_chunking(): 18 | async with AsyncWebCrawler(verbose=True) as crawler: 19 | url = "https://www.nbcnews.com/business" 20 | chunking_strategy = RegexChunking(patterns=["\n\n"]) 21 | result = await crawler.arun( 22 | url=url, chunking_strategy=chunking_strategy, bypass_cache=True 23 | ) 24 | assert result.success 25 | assert result.extracted_content 26 | chunks = json.loads(result.extracted_content) 27 | assert len(chunks) > 1 # Ensure multiple chunks were created 28 | 29 | 30 | # @pytest.mark.asyncio 31 | # async def test_cosine_strategy(): 32 | # async with AsyncWebCrawler(verbose=True) as crawler: 33 | # url = "https://www.nbcnews.com/business" 34 | # extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3) 35 | # result = await crawler.arun( 36 | # url=url, 37 | # extraction_strategy=extraction_strategy, 38 | # bypass_cache=True 39 | # ) 40 | # assert result.success 41 | # assert result.extracted_content 42 | # extracted_data = json.loads(result.extracted_content) 43 | # assert len(extracted_data) > 0 44 | # assert all('tags' in item for item in extracted_data) 45 | 46 | 47 | @pytest.mark.asyncio 48 | async def test_llm_extraction_strategy(): 49 | async with AsyncWebCrawler(verbose=True) as crawler: 50 | url = "https://www.nbcnews.com/business" 51 | extraction_strategy = LLMExtractionStrategy( 52 | llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), 53 | instruction="Extract only content related to technology", 54 | ) 55 | result = await crawler.arun( 56 | url=url, extraction_strategy=extraction_strategy, bypass_cache=True 57 | ) 58 | assert result.success 59 | assert result.extracted_content 60 | extracted_data = json.loads(result.extracted_content) 61 | assert len(extracted_data) > 0 62 | assert all("content" in item for item in extracted_data) 63 | 64 | 65 | # @pytest.mark.asyncio 66 | # async def test_combined_chunking_and_extraction(): 67 | # async with AsyncWebCrawler(verbose=True) as crawler: 68 | # url = "https://www.nbcnews.com/business" 69 | # chunking_strategy = RegexChunking(patterns=["\n\n"]) 70 | # extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3) 71 | # result = await crawler.arun( 72 | # url=url, 73 | # chunking_strategy=chunking_strategy, 74 | # 
extraction_strategy=extraction_strategy, 75 | # bypass_cache=True 76 | # ) 77 | # assert result.success 78 | # assert result.extracted_content 79 | # extracted_data = json.loads(result.extracted_content) 80 | # assert len(extracted_data) > 0 81 | # assert all('tags' in item for item in extracted_data) 82 | # assert all('content' in item for item in extracted_data) 83 | 84 | # Entry point for debugging 85 | if __name__ == "__main__": 86 | pytest.main([__file__, "-v"]) 87 | -------------------------------------------------------------------------------- /tests/async/test_content_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | 5 | # Add the parent directory to the Python path 6 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | sys.path.append(parent_dir) 8 | 9 | from crawl4ai.async_webcrawler import AsyncWebCrawler 10 | 11 | 12 | @pytest.mark.asyncio 13 | async def test_extract_markdown(): 14 | async with AsyncWebCrawler(verbose=True) as crawler: 15 | url = "https://www.nbcnews.com/business" 16 | result = await crawler.arun(url=url, bypass_cache=True) 17 | assert result.success 18 | assert result.markdown 19 | assert isinstance(result.markdown, str) 20 | assert len(result.markdown) > 0 21 | 22 | 23 | @pytest.mark.asyncio 24 | async def test_extract_cleaned_html(): 25 | async with AsyncWebCrawler(verbose=True) as crawler: 26 | url = "https://www.nbcnews.com/business" 27 | result = await crawler.arun(url=url, bypass_cache=True) 28 | assert result.success 29 | assert result.cleaned_html 30 | assert isinstance(result.cleaned_html, str) 31 | assert len(result.cleaned_html) > 0 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_extract_media(): 36 | async with AsyncWebCrawler(verbose=True) as crawler: 37 | url = "https://www.nbcnews.com/business" 38 | result = await crawler.arun(url=url, bypass_cache=True) 39 | assert result.success 40 | assert result.media 41 | media = result.media 42 | assert isinstance(media, dict) 43 | assert "images" in media 44 | assert isinstance(media["images"], list) 45 | for image in media["images"]: 46 | assert "src" in image 47 | assert "alt" in image 48 | assert "type" in image 49 | 50 | 51 | @pytest.mark.asyncio 52 | async def test_extract_links(): 53 | async with AsyncWebCrawler(verbose=True) as crawler: 54 | url = "https://www.nbcnews.com/business" 55 | result = await crawler.arun(url=url, bypass_cache=True) 56 | assert result.success 57 | assert result.links 58 | links = result.links 59 | assert isinstance(links, dict) 60 | assert "internal" in links 61 | assert "external" in links 62 | assert isinstance(links["internal"], list) 63 | assert isinstance(links["external"], list) 64 | for link in links["internal"] + links["external"]: 65 | assert "href" in link 66 | assert "text" in link 67 | 68 | 69 | @pytest.mark.asyncio 70 | async def test_extract_metadata(): 71 | async with AsyncWebCrawler(verbose=True) as crawler: 72 | url = "https://www.nbcnews.com/business" 73 | result = await crawler.arun(url=url, bypass_cache=True) 74 | assert result.success 75 | assert result.metadata 76 | metadata = result.metadata 77 | assert isinstance(metadata, dict) 78 | assert "title" in metadata 79 | assert isinstance(metadata["title"], str) 80 | 81 | 82 | @pytest.mark.asyncio 83 | async def test_css_selector_extraction(): 84 | async with AsyncWebCrawler(verbose=True) as crawler: 85 | url = "https://www.nbcnews.com/business" 86 | css_selector = "h1, h2, 
h3" 87 | result = await crawler.arun( 88 | url=url, bypass_cache=True, css_selector=css_selector 89 | ) 90 | assert result.success 91 | assert result.markdown 92 | assert all(heading in result.markdown for heading in ["#", "##", "###"]) 93 | 94 | 95 | # Entry point for debugging 96 | if __name__ == "__main__": 97 | pytest.main([__file__, "-v"]) 98 | -------------------------------------------------------------------------------- /tests/async/test_crawler_strategy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | 5 | # Add the parent directory to the Python path 6 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | sys.path.append(parent_dir) 8 | 9 | from crawl4ai.async_webcrawler import AsyncWebCrawler 10 | 11 | 12 | @pytest.mark.asyncio 13 | async def test_custom_user_agent(): 14 | async with AsyncWebCrawler(verbose=True) as crawler: 15 | custom_user_agent = "MyCustomUserAgent/1.0" 16 | crawler.crawler_strategy.update_user_agent(custom_user_agent) 17 | url = "https://httpbin.org/user-agent" 18 | result = await crawler.arun(url=url, bypass_cache=True) 19 | assert result.success 20 | assert custom_user_agent in result.html 21 | 22 | 23 | @pytest.mark.asyncio 24 | async def test_custom_headers(): 25 | async with AsyncWebCrawler(verbose=True) as crawler: 26 | custom_headers = {"X-Test-Header": "TestValue"} 27 | crawler.crawler_strategy.set_custom_headers(custom_headers) 28 | url = "https://httpbin.org/headers" 29 | result = await crawler.arun(url=url, bypass_cache=True) 30 | assert result.success 31 | assert "X-Test-Header" in result.html 32 | assert "TestValue" in result.html 33 | 34 | 35 | @pytest.mark.asyncio 36 | async def test_javascript_execution(): 37 | async with AsyncWebCrawler(verbose=True) as crawler: 38 | js_code = "document.body.innerHTML = 'Modified by JS
';" 39 | url = "https://www.example.com" 40 | result = await crawler.arun(url=url, bypass_cache=True, js_code=js_code) 41 | assert result.success 42 | assert "Modified by JS
" in result.html 43 | 44 | 45 | @pytest.mark.asyncio 46 | async def test_hook_execution(): 47 | async with AsyncWebCrawler(verbose=True) as crawler: 48 | 49 | async def test_hook(page): 50 | await page.evaluate("document.body.style.backgroundColor = 'red';") 51 | return page 52 | 53 | crawler.crawler_strategy.set_hook("after_goto", test_hook) 54 | url = "https://www.example.com" 55 | result = await crawler.arun(url=url, bypass_cache=True) 56 | assert result.success 57 | assert "background-color: red" in result.html 58 | 59 | 60 | @pytest.mark.asyncio 61 | async def test_screenshot(): 62 | async with AsyncWebCrawler(verbose=True) as crawler: 63 | url = "https://www.example.com" 64 | result = await crawler.arun(url=url, bypass_cache=True, screenshot=True) 65 | assert result.success 66 | assert result.screenshot 67 | assert isinstance(result.screenshot, str) 68 | assert len(result.screenshot) > 0 69 | 70 | 71 | # Entry point for debugging 72 | if __name__ == "__main__": 73 | pytest.main([__file__, "-v"]) 74 | -------------------------------------------------------------------------------- /tests/async/test_database_operations.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | 5 | # Add the parent directory to the Python path 6 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | sys.path.append(parent_dir) 8 | 9 | from crawl4ai.async_webcrawler import AsyncWebCrawler 10 | 11 | 12 | @pytest.mark.asyncio 13 | async def test_cache_url(): 14 | async with AsyncWebCrawler(verbose=True) as crawler: 15 | url = "https://www.example.com" 16 | # First run to cache the URL 17 | result1 = await crawler.arun(url=url, bypass_cache=True) 18 | assert result1.success 19 | 20 | # Second run to retrieve from cache 21 | result2 = await crawler.arun(url=url, bypass_cache=False) 22 | assert result2.success 23 | assert result2.html == result1.html 24 | 25 | 26 | @pytest.mark.asyncio 27 | async def test_bypass_cache(): 28 | async with AsyncWebCrawler(verbose=True) as crawler: 29 | url = "https://www.python.org" 30 | # First run to cache the URL 31 | result1 = await crawler.arun(url=url, bypass_cache=True) 32 | assert result1.success 33 | 34 | # Second run bypassing cache 35 | result2 = await crawler.arun(url=url, bypass_cache=True) 36 | assert result2.success 37 | assert ( 38 | result2.html != result1.html 39 | ) # Content might be different due to dynamic nature of websites 40 | 41 | 42 | @pytest.mark.asyncio 43 | async def test_cache_size(): 44 | async with AsyncWebCrawler(verbose=True) as crawler: 45 | initial_size = await crawler.aget_cache_size() 46 | 47 | url = "https://www.nbcnews.com/business" 48 | await crawler.arun(url=url, bypass_cache=True) 49 | 50 | new_size = await crawler.aget_cache_size() 51 | assert new_size == initial_size + 1 52 | 53 | 54 | @pytest.mark.asyncio 55 | async def test_clear_cache(): 56 | async with AsyncWebCrawler(verbose=True) as crawler: 57 | url = "https://www.example.org" 58 | await crawler.arun(url=url, bypass_cache=True) 59 | 60 | initial_size = await crawler.aget_cache_size() 61 | assert initial_size > 0 62 | 63 | await crawler.aclear_cache() 64 | new_size = await crawler.aget_cache_size() 65 | assert new_size == 0 66 | 67 | 68 | @pytest.mark.asyncio 69 | async def test_flush_cache(): 70 | async with AsyncWebCrawler(verbose=True) as crawler: 71 | url = "https://www.example.net" 72 | await crawler.arun(url=url, bypass_cache=True) 73 | 74 | initial_size = await 
crawler.aget_cache_size() 75 | assert initial_size > 0 76 | 77 | await crawler.aflush_cache() 78 | new_size = await crawler.aget_cache_size() 79 | assert new_size == 0 80 | 81 | # Try to retrieve the previously cached URL 82 | result = await crawler.arun(url=url, bypass_cache=False) 83 | assert ( 84 | result.success 85 | ) # The crawler should still succeed, but it will fetch the content anew 86 | 87 | 88 | # Entry point for debugging 89 | if __name__ == "__main__": 90 | pytest.main([__file__, "-v"]) 91 | -------------------------------------------------------------------------------- /tests/async/test_error_handling.py: -------------------------------------------------------------------------------- 1 | # import os 2 | # import sys 3 | # import pytest 4 | # import asyncio 5 | 6 | # # Add the parent directory to the Python path 7 | # parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | # sys.path.append(parent_dir) 9 | 10 | # from crawl4ai.async_webcrawler import AsyncWebCrawler 11 | # from crawl4ai.utils import InvalidCSSSelectorError 12 | 13 | # class AsyncCrawlerWrapper: 14 | # def __init__(self): 15 | # self.crawler = None 16 | 17 | # async def setup(self): 18 | # self.crawler = AsyncWebCrawler(verbose=True) 19 | # await self.crawler.awarmup() 20 | 21 | # async def cleanup(self): 22 | # if self.crawler: 23 | # await self.crawler.aclear_cache() 24 | 25 | # @pytest.fixture(scope="module") 26 | # def crawler_wrapper(): 27 | # wrapper = AsyncCrawlerWrapper() 28 | # asyncio.get_event_loop().run_until_complete(wrapper.setup()) 29 | # yield wrapper 30 | # asyncio.get_event_loop().run_until_complete(wrapper.cleanup()) 31 | 32 | # @pytest.mark.asyncio 33 | # async def test_network_error(crawler_wrapper): 34 | # url = "https://www.nonexistentwebsite123456789.com" 35 | # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True) 36 | # assert not result.success 37 | # assert "Failed to crawl" in result.error_message 38 | 39 | # # @pytest.mark.asyncio 40 | # # async def test_timeout_error(crawler_wrapper): 41 | # # # Simulating a timeout by using a very short timeout value 42 | # # url = "https://www.nbcnews.com/business" 43 | # # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001) 44 | # # assert not result.success 45 | # # assert "timeout" in result.error_message.lower() 46 | 47 | # # @pytest.mark.asyncio 48 | # # async def test_invalid_css_selector(crawler_wrapper): 49 | # # url = "https://www.nbcnews.com/business" 50 | # # with pytest.raises(InvalidCSSSelectorError): 51 | # # await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector") 52 | 53 | # # @pytest.mark.asyncio 54 | # # async def test_js_execution_error(crawler_wrapper): 55 | # # url = "https://www.nbcnews.com/business" 56 | # # invalid_js = "This is not valid JavaScript code;" 57 | # # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js) 58 | # # assert not result.success 59 | # # assert "JavaScript" in result.error_message 60 | 61 | # # @pytest.mark.asyncio 62 | # # async def test_empty_page(crawler_wrapper): 63 | # # # Use a URL that typically returns an empty page 64 | # # url = "http://example.com/empty" 65 | # # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True) 66 | # # assert result.success # The crawl itself should succeed 67 | # # assert not result.markdown.strip() # The markdown content should be empty or just whitespace 68 | 69 | # # @pytest.mark.asyncio 70 | # # 
async def test_rate_limiting(crawler_wrapper): 71 | # # # Simulate rate limiting by making multiple rapid requests 72 | # # url = "https://www.nbcnews.com/business" 73 | # # results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)]) 74 | # # assert any(not result.success and "rate limit" in result.error_message.lower() for result in results) 75 | 76 | # # Entry point for debugging 77 | # if __name__ == "__main__": 78 | # pytest.main([__file__, "-v"]) 79 | -------------------------------------------------------------------------------- /tests/async/test_performance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import time 5 | 6 | # Add the parent directory to the Python path 7 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(parent_dir) 9 | 10 | from crawl4ai.async_webcrawler import AsyncWebCrawler 11 | 12 | 13 | @pytest.mark.asyncio 14 | async def test_crawl_speed(): 15 | async with AsyncWebCrawler(verbose=True) as crawler: 16 | url = "https://www.nbcnews.com/business" 17 | start_time = time.time() 18 | result = await crawler.arun(url=url, bypass_cache=True) 19 | end_time = time.time() 20 | 21 | assert result.success 22 | crawl_time = end_time - start_time 23 | print(f"Crawl time: {crawl_time:.2f} seconds") 24 | 25 | assert crawl_time < 10, f"Crawl took too long: {crawl_time:.2f} seconds" 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_concurrent_crawling_performance(): 30 | async with AsyncWebCrawler(verbose=True) as crawler: 31 | urls = [ 32 | "https://www.nbcnews.com/business", 33 | "https://www.example.com", 34 | "https://www.python.org", 35 | "https://www.github.com", 36 | "https://www.stackoverflow.com", 37 | ] 38 | 39 | start_time = time.time() 40 | results = await crawler.arun_many(urls=urls, bypass_cache=True) 41 | end_time = time.time() 42 | 43 | total_time = end_time - start_time 44 | print(f"Total time for concurrent crawling: {total_time:.2f} seconds") 45 | 46 | assert all(result.success for result in results) 47 | assert len(results) == len(urls) 48 | 49 | assert ( 50 | total_time < len(urls) * 5 51 | ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds" 52 | 53 | 54 | @pytest.mark.asyncio 55 | async def test_crawl_speed_with_caching(): 56 | async with AsyncWebCrawler(verbose=True) as crawler: 57 | url = "https://www.nbcnews.com/business" 58 | 59 | start_time = time.time() 60 | result1 = await crawler.arun(url=url, bypass_cache=True) 61 | end_time = time.time() 62 | first_crawl_time = end_time - start_time 63 | 64 | start_time = time.time() 65 | result2 = await crawler.arun(url=url, bypass_cache=False) 66 | end_time = time.time() 67 | second_crawl_time = end_time - start_time 68 | 69 | assert result1.success and result2.success 70 | print(f"First crawl time: {first_crawl_time:.2f} seconds") 71 | print(f"Second crawl time (cached): {second_crawl_time:.2f} seconds") 72 | 73 | assert ( 74 | second_crawl_time < first_crawl_time / 2 75 | ), "Cached crawl not significantly faster" 76 | 77 | 78 | if __name__ == "__main__": 79 | pytest.main([__file__, "-v"]) 80 | -------------------------------------------------------------------------------- /tests/browser/docker/__init__.py: -------------------------------------------------------------------------------- 1 | """Docker browser strategy tests. 
2 | 3 | This package contains tests for the Docker browser strategy implementation. 4 | """ -------------------------------------------------------------------------------- /tests/browser/test_combined.py: -------------------------------------------------------------------------------- 1 | """Combined test runner for all browser module tests. 2 | 3 | This script runs all the browser module tests in sequence and 4 | provides a comprehensive summary. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | import time 11 | 12 | # Add the project root to Python path if running directly 13 | if __name__ == "__main__": 14 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 15 | 16 | from crawl4ai.async_logger import AsyncLogger 17 | 18 | # Create a logger for clear terminal output 19 | logger = AsyncLogger(verbose=True, log_file=None) 20 | 21 | async def run_test_module(module_name, header): 22 | """Run all tests in a module and return results.""" 23 | logger.info(f"\n{'-'*30}", tag="TEST") 24 | logger.info(f"RUNNING: {header}", tag="TEST") 25 | logger.info(f"{'-'*30}", tag="TEST") 26 | 27 | # Import the module dynamically 28 | module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"]) 29 | 30 | # Track time for performance measurement 31 | start_time = time.time() 32 | 33 | # Run the tests 34 | await module.run_tests() 35 | 36 | # Calculate time taken 37 | time_taken = time.time() - start_time 38 | logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING") 39 | 40 | return time_taken 41 | 42 | async def main(): 43 | """Run all test modules.""" 44 | logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN") 45 | 46 | # List of test modules to run 47 | test_modules = [ 48 | ("test_browser_manager", "Browser Manager Tests"), 49 | ("test_playwright_strategy", "Playwright Strategy Tests"), 50 | ("test_cdp_strategy", "CDP Strategy Tests"), 51 | ("test_builtin_strategy", "Builtin Browser Strategy Tests"), 52 | ("test_profiles", "Profile Management Tests") 53 | ] 54 | 55 | # Run each test module 56 | timings = {} 57 | for module_name, header in test_modules: 58 | try: 59 | time_taken = await run_test_module(module_name, header) 60 | timings[module_name] = time_taken 61 | except Exception as e: 62 | logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR") 63 | 64 | # Print summary 65 | logger.info("\n\nTEST SUMMARY:", tag="SUMMARY") 66 | logger.info(f"{'-'*50}", tag="SUMMARY") 67 | for module_name, header in test_modules: 68 | if module_name in timings: 69 | logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY") 70 | else: 71 | logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY") 72 | logger.info(f"{'-'*50}", tag="SUMMARY") 73 | total_time = sum(timings.values()) 74 | logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY") 75 | 76 | if __name__ == "__main__": 77 | asyncio.run(main()) 78 | -------------------------------------------------------------------------------- /tests/browser/test_launch_standalone.py: -------------------------------------------------------------------------------- 1 | from crawl4ai.browser_profiler import BrowserProfiler 2 | import asyncio 3 | 4 | 5 | if __name__ == "__main__": 6 | # Test launching a standalone browser 7 | async def test_standalone_browser(): 8 | profiler = BrowserProfiler() 9 | cdp_url = await profiler.launch_standalone_browser( 10 | browser_type="chromium", 11 | user_data_dir="~/.crawl4ai/browser_profile/test-browser-data", 12 | 
debugging_port=9222, 13 | headless=False 14 | ) 15 | print(f"CDP URL: {cdp_url}") 16 | 17 | asyncio.run(test_standalone_browser()) -------------------------------------------------------------------------------- /tests/docker/test_dockerclient.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai.docker_client import Crawl4aiDockerClient 3 | from crawl4ai import ( 4 | BrowserConfig, 5 | CrawlerRunConfig 6 | ) 7 | 8 | async def main(): 9 | async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: 10 | await client.authenticate("test@example.com") 11 | 12 | # Non-streaming crawl 13 | results = await client.crawl( 14 | ["https://example.com", "https://python.org"], 15 | browser_config=BrowserConfig(headless=True), 16 | crawler_config=CrawlerRunConfig() 17 | ) 18 | print(f"Non-streaming results: {results}") 19 | 20 | # Streaming crawl 21 | crawler_config = CrawlerRunConfig(stream=True) 22 | async for result in await client.crawl( 23 | ["https://example.com", "https://python.org"], 24 | browser_config=BrowserConfig(headless=True), 25 | crawler_config=crawler_config 26 | ): 27 | print(f"Streamed result: {result}") 28 | 29 | # Get schema 30 | schema = await client.get_schema() 31 | print(f"Schema: {schema}") 32 | 33 | if __name__ == "__main__": 34 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import ( 3 | AsyncWebCrawler, 4 | CrawlerRunConfig, 5 | HTTPCrawlerConfig, 6 | CacheMode, 7 | DefaultMarkdownGenerator, 8 | PruningContentFilter 9 | ) 10 | from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy 11 | from crawl4ai.async_logger import AsyncLogger 12 | 13 | async def main(): 14 | # Initialize HTTP crawler strategy 15 | http_strategy = AsyncHTTPCrawlerStrategy( 16 | browser_config=HTTPCrawlerConfig( 17 | method="GET", 18 | verify_ssl=True, 19 | follow_redirects=True 20 | ), 21 | logger=AsyncLogger(verbose=True) 22 | ) 23 | 24 | # Initialize web crawler with HTTP strategy 25 | async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: 26 | crawler_config = CrawlerRunConfig( 27 | cache_mode=CacheMode.BYPASS, 28 | markdown_generator=DefaultMarkdownGenerator( 29 | content_filter=PruningContentFilter( 30 | threshold=0.48, 31 | threshold_type="fixed", 32 | min_word_threshold=0 33 | ) 34 | ) 35 | ) 36 | 37 | # Test different URLs 38 | urls = [ 39 | "https://example.com", 40 | "https://httpbin.org/get", 41 | "raw://Test content" 42 | ] 43 | 44 | for url in urls: 45 | print(f"\n=== Testing {url} ===") 46 | try: 47 | result = await crawler.arun(url=url, config=crawler_config) 48 | print(f"Status: {result.status_code}") 49 | print(f"Raw HTML length: {len(result.html)}") 50 | if hasattr(result, 'markdown'): 51 | print(f"Markdown length: {len(result.markdown.raw_markdown)}") 52 | except Exception as e: 53 | print(f"Error: {e}") 54 | 55 | if __name__ == "__main__": 56 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/general/test_advanced_deep_crawl.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | 5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode 6 | from crawl4ai.content_scraping_strategy import 
LXMLWebScrapingStrategy 7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy 8 | from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter 9 | from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer 10 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy 11 | 12 | 13 | async def main(): 14 | """Example deep crawl of documentation site.""" 15 | filter_chain = FilterChain([ 16 | URLPatternFilter(patterns=["*2025*"]), 17 | DomainFilter(allowed_domains=["techcrunch.com"]), 18 | ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1), 19 | ContentTypeFilter(allowed_types=["text/html","application/javascript"]) 20 | ]) 21 | config = CrawlerRunConfig( 22 | deep_crawl_strategy = BestFirstCrawlingStrategy( 23 | max_depth=2, 24 | include_external=False, 25 | filter_chain=filter_chain, 26 | url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]), 27 | ), 28 | stream=False, 29 | verbose=True, 30 | cache_mode=CacheMode.BYPASS, 31 | scraping_strategy=LXMLWebScrapingStrategy() 32 | ) 33 | 34 | async with AsyncWebCrawler() as crawler: 35 | print("Starting deep crawl in streaming mode:") 36 | config.stream = True 37 | start_time = time.perf_counter() 38 | async for result in await crawler.arun( 39 | url="https://techcrunch.com", 40 | config=config 41 | ): 42 | print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})") 43 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds") 44 | 45 | if __name__ == "__main__": 46 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/general/test_cache_context.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 3 | from playwright.async_api import Page, BrowserContext 4 | 5 | async def test_reuse_context_by_config(): 6 | # We will store each context ID in these maps to confirm reuse 7 | context_ids_for_A = [] 8 | context_ids_for_B = [] 9 | 10 | # Create a small hook to track context creation 11 | async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs): 12 | c_id = id(context) 13 | print(f"[HOOK] on_page_context_created - Context ID: {c_id}") 14 | # Distinguish which config we used by checking a custom hook param 15 | config_label = config.shared_data.get("config_label", "unknown") 16 | if config_label == "A": 17 | context_ids_for_A.append(c_id) 18 | elif config_label == "B": 19 | context_ids_for_B.append(c_id) 20 | return page 21 | 22 | # Browser config - Headless, verbose so we see logs 23 | browser_config = BrowserConfig(headless=True, verbose=True) 24 | 25 | # Two crawler run configs that differ (for example, text_mode): 26 | configA = CrawlerRunConfig( 27 | only_text=True, 28 | cache_mode=CacheMode.BYPASS, 29 | wait_until="domcontentloaded", 30 | shared_data = { 31 | "config_label" : "A" 32 | } 33 | ) 34 | configB = CrawlerRunConfig( 35 | only_text=False, 36 | cache_mode=CacheMode.BYPASS, 37 | wait_until="domcontentloaded", 38 | shared_data = { 39 | "config_label" : "B" 40 | } 41 | ) 42 | 43 | # Create the crawler 44 | crawler = AsyncWebCrawler(config=browser_config) 45 | 46 | # Attach our custom hook 47 | # Note: "on_page_context_created" will be called each time a new context+page is generated 48 | 
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) 49 | 50 | # Start the crawler (launches the browser) 51 | await crawler.start() 52 | 53 | # For demonstration, we’ll crawl a benign site multiple times with each config 54 | test_url = "https://example.com" 55 | print("\n--- Crawling with config A (text_mode=True) ---") 56 | for _ in range(2): 57 | # Pass an extra kwarg to the hook so we know which config is being used 58 | await crawler.arun(test_url, config=configA) 59 | 60 | print("\n--- Crawling with config B (text_mode=False) ---") 61 | for _ in range(2): 62 | await crawler.arun(test_url, config=configB) 63 | 64 | # Close the crawler (shuts down the browser, closes contexts) 65 | await crawler.close() 66 | 67 | # Validate and show the results 68 | print("\n=== RESULTS ===") 69 | print(f"Config A context IDs: {context_ids_for_A}") 70 | print(f"Config B context IDs: {context_ids_for_B}") 71 | if len(set(context_ids_for_A)) == 1: 72 | print("✅ All config A crawls used the SAME BrowserContext.") 73 | else: 74 | print("❌ Config A crawls created multiple contexts unexpectedly.") 75 | if len(set(context_ids_for_B)) == 1: 76 | print("✅ All config B crawls used the SAME BrowserContext.") 77 | else: 78 | print("❌ Config B crawls created multiple contexts unexpectedly.") 79 | if set(context_ids_for_A).isdisjoint(context_ids_for_B): 80 | print("✅ Config A context is different from Config B context.") 81 | else: 82 | print("❌ A and B ended up sharing the same context somehow!") 83 | 84 | if __name__ == "__main__": 85 | asyncio.run(test_reuse_context_by_config()) 86 | -------------------------------------------------------------------------------- /tests/general/test_crawlers.py: -------------------------------------------------------------------------------- 1 | 2 | # example_usage.py 3 | import asyncio 4 | from crawl4ai.crawlers import get_crawler 5 | 6 | async def main(): 7 | # Get the registered crawler 8 | example_crawler = get_crawler("example_site.content") 9 | 10 | # Crawl example.com 11 | result = await example_crawler(url="https://example.com") 12 | 13 | print(result) 14 | 15 | 16 | if __name__ == "__main__": 17 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/general/test_deep_crawl.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | 5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode 6 | from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy 7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy 8 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy 9 | 10 | 11 | async def main(): 12 | """Example deep crawl of documentation site.""" 13 | config = CrawlerRunConfig( 14 | deep_crawl_strategy = BFSDeepCrawlStrategy( 15 | max_depth=2, 16 | include_external=False 17 | ), 18 | stream=False, 19 | verbose=True, 20 | cache_mode=CacheMode.BYPASS, 21 | scraping_strategy=LXMLWebScrapingStrategy() 22 | ) 23 | 24 | async with AsyncWebCrawler() as crawler: 25 | start_time = time.perf_counter() 26 | print("\nStarting deep crawl in batch mode:") 27 | results = await crawler.arun( 28 | url="https://docs.crawl4ai.com", 29 | config=config 30 | ) 31 | print(f"Crawled {len(results)} pages") 32 | print(f"Example page: {results[0].url}") 33 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n") 34 | 35 | 
print("Starting deep crawl in streaming mode:") 36 | config.stream = True 37 | start_time = time.perf_counter() 38 | async for result in await crawler.arun( 39 | url="https://docs.crawl4ai.com", 40 | config=config 41 | ): 42 | print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})") 43 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds") 44 | 45 | if __name__ == "__main__": 46 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/general/test_llm_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 4 | from crawl4ai import LLMConfig 5 | from crawl4ai.content_filter_strategy import LLMContentFilter 6 | 7 | async def test_llm_filter(): 8 | # Create an HTML source that needs intelligent filtering 9 | url = "https://docs.python.org/3/tutorial/classes.html" 10 | 11 | browser_config = BrowserConfig( 12 | headless=True, 13 | verbose=True 14 | ) 15 | 16 | # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) 17 | run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) 18 | 19 | async with AsyncWebCrawler(config=browser_config) as crawler: 20 | # First get the raw HTML 21 | result = await crawler.arun(url, config=run_config) 22 | html = result.cleaned_html 23 | 24 | # Initialize LLM filter with focused instruction 25 | filter = LLMContentFilter( 26 | llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), 27 | instruction=""" 28 | Focus on extracting the core educational content about Python classes. 29 | Include: 30 | - Key concepts and their explanations 31 | - Important code examples 32 | - Essential technical details 33 | Exclude: 34 | - Navigation elements 35 | - Sidebars 36 | - Footer content 37 | - Version information 38 | - Any non-essential UI elements 39 | 40 | Format the output as clean markdown with proper code blocks and headers. 41 | """, 42 | verbose=True 43 | ) 44 | 45 | filter = LLMContentFilter( 46 | llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), 47 | chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192 48 | instruction=""" 49 | Extract the main educational content while preserving its original wording and substance completely. Your task is to: 50 | 51 | 1. Maintain the exact language and terminology used in the main content 52 | 2. Keep all technical explanations, examples, and educational content intact 53 | 3. Preserve the original flow and structure of the core content 54 | 4. Remove only clearly irrelevant elements like: 55 | - Navigation menus 56 | - Advertisement sections 57 | - Cookie notices 58 | - Footers with site information 59 | - Sidebars with external links 60 | - Any UI elements that don't contribute to learning 61 | 62 | The goal is to create a clean markdown version that reads exactly like the original article, 63 | keeping all valuable content but free from distracting elements. Imagine you're creating 64 | a perfect reading experience where nothing valuable is lost, but all noise is removed. 
65 | """, 66 | verbose=True 67 | ) 68 | 69 | # Apply filtering 70 | filtered_content = filter.filter_content(html, ignore_cache = True) 71 | 72 | # Show results 73 | print("\nFiltered Content Length:", len(filtered_content)) 74 | print("\nFirst 500 chars of filtered content:") 75 | if filtered_content: 76 | print(filtered_content[0][:500]) 77 | 78 | # Save the markdown version to disk 79 | with open("filtered_content.md", "w", encoding="utf-8") as f: 80 | f.write("\n".join(filtered_content)) 81 | 82 | # Show token usage 83 | filter.show_usage() 84 | 85 | if __name__ == "__main__": 86 | asyncio.run(test_llm_filter()) -------------------------------------------------------------------------------- /tests/general/test_stream.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | # append 2 parent directories to sys.path to import crawl4ai 3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 4 | sys.path.append(parent_dir) 5 | parent_parent_dir = os.path.dirname(parent_dir) 6 | sys.path.append(parent_parent_dir) 7 | 8 | import asyncio 9 | from crawl4ai import * 10 | 11 | async def test_crawler(): 12 | # Setup configurations 13 | browser_config = BrowserConfig(headless=True, verbose=False) 14 | crawler_config = CrawlerRunConfig( 15 | cache_mode=CacheMode.BYPASS, 16 | markdown_generator=DefaultMarkdownGenerator( 17 | content_filter=PruningContentFilter( 18 | threshold=0.48, 19 | threshold_type="fixed", 20 | min_word_threshold=0 21 | ) 22 | ), 23 | ) 24 | 25 | # Test URLs - mix of different sites 26 | urls = [ 27 | "http://example.com", 28 | "http://example.org", 29 | "http://example.net", 30 | ] * 10 # 30 total URLs 31 | 32 | async with AsyncWebCrawler(config=browser_config) as crawler: 33 | print("\n=== Testing Streaming Mode ===") 34 | async for result in await crawler.arun_many( 35 | urls=urls, 36 | config=crawler_config.clone(stream=True), 37 | ): 38 | print(f"Received result for: {result.url} - Success: {result.success}") 39 | 40 | print("\n=== Testing Batch Mode ===") 41 | results = await crawler.arun_many( 42 | urls=urls, 43 | config=crawler_config, 44 | ) 45 | print(f"Received all {len(results)} results at once") 46 | for result in results: 47 | print(f"Batch result for: {result.url} - Success: {result.success}") 48 | 49 | if __name__ == "__main__": 50 | asyncio.run(test_crawler()) -------------------------------------------------------------------------------- /tests/general/test_stream_dispatch.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | # append 2 parent directories to sys.path to import crawl4ai 3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 4 | sys.path.append(parent_dir) 5 | parent_parent_dir = os.path.dirname(parent_dir) 6 | sys.path.append(parent_parent_dir) 7 | 8 | 9 | import asyncio 10 | from typing import List 11 | from crawl4ai import * 12 | from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher 13 | 14 | async def test_streaming(): 15 | browser_config = BrowserConfig(headless=True, verbose=True) 16 | crawler_config = CrawlerRunConfig( 17 | cache_mode=CacheMode.BYPASS, 18 | markdown_generator=DefaultMarkdownGenerator( 19 | # content_filter=PruningContentFilter( 20 | # threshold=0.48, 21 | # threshold_type="fixed", 22 | # min_word_threshold=0 23 | # ) 24 | ), 25 | ) 26 | 27 | urls = ["http://example.com"] * 10 28 | 29 | async with AsyncWebCrawler(config=browser_config) as crawler: 30 | dispatcher = 
MemoryAdaptiveDispatcher( 31 | max_session_permit=5, 32 | check_interval=0.5 33 | ) 34 | 35 | async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config): 36 | print(f"Got result for {result.url} - Success: {result.result.success}") 37 | 38 | if __name__ == "__main__": 39 | asyncio.run(test_streaming()) -------------------------------------------------------------------------------- /tests/general/tets_robot.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import * 3 | 4 | async def test_real_websites(): 5 | print("\n=== Testing Real Website Robots.txt Compliance ===\n") 6 | 7 | browser_config = BrowserConfig(headless=True, verbose=True) 8 | async with AsyncWebCrawler(config=browser_config) as crawler: 9 | 10 | # Test cases with URLs 11 | test_cases = [ 12 | # Public sites that should be allowed 13 | ("https://example.com", True), # Simple public site 14 | ("https://httpbin.org/get", True), # API endpoint 15 | 16 | # Sites with known strict robots.txt 17 | ("https://www.facebook.com/robots.txt", False), # Social media 18 | ("https://www.google.com/search", False), # Search pages 19 | 20 | # Edge cases 21 | ("https://api.github.com", True), # API service 22 | ("https://raw.githubusercontent.com", True), # Content delivery 23 | 24 | # Non-existent/error cases 25 | ("https://thisisnotarealwebsite.com", True), # Non-existent domain 26 | ("https://localhost:12345", True), # Invalid port 27 | ] 28 | 29 | for url, expected in test_cases: 30 | print(f"\nTesting: {url}") 31 | try: 32 | config = CrawlerRunConfig( 33 | cache_mode=CacheMode.BYPASS, 34 | check_robots_txt=True, # Enable robots.txt checking 35 | verbose=True 36 | ) 37 | 38 | result = await crawler.arun(url=url, config=config) 39 | allowed = result.success and not result.error_message 40 | 41 | print(f"Expected: {'allowed' if expected else 'denied'}") 42 | print(f"Actual: {'allowed' if allowed else 'denied'}") 43 | print(f"Status Code: {result.status_code}") 44 | if result.error_message: 45 | print(f"Error: {result.error_message}") 46 | 47 | # Optional: Print robots.txt content if available 48 | if result.metadata and 'robots_txt' in result.metadata: 49 | print(f"Robots.txt rules:\n{result.metadata['robots_txt']}") 50 | 51 | except Exception as e: 52 | print(f"Test failed with error: {str(e)}") 53 | 54 | async def main(): 55 | try: 56 | await test_real_websites() 57 | except Exception as e: 58 | print(f"Test suite failed: {str(e)}") 59 | raise 60 | 61 | if __name__ == "__main__": 62 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/hub/test_simple.py: -------------------------------------------------------------------------------- 1 | # test.py 2 | from crawl4ai import CrawlerHub 3 | import json 4 | 5 | async def amazon_example(): 6 | if (crawler_cls := CrawlerHub.get("amazon_product")) : 7 | crawler = crawler_cls() 8 | print(f"Crawler version: {crawler_cls.meta['version']}") 9 | print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}") 10 | print(await crawler.run("https://amazon.com/test")) 11 | else: 12 | print("Crawler not found!") 13 | 14 | async def google_example(): 15 | # Get crawler dynamically 16 | crawler_cls = CrawlerHub.get("google_search") 17 | crawler = crawler_cls() 18 | 19 | # Text search 20 | text_results = await crawler.run( 21 | query="apple inc", 22 | search_type="text", 23 | schema_cache_path="/Users/unclecode/.crawl4ai" 24 | ) 25 | 
print(json.dumps(json.loads(text_results), indent=4)) 26 | 27 | # Image search 28 | # image_results = await crawler.run(query="apple inc", search_type="image") 29 | # print(image_results) 30 | 31 | if __name__ == "__main__": 32 | import asyncio 33 | # asyncio.run(amazon_example()) 34 | asyncio.run(google_example()) -------------------------------------------------------------------------------- /tests/loggers/test_logger.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase 3 | import os 4 | from datetime import datetime 5 | 6 | class AsyncFileLogger(AsyncLoggerBase): 7 | """ 8 | File-only asynchronous logger that writes logs to a specified file. 9 | """ 10 | 11 | def __init__(self, log_file: str): 12 | """ 13 | Initialize the file logger. 14 | 15 | Args: 16 | log_file: File path for logging 17 | """ 18 | self.log_file = log_file 19 | os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) 20 | 21 | def _write_to_file(self, level: str, message: str, tag: str): 22 | """Write a message to the log file.""" 23 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 24 | with open(self.log_file, "a", encoding="utf-8") as f: 25 | f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n") 26 | 27 | def debug(self, message: str, tag: str = "DEBUG", **kwargs): 28 | """Log a debug message to file.""" 29 | self._write_to_file("DEBUG", message, tag) 30 | 31 | def info(self, message: str, tag: str = "INFO", **kwargs): 32 | """Log an info message to file.""" 33 | self._write_to_file("INFO", message, tag) 34 | 35 | def success(self, message: str, tag: str = "SUCCESS", **kwargs): 36 | """Log a success message to file.""" 37 | self._write_to_file("SUCCESS", message, tag) 38 | 39 | def warning(self, message: str, tag: str = "WARNING", **kwargs): 40 | """Log a warning message to file.""" 41 | self._write_to_file("WARNING", message, tag) 42 | 43 | def error(self, message: str, tag: str = "ERROR", **kwargs): 44 | """Log an error message to file.""" 45 | self._write_to_file("ERROR", message, tag) 46 | 47 | def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): 48 | """Log URL fetch status to file.""" 49 | status = "SUCCESS" if success else "FAILED" 50 | message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" 51 | self._write_to_file("URL_STATUS", message, tag) 52 | 53 | def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): 54 | """Log error status to file.""" 55 | message = f"{url[:url_length]}... 
| Error: {error}" 56 | self._write_to_file("ERROR", message, tag) 57 | 58 | async def main(): 59 | browser_config = BrowserConfig(headless=True, verbose=True) 60 | crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log")) 61 | await crawler.start() 62 | 63 | try: 64 | crawl_config = CrawlerRunConfig( 65 | cache_mode=CacheMode.BYPASS, 66 | ) 67 | # Use the crawler multiple times 68 | result = await crawler.arun( 69 | url='https://kidocode.com/', 70 | config=crawl_config 71 | ) 72 | if result.success: 73 | print("First crawl - Raw Markdown Length:", len(result.markdown.raw_markdown)) 74 | 75 | finally: 76 | # Always ensure we close the crawler 77 | await crawler.close() 78 | 79 | if __name__ == "__main__": 80 | asyncio.run(main()) 81 | -------------------------------------------------------------------------------- /tests/mcp/test_mcp_sse.py: -------------------------------------------------------------------------------- 1 | from mcp.client.sse import sse_client 2 | from mcp.client.session import ClientSession 3 | 4 | async def main(): 5 | async with sse_client("http://127.0.0.1:8020/mcp") as (r, w): 6 | async with ClientSession(r, w) as sess: 7 | print(await sess.list_tools()) # now works 8 | 9 | if __name__ == "__main__": 10 | import asyncio 11 | asyncio.run(main()) 12 | -------------------------------------------------------------------------------- /tests/memory/cap_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works. 4 | """ 5 | 6 | import asyncio, httpx, json, uuid, argparse 7 | 8 | API = "http://localhost:8020/crawl" 9 | URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page 10 | CONCURRENT_CALLS = 20 # way above your cap 11 | 12 | payload_template = { 13 | "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, 14 | "crawler_config": { 15 | "type": "CrawlerRunConfig", 16 | "params": {"cache_mode": "BYPASS", "verbose": False}, 17 | } 18 | } 19 | 20 | async def one_call(client): 21 | payload = payload_template.copy() 22 | payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"] 23 | r = await client.post(API, json=payload) 24 | r.raise_for_status() 25 | return r.json()["server_peak_memory_mb"] 26 | 27 | async def main(): 28 | async with httpx.AsyncClient(timeout=60) as client: 29 | tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)] 30 | mem_usages = await asyncio.gather(*tasks) 31 | print("Calls finished OK, server peaks reported:", mem_usages) 32 | 33 | if __name__ == "__main__": 34 | asyncio.run(main()) 35 | -------------------------------------------------------------------------------- /tests/memory/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=1.5.0 2 | matplotlib>=3.5.0 3 | seaborn>=0.12.0 4 | rich>=12.0.0 -------------------------------------------------------------------------------- /tests/memory/test_docker_config_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Quick sanity‑check for /config/dump endpoint. 
4 | 5 | Usage: 6 | python test_config_dump.py [http://localhost:8020] 7 | 8 | If the server isn’t running, start it first: 9 | uvicorn deploy.docker.server:app --port 8020 10 | """ 11 | 12 | import sys, json, textwrap, requests 13 | 14 | # BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020" 15 | BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235" 16 | URL = f"{BASE.rstrip('/')}/config/dump" 17 | 18 | CASES = [ 19 | # --- CrawlRunConfig variants --- 20 | "CrawlerRunConfig()", 21 | "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)", 22 | "CrawlerRunConfig(js_only=True, wait_until='networkidle')", 23 | 24 | # --- BrowserConfig variants --- 25 | "BrowserConfig()", 26 | "BrowserConfig(headless=False, extra_args=['--disable-gpu'])", 27 | "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')", 28 | ] 29 | 30 | for code in CASES: 31 | print("\n=== POST:", code) 32 | resp = requests.post(URL, json={"code": code}, timeout=15) 33 | if resp.ok: 34 | print(json.dumps(resp.json(), indent=2)[:400] + "...") 35 | else: 36 | print("ERROR", resp.status_code, resp.text[:200]) 37 | -------------------------------------------------------------------------------- /tests/profiler/test_crteate_profile.py: -------------------------------------------------------------------------------- 1 | from crawl4ai import BrowserProfiler 2 | import asyncio 3 | 4 | 5 | if __name__ == "__main__": 6 | # Example usage 7 | profiler = BrowserProfiler() 8 | 9 | # Create a new profile 10 | import os 11 | from pathlib import Path 12 | home_dir = Path.home() 13 | profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile"))) 14 | 15 | print(f"Profile created at: {profile_path}") 16 | 17 | 18 | 19 | # # Launch a standalone browser 20 | # asyncio.run(profiler.launch_standalone_browser()) 21 | 22 | # # List profiles 23 | # profiles = profiler.list_profiles() 24 | # for profile in profiles: 25 | # print(f"Profile: {profile['name']}, Path: {profile['path']}") 26 | 27 | # # Delete a profile 28 | # success = profiler.delete_profile("my-profile") 29 | # if success: 30 | # print("Profile deleted successfully") 31 | # else: 32 | # print("Failed to delete profile") -------------------------------------------------------------------------------- /tests/test_cli_docs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from crawl4ai.docs_manager import DocsManager 3 | from click.testing import CliRunner 4 | from crawl4ai.cli import cli 5 | 6 | 7 | def test_cli(): 8 | """Test all CLI commands""" 9 | runner = CliRunner() 10 | 11 | print("\n1. Testing docs update...") 12 | # Use sync version for testing 13 | docs_manager = DocsManager() 14 | loop = asyncio.get_event_loop() 15 | loop.run_until_complete(docs_manager.fetch_docs()) 16 | 17 | # print("\n2. Testing listing...") 18 | # result = runner.invoke(cli, ['docs', 'list']) 19 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") 20 | # print(result.output) 21 | 22 | # print("\n2. Testing index building...") 23 | # result = runner.invoke(cli, ['docs', 'index']) 24 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") 25 | # print(f"Output: {result.output}") 26 | 27 | # print("\n3. 
Testing search...") 28 | # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index']) 29 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") 30 | # print(f"First 200 chars: {result.output[:200]}...") 31 | 32 | # print("\n4. Testing combine with sections...") 33 | # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended']) 34 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") 35 | # print(f"First 200 chars: {result.output[:200]}...") 36 | 37 | print("\n5. Testing combine all sections...") 38 | result = runner.invoke(cli, ["docs", "combine", "--mode", "condensed"]) 39 | print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") 40 | print(f"First 200 chars: {result.output[:200]}...") 41 | 42 | 43 | if __name__ == "__main__": 44 | test_cli() 45 | -------------------------------------------------------------------------------- /tests/test_llmtxt.py: -------------------------------------------------------------------------------- 1 | from crawl4ai.llmtxt import AsyncLLMTextManager # Changed to AsyncLLMTextManager 2 | from crawl4ai.async_logger import AsyncLogger 3 | from pathlib import Path 4 | import asyncio 5 | 6 | 7 | async def main(): 8 | current_file = Path(__file__).resolve() 9 | # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs" 10 | base_dir = current_file.parent.parent / "local/_docs/llm.txt" 11 | docs_dir = base_dir 12 | 13 | # Create directory if it doesn't exist 14 | docs_dir.mkdir(parents=True, exist_ok=True) 15 | 16 | # Initialize logger 17 | logger = AsyncLogger() 18 | # Updated initialization with default batching params 19 | # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2) 20 | manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2) 21 | 22 | # Let's first check what files we have 23 | print("\nAvailable files:") 24 | for f in docs_dir.glob("*.md"): 25 | print(f"- {f.name}") 26 | 27 | # Generate index files 28 | print("\nGenerating index files...") 29 | await manager.generate_index_files( 30 | force_generate_facts=False, clear_bm25_cache=False 31 | ) 32 | 33 | # Test some relevant queries about Crawl4AI 34 | test_queries = [ 35 | "How is using the `arun_many` method?", 36 | ] 37 | 38 | print("\nTesting search functionality:") 39 | for query in test_queries: 40 | print(f"\nQuery: {query}") 41 | results = manager.search(query, top_k=2) 42 | print(f"Results length: {len(results)} characters") 43 | if results: 44 | print( 45 | "First 200 chars of results:", results[:200].replace("\n", " "), "..." 
46 | ) 47 | else: 48 | print("No results found") 49 | 50 | 51 | if __name__ == "__main__": 52 | asyncio.run(main()) 53 | -------------------------------------------------------------------------------- /tests/test_scraping_strategy.py: -------------------------------------------------------------------------------- 1 | import nest_asyncio 2 | 3 | nest_asyncio.apply() 4 | 5 | import asyncio 6 | from crawl4ai import ( 7 | AsyncWebCrawler, 8 | CrawlerRunConfig, 9 | LXMLWebScrapingStrategy, 10 | CacheMode, 11 | ) 12 | 13 | 14 | async def main(): 15 | config = CrawlerRunConfig( 16 | cache_mode=CacheMode.BYPASS, 17 | scraping_strategy=LXMLWebScrapingStrategy(), # Faster alternative to default BeautifulSoup 18 | ) 19 | async with AsyncWebCrawler() as crawler: 20 | result = await crawler.arun(url="https://example.com", config=config) 21 | print(f"Success: {result.success}") 22 | print(f"Markdown length: {len(result.markdown.raw_markdown)}") 23 | 24 | 25 | if __name__ == "__main__": 26 | asyncio.run(main()) 27 | --------------------------------------------------------------------------------