├── .claude
    └── settings.local.json
├── .env.txt
├── .gitattributes
├── .github
    ├── DISCUSSION_TEMPLATE
    │   └── feature-requests.yml
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.yml
    │   └── config.yml
    ├── pull_request_template.md
    └── workflows
    │   ├── docker-release.yml
    │   ├── docs
    │       ├── ARCHITECTURE.md
    │       ├── README.md
    │       └── WORKFLOW_REFERENCE.md
    │   ├── main.yml
    │   ├── release.yml
    │   ├── release.yml.backup
    │   └── test-release.yml.disabled
├── .gitignore
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTORS.md
├── Dockerfile
├── JOURNAL.md
├── LICENSE
├── MANIFEST.in
├── MISSION.md
├── PROGRESSIVE_CRAWLING.md
├── README-first.md
├── README.md
├── ROADMAP.md
├── SPONSORS.md
├── cliff.toml
├── crawl4ai
    ├── __init__.py
    ├── __version__.py
    ├── adaptive_crawler copy.py
    ├── adaptive_crawler.py
    ├── async_configs.py
    ├── async_crawler_strategy.back.py
    ├── async_crawler_strategy.py
    ├── async_database.py
    ├── async_dispatcher.py
    ├── async_logger.py
    ├── async_url_seeder.py
    ├── async_webcrawler.py
    ├── browser_adapter.py
    ├── browser_manager.py
    ├── browser_profiler.py
    ├── cache_context.py
    ├── chunking_strategy.py
    ├── cli.py
    ├── components
    │   └── crawler_monitor.py
    ├── config.py
    ├── content_filter_strategy.py
    ├── content_scraping_strategy.py
    ├── crawlers
    │   ├── __init__.py
    │   ├── amazon_product
    │   │   ├── __init__.py
    │   │   └── crawler.py
    │   └── google_search
    │   │   ├── __init__.py
    │   │   ├── crawler.py
    │   │   └── script.js
    ├── deep_crawling
    │   ├── __init__.py
    │   ├── base_strategy.py
    │   ├── bff_strategy.py
    │   ├── bfs_strategy.py
    │   ├── crazy.py
    │   ├── dfs_strategy.py
    │   ├── filters.py
    │   └── scorers.py
    ├── docker_client.py
    ├── extraction_strategy.py
    ├── html2text
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── _typing.py
    │   ├── cli.py
    │   ├── config.py
    │   ├── elements.py
    │   └── utils.py
    ├── hub.py
    ├── install.py
    ├── js_snippet
    │   ├── __init__.py
    │   ├── navigator_overrider.js
    │   ├── remove_overlay_elements.js
    │   └── update_image_dimensions.js
    ├── legacy
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── crawler_strategy.py
    │   ├── database.py
    │   ├── docs_manager.py
    │   ├── llmtxt.py
    │   ├── version_manager.py
    │   └── web_crawler.py
    ├── link_preview.py
    ├── markdown_generation_strategy.py
    ├── migrations.py
    ├── model_loader.py
    ├── models.py
    ├── processors
    │   └── pdf
    │   │   ├── __init__.py
    │   │   ├── processor.py
    │   │   └── utils.py
    ├── prompts.py
    ├── proxy_strategy.py
    ├── script
    │   ├── __init__.py
    │   ├── c4a_compile.py
    │   ├── c4a_result.py
    │   └── c4ai_script.py
    ├── ssl_certificate.py
    ├── table_extraction.py
    ├── types.py
    ├── user_agent_generator.py
    └── utils.py
├── deploy
    └── docker
    │   ├── .dockerignore
    │   ├── .llm.env.example
    │   ├── README.md
    │   ├── WEBHOOK_EXAMPLES.md
    │   ├── api.py
    │   ├── auth.py
    │   ├── c4ai-code-context.md
    │   ├── c4ai-doc-context.md
    │   ├── config.yml
    │   ├── crawler_pool.py
    │   ├── hook_manager.py
    │   ├── job.py
    │   ├── mcp_bridge.py
    │   ├── requirements.txt
    │   ├── schemas.py
    │   ├── server.py
    │   ├── static
    │       └── playground
    │       │   └── index.html
    │   ├── supervisord.conf
    │   ├── utils.py
    │   └── webhook.py
├── docker-compose.yml
├── docs
    ├── apps
    │   ├── iseeyou
    │   │   └── llms-full.txt
    │   └── linkdin
    │   │   ├── Crawl4ai_Linkedin_Data_Discovery_Part_1.ipynb
    │   │   ├── Crawl4ai_Linkedin_Data_Discovery_Part_2.ipynb
    │   │   ├── README.md
    │   │   ├── c4ai_discover.py
    │   │   ├── c4ai_insights.py
    │   │   ├── samples
    │   │       ├── companies.jsonl
    │   │       └── people.jsonl
    │   │   ├── schemas
    │   │       ├── company_card.json
    │   │       └── people_card.json
    │   │   ├── snippets
    │   │       ├── company.html
    │   │       └── people.html
    │   │   └── templates
    │   │       ├── ai.js
    │   │       └── graph_view_template.html
    ├── assets
    │   ├── pitch-dark.png
    │   ├── pitch-dark.svg
    │   ├── powered-by-dark.svg
    │   ├── powered-by-disco.svg
    │   ├── powered-by-light.svg
    │   └── powered-by-night.svg
    ├── blog
    │   ├── release-v0.7.0.md
    │   ├── release-v0.7.1.md
    │   ├── release-v0.7.3.md
    │   ├── release-v0.7.4.md
    │   ├── release-v0.7.5.md
    │   └── release-v0.7.6.md
    ├── codebase
    │   ├── browser.md
    │   └── cli.md
    ├── deprecated
    │   └── docker-deployment.md
    ├── examples
    │   ├── README_BUILTIN_BROWSER.md
    │   ├── adaptive_crawling
    │   │   ├── README.md
    │   │   ├── advanced_configuration.py
    │   │   ├── basic_usage.py
    │   │   ├── custom_strategies.py
    │   │   ├── embedding_configuration.py
    │   │   ├── embedding_strategy.py
    │   │   ├── embedding_vs_statistical.py
    │   │   ├── export_import_kb.py
    │   │   └── llm_config_example.py
    │   ├── amazon_product_extraction_direct_url.py
    │   ├── amazon_product_extraction_using_hooks.py
    │   ├── amazon_product_extraction_using_use_javascript.py
    │   ├── arun_vs_arun_many.py
    │   ├── assets
    │   │   ├── audio.mp3
    │   │   ├── basic.png
    │   │   ├── cosine_extraction.png
    │   │   ├── css_js.png
    │   │   ├── css_selector.png
    │   │   ├── exec_script.png
    │   │   ├── instagram_grid_result.png
    │   │   ├── llm_extraction.png
    │   │   ├── semantic_extraction_cosine.png
    │   │   ├── semantic_extraction_llm.png
    │   │   ├── virtual_scroll_append_only.html
    │   │   ├── virtual_scroll_instagram_grid.html
    │   │   ├── virtual_scroll_news_feed.html
    │   │   └── virtual_scroll_twitter_like.html
    │   ├── async_webcrawler_multiple_urls_example.py
    │   ├── browser_optimization_example.py
    │   ├── builtin_browser_example.py
    │   ├── c4a_script
    │   │   ├── amazon_example
    │   │   │   ├── README.md
    │   │   │   ├── amazon_r2d2_search.py
    │   │   │   ├── extracted_products.json
    │   │   │   ├── generated_product_schema.json
    │   │   │   ├── generated_search_script.js
    │   │   │   ├── header.html
    │   │   │   └── product.html
    │   │   ├── api_usage_examples.py
    │   │   ├── c4a_script_hello_world.py
    │   │   ├── c4a_script_hello_world_error.py
    │   │   ├── demo_c4a_crawl4ai.py
    │   │   ├── generate_script_hello_world.py
    │   │   ├── github_search
    │   │   │   ├── extracted_repositories.json
    │   │   │   ├── generated_result_schema.json
    │   │   │   ├── generated_search_script.js
    │   │   │   ├── github_search_crawler.py
    │   │   │   ├── result.html
    │   │   │   └── search_form.html
    │   │   ├── script_samples
    │   │   │   ├── add_to_cart.c4a
    │   │   │   ├── advanced_control_flow.c4a
    │   │   │   ├── conditional_login.c4a
    │   │   │   ├── data_extraction.c4a
    │   │   │   ├── fill_contact.c4a
    │   │   │   ├── load_more_content.c4a
    │   │   │   ├── login_flow.c4a
    │   │   │   ├── multi_step_workflow.c4a
    │   │   │   ├── navigate_tabs.c4a
    │   │   │   ├── quick_login.c4a
    │   │   │   ├── responsive_actions.c4a
    │   │   │   ├── scroll_and_click.c4a
    │   │   │   ├── search_product.c4a
    │   │   │   ├── simple_form.c4a
    │   │   │   └── smart_form_fill.c4a
    │   │   └── tutorial
    │   │   │   ├── README.md
    │   │   │   ├── assets
    │   │   │       ├── DankMono-Bold.woff2
    │   │   │       ├── DankMono-Italic.woff2
    │   │   │       ├── DankMono-Regular.woff2
    │   │   │       ├── app.css
    │   │   │       ├── app.js
    │   │   │       ├── blockly-manager.js
    │   │   │       ├── blockly-theme.css
    │   │   │       ├── c4a-blocks.js
    │   │   │       ├── c4a-generator.js
    │   │   │       └── styles.css
    │   │   │   ├── blockly-demo.c4a
    │   │   │   ├── index.html
    │   │   │   ├── playground
    │   │   │       ├── app.js
    │   │   │       ├── index.html
    │   │   │       └── styles.css
    │   │   │   ├── requirements.txt
    │   │   │   ├── scripts
    │   │   │       ├── 01-basic-interaction.c4a
    │   │   │       ├── 02-login-flow.c4a
    │   │   │       ├── 03-infinite-scroll.c4a
    │   │   │       ├── 04-multi-step-form.c4a
    │   │   │       └── 05-complex-workflow.c4a
    │   │   │   ├── server.py
    │   │   │   └── test_blockly.html
    │   ├── chainlit.md
    │   ├── cli
    │   │   ├── browser.yml
    │   │   ├── crawler.yml
    │   │   ├── css_schema.json
    │   │   ├── extract.yml
    │   │   ├── extract_css.yml
    │   │   └── llm_schema.json
    │   ├── crawlai_vs_firecrawl.py
    │   ├── crawler_monitor_example.py
    │   ├── crypto_analysis_example.py
    │   ├── deepcrawl_example.py
    │   ├── demo_multi_config_clean.py
    │   ├── dispatcher_example.py
    │   ├── docker
    │   │   ├── demo_docker_api.py
    │   │   └── demo_docker_polling.py
    │   ├── docker_client_hooks_example.py
    │   ├── docker_config_obj.py
    │   ├── docker_example.py
    │   ├── docker_hooks_examples.py
    │   ├── docker_python_rest_api.py
    │   ├── docker_python_sdk.py
    │   ├── docker_webhook_example.py
    │   ├── extraction_strategies_examples.py
    │   ├── full_page_screenshot_and_pdf_export.md
    │   ├── hello_world.py
    │   ├── hello_world_undetected.py
    │   ├── hooks_example.py
    │   ├── identity_based_browsing.py
    │   ├── language_support_example.py
    │   ├── link_head_extraction_example.py
    │   ├── llm_extraction_openai_pricing.py
    │   ├── llm_markdown_generator.py
    │   ├── llm_table_extraction_example.py
    │   ├── markdown
    │   │   ├── content_source_example.py
    │   │   └── content_source_short_example.py
    │   ├── network_console_capture_example.py
    │   ├── proxy_rotation_demo.py
    │   ├── quickstart.ipynb
    │   ├── quickstart.py
    │   ├── quickstart_examples_set_1.py
    │   ├── quickstart_examples_set_2.py
    │   ├── regex_extraction_quickstart.py
    │   ├── research_assistant.py
    │   ├── rest_call.py
    │   ├── sample_ecommerce.html
    │   ├── scraping_strategies_performance.py
    │   ├── serp_api_project_11_feb.py
    │   ├── session_id_example.py
    │   ├── simple_anti_bot_examples.py
    │   ├── ssl_example.py
    │   ├── stealth_mode_example.py
    │   ├── stealth_mode_quick_start.py
    │   ├── stealth_test_simple.py
    │   ├── storage_state_tutorial.md
    │   ├── summarize_page.py
    │   ├── table_extraction_example.py
    │   ├── tutorial_dynamic_clicks.md
    │   ├── tutorial_v0.5.py
    │   ├── undetectability
    │   │   ├── undetected_basic_test.py
    │   │   ├── undetected_bot_test.py
    │   │   ├── undetected_cloudflare_test.py
    │   │   └── undetected_vs_regular_comparison.py
    │   ├── undetected_simple_demo.py
    │   ├── url_seeder
    │   │   ├── Crawl4AI_URL_Seeder_Tutorial.ipynb
    │   │   ├── bbc_sport_research_assistant.py
    │   │   ├── convert_tutorial_to_colab.py
    │   │   ├── tutorial_url_seeder.md
    │   │   ├── url_seeder_demo.py
    │   │   └── url_seeder_quick_demo.py
    │   ├── use_geo_location.py
    │   ├── virtual_scroll_example.py
    │   └── website-to-api
    │   │   ├── .gitignore
    │   │   ├── README.md
    │   │   ├── api_server.py
    │   │   ├── app.py
    │   │   ├── assets
    │   │       └── crawl4ai_logo.jpg
    │   │   ├── requirements.txt
    │   │   ├── static
    │   │       ├── index.html
    │   │       ├── script.js
    │   │       └── styles.css
    │   │   ├── test_api.py
    │   │   ├── test_models.py
    │   │   └── web_scraper_lib.py
    ├── md_v2
    │   ├── advanced
    │   │   ├── adaptive-strategies.md
    │   │   ├── advanced-features.md
    │   │   ├── crawl-dispatcher.md
    │   │   ├── file-downloading.md
    │   │   ├── hooks-auth.md
    │   │   ├── identity-based-crawling.md
    │   │   ├── lazy-loading.md
    │   │   ├── multi-url-crawling.md
    │   │   ├── network-console-capture.md
    │   │   ├── pdf-parsing.md
    │   │   ├── proxy-security.md
    │   │   ├── session-management.md
    │   │   ├── ssl-certificate.md
    │   │   ├── undetected-browser.md
    │   │   └── virtual-scroll.md
    │   ├── api
    │   │   ├── adaptive-crawler.md
    │   │   ├── arun.md
    │   │   ├── arun_many.md
    │   │   ├── async-webcrawler.md
    │   │   ├── c4a-script-reference.md
    │   │   ├── crawl-result.md
    │   │   ├── digest.md
    │   │   ├── parameters.md
    │   │   └── strategies.md
    │   ├── apps
    │   │   ├── assets
    │   │   │   ├── DankMono-Bold.woff2
    │   │   │   ├── DankMono-Italic.woff2
    │   │   │   └── DankMono-Regular.woff2
    │   │   ├── c4a-script
    │   │   │   ├── README.md
    │   │   │   ├── assets
    │   │   │   │   ├── DankMono-Bold.woff2
    │   │   │   │   ├── DankMono-Italic.woff2
    │   │   │   │   ├── DankMono-Regular.woff2
    │   │   │   │   ├── app.css
    │   │   │   │   ├── app.js
    │   │   │   │   ├── blockly-manager.js
    │   │   │   │   ├── blockly-theme.css
    │   │   │   │   ├── c4a-blocks.js
    │   │   │   │   ├── c4a-generator.js
    │   │   │   │   └── styles.css
    │   │   │   ├── blockly-demo.c4a
    │   │   │   ├── index.html
    │   │   │   ├── playground
    │   │   │   │   ├── app.js
    │   │   │   │   ├── index.html
    │   │   │   │   └── styles.css
    │   │   │   ├── requirements.txt
    │   │   │   ├── scripts
    │   │   │   │   ├── 01-basic-interaction.c4a
    │   │   │   │   ├── 02-login-flow.c4a
    │   │   │   │   ├── 03-infinite-scroll.c4a
    │   │   │   │   ├── 04-multi-step-form.c4a
    │   │   │   │   └── 05-complex-workflow.c4a
    │   │   │   ├── server.py
    │   │   │   └── test_blockly.html
    │   │   ├── crawl4ai-assistant
    │   │   │   ├── README.md
    │   │   │   ├── assets
    │   │   │   │   ├── DankMono-Bold.woff2
    │   │   │   │   ├── DankMono-Italic.woff2
    │   │   │   │   └── DankMono-Regular.woff2
    │   │   │   ├── assistant.css
    │   │   │   ├── background
    │   │   │   │   └── service-worker.js
    │   │   │   ├── content
    │   │   │   │   ├── click2crawl.js
    │   │   │   │   ├── content.js
    │   │   │   │   ├── contentAnalyzer.js
    │   │   │   │   ├── markdownConverter.js
    │   │   │   │   ├── markdownExtraction.js
    │   │   │   │   ├── markdownPreviewModal.js
    │   │   │   │   ├── overlay.css
    │   │   │   │   ├── scriptBuilder.js
    │   │   │   │   └── shared
    │   │   │   │   │   └── utils.js
    │   │   │   ├── crawl4ai-assistant-v1.2.1.zip
    │   │   │   ├── crawl4ai-assistant-v1.3.0.zip
    │   │   │   ├── icons
    │   │   │   │   ├── favicon.ico
    │   │   │   │   ├── icon-128.png
    │   │   │   │   ├── icon-16.png
    │   │   │   │   └── icon-48.png
    │   │   │   ├── index.html
    │   │   │   ├── libs
    │   │   │   │   └── marked.min.js
    │   │   │   ├── manifest.json
    │   │   │   └── popup
    │   │   │   │   ├── icons
    │   │   │   │       ├── favicon.ico
    │   │   │   │       ├── icon-128.png
    │   │   │   │       ├── icon-16.png
    │   │   │   │       └── icon-48.png
    │   │   │   │   ├── popup.css
    │   │   │   │   ├── popup.html
    │   │   │   │   └── popup.js
    │   │   ├── index.md
    │   │   └── llmtxt
    │   │   │   ├── build.md
    │   │   │   ├── index.html
    │   │   │   ├── llmtxt.css
    │   │   │   ├── llmtxt.js
    │   │   │   └── why.md
    │   ├── ask_ai
    │   │   ├── ask-ai.css
    │   │   ├── ask-ai.js
    │   │   └── index.html
    │   ├── assets
    │   │   ├── DankMono-Bold.woff2
    │   │   ├── DankMono-Italic.woff2
    │   │   ├── DankMono-Regular.woff2
    │   │   ├── Monaco.woff
    │   │   ├── copy_code.js
    │   │   ├── crawl4ai-skill.zip
    │   │   ├── dmvendor.css
    │   │   ├── docs.zip
    │   │   ├── feedback-overrides.css
    │   │   ├── floating_ask_ai_button.js
    │   │   ├── github_stats.js
    │   │   ├── gtag.js
    │   │   ├── highlight.css
    │   │   ├── highlight.min.js
    │   │   ├── highlight_init.js
    │   │   ├── images
    │   │   │   ├── dispatcher.png
    │   │   │   └── logo.png
    │   │   ├── layout.css
    │   │   ├── llm.txt
    │   │   │   ├── diagrams
    │   │   │   │   ├── cli.txt
    │   │   │   │   ├── config_objects.txt
    │   │   │   │   ├── deep_crawl_advanced_filters_scorers.txt
    │   │   │   │   ├── deep_crawling.txt
    │   │   │   │   ├── docker.txt
    │   │   │   │   ├── extraction-llm.txt
    │   │   │   │   ├── extraction-no-llm.txt
    │   │   │   │   ├── http_based_crawler_strategy.txt
    │   │   │   │   ├── installation.txt
    │   │   │   │   ├── llms-diagram.txt
    │   │   │   │   ├── multi_urls_crawling.txt
    │   │   │   │   ├── simple_crawling.txt
    │   │   │   │   └── url_seeder.txt
    │   │   │   └── txt
    │   │   │   │   ├── cli.txt
    │   │   │   │   ├── config_objects.txt
    │   │   │   │   ├── deep_crawl_advanced_filters_scorers.txt
    │   │   │   │   ├── deep_crawling.txt
    │   │   │   │   ├── docker.txt
    │   │   │   │   ├── extraction-llm.txt
    │   │   │   │   ├── extraction-no-llm.txt
    │   │   │   │   ├── http_based_crawler_strategy.txt
    │   │   │   │   ├── installation.txt
    │   │   │   │   ├── llms-full-v0.1.1.txt
    │   │   │   │   ├── llms-full.txt
    │   │   │   │   ├── multi_urls_crawling.txt
    │   │   │   │   ├── simple_crawling.txt
    │   │   │   │   └── url_seeder.txt
    │   │   ├── mobile_menu.js
    │   │   ├── page_actions.css
    │   │   ├── page_actions.js
    │   │   ├── selection_ask_ai.js
    │   │   ├── styles.css
    │   │   ├── test
    │   │   │   └── toc.js
    │   │   └── toc.js
    │   ├── basic
    │   │   └── installation.md
    │   ├── blog
    │   │   ├── articles
    │   │   │   ├── adaptive-crawling-revolution.md
    │   │   │   ├── dockerize_hooks.md
    │   │   │   ├── llm-context-revolution.md
    │   │   │   └── virtual-scroll-revolution.md
    │   │   ├── index.md
    │   │   ├── index.md.bak
    │   │   └── releases
    │   │   │   ├── 0.4.0.md
    │   │   │   ├── 0.4.1.md
    │   │   │   ├── 0.4.2.md
    │   │   │   ├── 0.5.0.md
    │   │   │   ├── 0.6.0.md
    │   │   │   ├── 0.7.0.md
    │   │   │   ├── 0.7.1.md
    │   │   │   ├── 0.7.2.md
    │   │   │   ├── 0.7.3.md
    │   │   │   ├── 0.7.6.md
    │   │   │   ├── v0.4.3b1.md
    │   │   │   └── v0.7.5.md
    │   ├── branding
    │   │   └── index.md
    │   ├── complete-sdk-reference.md
    │   ├── core
    │   │   ├── adaptive-crawling.md
    │   │   ├── ask-ai.md
    │   │   ├── browser-crawler-config.md
    │   │   ├── c4a-script.md
    │   │   ├── cache-modes.md
    │   │   ├── cli.md
    │   │   ├── content-selection.md
    │   │   ├── crawler-result.md
    │   │   ├── deep-crawling.md
    │   │   ├── docker-deployment.md
    │   │   ├── examples.md
    │   │   ├── fit-markdown.md
    │   │   ├── installation.md
    │   │   ├── link-media.md
    │   │   ├── llmtxt.md
    │   │   ├── local-files.md
    │   │   ├── markdown-generation.md
    │   │   ├── page-interaction.md
    │   │   ├── quickstart.md
    │   │   ├── simple-crawling.md
    │   │   ├── table_extraction.md
    │   │   └── url-seeding.md
    │   ├── extraction
    │   │   ├── chunking.md
    │   │   ├── clustring-strategies.md
    │   │   ├── llm-strategies.md
    │   │   └── no-llm-strategies.md
    │   ├── favicon.ico
    │   ├── img
    │   │   ├── favicon-32x32.png
    │   │   ├── favicon-x-32x32.png
    │   │   └── favicon.ico
    │   ├── index.md
    │   ├── marketplace
    │   │   ├── README.md
    │   │   ├── admin
    │   │   │   ├── admin.css
    │   │   │   ├── admin.js
    │   │   │   └── index.html
    │   │   ├── app-detail.css
    │   │   ├── app-detail.html
    │   │   ├── app-detail.js
    │   │   ├── backend
    │   │   │   ├── .env.example
    │   │   │   ├── config.py
    │   │   │   ├── database.py
    │   │   │   ├── dummy_data.py
    │   │   │   ├── requirements.txt
    │   │   │   ├── schema.yaml
    │   │   │   ├── server.py
    │   │   │   └── uploads
    │   │   │   │   └── .gitignore
    │   │   ├── frontend
    │   │   │   ├── app-detail.css
    │   │   │   ├── app-detail.html
    │   │   │   ├── app-detail.js
    │   │   │   ├── index.html
    │   │   │   ├── marketplace.css
    │   │   │   └── marketplace.js
    │   │   ├── index.html
    │   │   ├── marketplace.css
    │   │   └── marketplace.js
    │   ├── migration
    │   │   ├── table_extraction_v073.md
    │   │   └── webscraping-strategy-migration.md
    │   └── overrides
    │   │   └── main.html
    ├── releases_review
    │   ├── Crawl4AI_v0.3.72_Release_Announcement.ipynb
    │   ├── crawl4ai_v0_7_0_showcase.py
    │   ├── demo_v0.7.0.py
    │   ├── demo_v0.7.5.py
    │   ├── demo_v0.7.6.py
    │   ├── v0.3.74.overview.py
    │   ├── v0.7.5_docker_hooks_demo.py
    │   ├── v0.7.5_video_walkthrough.ipynb
    │   ├── v0_4_24_walkthrough.py
    │   ├── v0_4_3b2_features_demo.py
    │   └── v0_7_0_features_demo.py
    ├── snippets
    │   └── deep_crawl
    │   │   ├── 1.intro.py
    │   │   └── 2.filters.py
    └── tutorials
    │   └── coming_soon.md
├── mkdocs.yml
├── prompts
    └── prompt_net_requests.md
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
├── test_llm_webhook_feature.py
├── test_webhook_implementation.py
├── tests
    ├── WEBHOOK_TEST_README.md
    ├── __init__.py
    ├── adaptive
    │   ├── compare_performance.py
    │   ├── test_adaptive_crawler.py
    │   ├── test_confidence_debug.py
    │   ├── test_embedding_performance.py
    │   ├── test_embedding_strategy.py
    │   └── test_llm_embedding.py
    ├── async
    │   ├── sample_wikipedia.html
    │   ├── test_0.4.2_browser_manager.py
    │   ├── test_0.4.2_config_params.py
    │   ├── test_async_doanloader.py
    │   ├── test_basic_crawling.py
    │   ├── test_caching.py
    │   ├── test_chunking_and_extraction_strategies.py
    │   ├── test_content_extraction.py
    │   ├── test_content_filter_bm25.py
    │   ├── test_content_filter_prune.py
    │   ├── test_content_scraper_strategy.py
    │   ├── test_crawler_strategy.py
    │   ├── test_database_operations.py
    │   ├── test_dispatchers.py
    │   ├── test_edge_cases.py
    │   ├── test_error_handling.py
    │   ├── test_evaluation_scraping_methods_performance.configs.py
    │   ├── test_markdown_genertor.py
    │   ├── test_parameters_and_options.py
    │   ├── test_performance.py
    │   └── test_screenshot.py
    ├── async_assistant
    │   ├── test_extract_pipeline.py
    │   └── test_extract_pipeline_v2.py
    ├── browser
    │   ├── docker
    │   │   ├── __init__.py
    │   │   └── test_docker_browser.py
    │   ├── manager
    │   │   └── demo_browser_manager.py
    │   ├── test_browser_manager.py
    │   ├── test_builtin_browser.py
    │   ├── test_builtin_strategy.py
    │   ├── test_cdp_strategy.py
    │   ├── test_combined.py
    │   ├── test_launch_standalone.py
    │   ├── test_parallel_crawling.py
    │   ├── test_playwright_strategy.py
    │   └── test_profiles.py
    ├── check_dependencies.py
    ├── cli
    │   └── test_cli.py
    ├── deep_crwaling
    │   └── test_filter.py
    ├── docker
    │   ├── simple_api_test.py
    │   ├── test_config_object.py
    │   ├── test_docker.py
    │   ├── test_dockerclient.py
    │   ├── test_filter_deep_crawl.py
    │   ├── test_hooks_client.py
    │   ├── test_hooks_comprehensive.py
    │   ├── test_hooks_utility.py
    │   ├── test_llm_params.py
    │   ├── test_rest_api_deep_crawl.py
    │   ├── test_serialization.py
    │   ├── test_server.py
    │   ├── test_server_requests.py
    │   └── test_server_token.py
    ├── docker_example.py
    ├── general
    │   ├── generate_dummy_site.py
    │   ├── test_acyn_crawl_wuth_http_crawler_strategy.py
    │   ├── test_advanced_deep_crawl.py
    │   ├── test_async_crawler_strategy.py
    │   ├── test_async_markdown_generator.py
    │   ├── test_async_url_seeder_bm25.py
    │   ├── test_async_webcrawler.py
    │   ├── test_bff_scoring.py
    │   ├── test_cache_context.py
    │   ├── test_content_source_parameter.py
    │   ├── test_crawlers.py
    │   ├── test_deep_crawl.py
    │   ├── test_deep_crawl_filters.py
    │   ├── test_deep_crawl_scorers.py
    │   ├── test_download_file.py
    │   ├── test_http_crawler_strategy.py
    │   ├── test_llm_filter.py
    │   ├── test_max_scroll.py
    │   ├── test_mhtml.py
    │   ├── test_network_console_capture.py
    │   ├── test_persistent_context.py
    │   ├── test_robot_parser.py
    │   ├── test_schema_builder.py
    │   ├── test_stream.py
    │   ├── test_stream_dispatch.py
    │   ├── test_url_pattern.py
    │   └── tets_robot.py
    ├── hub
    │   └── test_simple.py
    ├── loggers
    │   └── test_logger.py
    ├── mcp
    │   ├── test_mcp_socket.py
    │   └── test_mcp_sse.py
    ├── memory
    │   ├── README.md
    │   ├── benchmark_report.py
    │   ├── cap_test.py
    │   ├── requirements.txt
    │   ├── run_benchmark.py
    │   ├── test_crawler_monitor.py
    │   ├── test_dispatcher_stress.py
    │   ├── test_docker_config_gen.py
    │   ├── test_stress_api.py
    │   ├── test_stress_api_xs.py
    │   ├── test_stress_docker_api.py
    │   └── test_stress_sdk.py
    ├── profiler
    │   ├── test_create_profile.py
    │   └── test_keyboard_handle.py
    ├── proxy
    │   ├── test_proxy_config.py
    │   └── test_proxy_deprecation.py
    ├── releases
    │   ├── test_release_0.6.4.py
    │   └── test_release_0.7.0.py
    ├── test_arun_many.py
    ├── test_cli_docs.py
    ├── test_config_matching_only.py
    ├── test_config_selection.py
    ├── test_docker.py
    ├── test_docker_api_with_llm_provider.py
    ├── test_link_extractor.py
    ├── test_llm_simple_url.py
    ├── test_llmtxt.py
    ├── test_main.py
    ├── test_memory_macos.py
    ├── test_multi_config.py
    ├── test_normalize_url.py
    ├── test_preserve_https_for_internal_links.py
    ├── test_scraping_strategy.py
    ├── test_virtual_scroll.py
    ├── test_web_crawler.py
    └── test_webhook_feature.sh
└── uv.lock


/.claude/settings.local.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "permissions": {
 3 |     "allow": [
 4 |       "Bash(cd:*)",
 5 |       "Bash(python3:*)",
 6 |       "Bash(python:*)",
 7 |       "Bash(grep:*)",
 8 |       "Bash(mkdir:*)",
 9 |       "Bash(cp:*)",
10 |       "Bash(rm:*)",
11 |       "Bash(true)",
12 |       "Bash(./package-extension.sh:*)",
13 |       "Bash(find:*)",
14 |       "Bash(chmod:*)",
15 |       "Bash(rg:*)",
16 |       "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 5 -B 5 \"Script Builder\" docs/md_v2/apps/crawl4ai-assistant/)",
17 |       "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 30 \"generateCode\\(events, format\\)\" docs/md_v2/apps/crawl4ai-assistant/content/content.js)",
18 |       "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg \"<style>\" docs/md_v2/apps/crawl4ai-assistant/index.html -A 5)",
19 |       "Bash(git checkout:*)",
20 |       "Bash(docker logs:*)",
21 |       "Bash(curl:*)",
22 |       "Bash(docker compose:*)",
23 |       "Bash(./test-final-integration.sh:*)",
24 |       "Bash(mv:*)"
25 |     ]
26 |   },
27 |   "enableAllProjectMcpServers": false
28 | }


--------------------------------------------------------------------------------
/.env.txt:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY = "YOUR_GROQ_API"
2 | OPENAI_API_KEY = "YOUR_OPENAI_API"
3 | ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API"
4 | # You can add more API keys here
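
A minimal sketch of pulling these keys into the environment with python-dotenv (an assumption: the repo may load them differently; the loader must be pointed at ".env.txt" explicitly, since it defaults to ".env"):

    import os
    from dotenv import load_dotenv  # assumes python-dotenv is installed

    # python-dotenv tolerates the spaces and quotes used in this template
    load_dotenv(".env.txt")
    print("GROQ key set:", os.getenv("GROQ_API_KEY") not in (None, "YOUR_GROQ_API"))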


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | *.html linguist-documentation
 3 | docs/* linguist-documentation
 4 | docs/examples/* linguist-documentation
 5 | docs/md_v2/* linguist-documentation
 6 | 
 7 | # Explicitly mark Python as the main language
 8 | *.py linguist-detectable=true
 9 | *.py linguist-language=Python
10 | 
11 | # Exclude HTML from language statistics
12 | *.html linguist-detectable=false
13 | 


--------------------------------------------------------------------------------
/.github/DISCUSSION_TEMPLATE/feature-requests.yml:
--------------------------------------------------------------------------------
 1 | title: "[Feature Request]: "
 2 | labels: ["⚙️ New"]
 3 | body:
 4 |   - type: markdown
 5 |     attributes:
 6 |       value: |
 7 |         Thank you for your interest in suggesting a new feature! Before you submit, please take a moment to check whether a similar request
 8 |         already exists in this discussions category to avoid duplicates. 😊
 9 | 
10 |   - type: textarea
11 |     id: needs_to_be_done
12 |     attributes:
13 |       label: What needs to be done?
14 |       description: Please describe the feature or functionality you'd like to see.
15 |       placeholder: "e.g., Return alt text along with images scraped from a webpage in the Result"
16 |     validations:
17 |       required: true
18 | 
19 |   - type: textarea
20 |     id: problem_to_solve
21 |     attributes:
22 |       label: What problem does this solve?
23 |       description: Explain the pain point or issue this feature will help address.
24 |       placeholder: "e.g., Bypass Captchas added by cloudflare"
25 |     validations:
26 |       required: true
27 | 
28 |   - type: textarea
29 |     id: target_users
30 |     attributes:
31 |       label: Target users/beneficiaries
32 |       description: Who would benefit from this feature? (e.g., specific teams, developers, users, etc.)
33 |       placeholder: "e.g., Marketing teams, developers"
34 |     validations:
35 |       required: false
36 | 
37 |   - type: textarea
38 |     id: current_workarounds
39 |     attributes:
40 |       label: Current alternatives/workarounds
41 |       description: Are there any existing solutions or workarounds? How does this feature improve upon them?
42 |       placeholder: "e.g., Users manually select the css classes mapped to data fields to extract them"
43 |     validations:
44 |       required: false
45 | 
46 |   - type: markdown
47 |     attributes:
48 |       value: |
49 |         ### 💡 Implementation Ideas
50 | 
51 |   - type: textarea
52 |     id: proposed_approach
53 |     attributes:
54 |       label: Proposed approach
55 |       description: Share any ideas you have for how this feature could be implemented. Point out any challenges you foresee
56 |         and the success metrics for this feature.
57 |       placeholder: "e.g., Implement a breadth first traversal algorithm for scraper"
58 |     validations:
59 |       required: false
60 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | # GitHub Sponsors
4 | github: unclecode
5 | 
6 | # Custom links for enterprise inquiries (uncomment when ready)
7 | # custom: ["https://crawl4ai.com/enterprise"]


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 |   - name: Feature Requests
4 |     url: https://github.com/unclecode/crawl4ai/discussions/categories/feature-requests
5 |     about: "Suggest new features or enhancements for Crawl4AI"
6 |   - name: Forums - Q&A
7 |     url: https://github.com/unclecode/crawl4ai/discussions/categories/forums-q-a
8 |     about: "Ask questions or engage in general discussions about Crawl4AI"
9 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | ## Summary
 2 | Please include a summary of the change and/or which issues are fixed.
 3 | 
 4 | e.g., `Fixes #123` (tag GitHub issue numbers in this format so the issue is automatically linked to your PR)
 5 | 
 6 | ## List of files changed and why
 7 | e.g., quickstart.py - updated the example to match the new changes
 8 | 
 9 | ## How Has This Been Tested?
10 | Please describe the tests that you ran to verify your changes.
11 | 
12 | ## Checklist:
13 | 
14 | - [ ] My code follows the style guidelines of this project
15 | - [ ] I have performed a self-review of my own code
16 | - [ ] I have commented my code, particularly in hard-to-understand areas
17 | - [ ] I have made corresponding changes to the documentation
18 | - [ ] I have added/updated unit tests that prove my fix is effective or that my feature works
19 | - [ ] New and existing unit tests pass locally with my changes
20 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: Discord GitHub Notifications
 2 | 
 3 | on:
 4 |   issues:
 5 |     types: [opened]
 6 |   issue_comment:
 7 |     types: [created]
 8 |   pull_request:
 9 |     types: [opened]
10 |   discussion:
11 |     types: [created]
12 |   watch:
13 |     types: [started]
14 | 
15 | jobs:
16 |   notify-discord:
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       - name: Send to Google Apps Script (Stars only)
20 |         if: github.event_name == 'watch'
21 |         run: |
22 |           curl -fSs -X POST "${{ secrets.GOOGLE_SCRIPT_ENDPOINT }}" \
23 |             -H 'Content-Type: application/json' \
24 |             -d '{"url":"${{ github.event.sender.html_url }}"}'
25 |       - name: Set webhook based on event type
26 |         id: set-webhook
27 |         run: |
28 |           if [ "${{ github.event_name }}" == "discussion" ]; then
29 |             echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
30 |           elif [ "${{ github.event_name }}" == "watch" ]; then
31 |             echo "webhook=${{ secrets.DISCORD_STAR_GAZERS }}" >> $GITHUB_OUTPUT
32 |           else
33 |             echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
34 |           fi
35 | 
36 |       - name: Discord Notification
37 |         uses: Ilshidur/action-discord@master
38 |         env:
39 |           DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }}
40 |         with:
41 |           args: |
42 |             ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) || 
43 |             github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
44 |             github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
45 |             github.event_name == 'watch' && format('⭐ {0} starred Crawl4AI 🥳! Check out their profile: {1}', github.event.sender.login, github.event.sender.html_url) ||
46 |             format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
47 | 
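
A note on the `args` expression above: GitHub Actions expressions have no if/else, so the workflow chains `condition && value || fallback`. Each `format()` call is guarded by its event-name check, and the final unguarded `format()` (for discussions) serves as the default branch.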


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | recursive-include crawl4ai/js_snippet *.js


--------------------------------------------------------------------------------
/SPONSORS.md:
--------------------------------------------------------------------------------
 1 | # 💖 Sponsors & Supporters
 2 | 
 3 | Thank you to everyone supporting Crawl4AI! Your sponsorship helps keep this project open-source and actively maintained.
 4 | 
 5 | ## 👑 Founding Sponsors
 6 | *The first 50 sponsors who believed in our vision - permanently recognized*
 7 | 
 8 | <!-- Founding sponsors will be listed here with special recognition -->
 9 | 🎉 **Become a Founding Sponsor!** Only [X/50] spots remaining! [Join now →](https://github.com/sponsors/unclecode)
10 | 
11 | ---
12 | 
13 | ## 🏢 Data Infrastructure Partners ($2000/month)
14 | *These organizations are building their data sovereignty with Crawl4AI at the core*
15 | 
16 | <!-- Data Infrastructure Partners will be listed here -->
17 | *Be the first Data Infrastructure Partner! [Join us →](https://github.com/sponsors/unclecode)*
18 | 
19 | ---
20 | 
21 | ## 💼 Growing Teams ($500/month)
22 | *Teams scaling their data extraction with Crawl4AI*
23 | 
24 | <!-- Growing Teams will be listed here -->
25 | *Your team could be here! [Become a sponsor →](https://github.com/sponsors/unclecode)*
26 | 
27 | ---
28 | 
29 | ## 🚀 Builders ($50/month)
30 | *Developers and entrepreneurs building with Crawl4AI*
31 | 
32 | <!-- Builders will be listed here -->
33 | *Join the builders! [Start sponsoring →](https://github.com/sponsors/unclecode)*
34 | 
35 | ---
36 | 
37 | ## 🌱 Believers ($5/month)
38 | *The community supporting data democratization*
39 | 
40 | <!-- Believers will be listed here -->
41 | *Thank you to all our community believers!*
42 | 
43 | ---
44 | 
45 | ## 🤝 Want to Sponsor?
46 | 
47 | Crawl4AI is the #1 trending open-source web crawler. We're building the future of data extraction - where organizations own their data pipelines instead of relying on rate-limited APIs.
48 | 
49 | ### Available Sponsorship Tiers:
50 | - **🌱 Believer** ($5/mo) - Support the movement
51 | - **🚀 Builder** ($50/mo) - Priority support & early access
52 | - **💼 Growing Team** ($500/mo) - Bi-weekly syncs & optimization
53 | - **🏢 Data Infrastructure Partner** ($2000/mo) - Full partnership & dedicated support
54 | 
55 | [View all tiers and benefits →](https://github.com/sponsors/unclecode)
56 | 
57 | ### Enterprise & Custom Partnerships
58 | 
59 | Building data extraction at scale? Need dedicated support or infrastructure? Let's talk about a custom partnership.
60 | 
61 | 📧 Contact: [hello@crawl4ai.com](mailto:hello@crawl4ai.com) | 📅 [Schedule a call](https://calendar.app.google/rEpvi2UBgUQjWHfJ9)
62 | 
63 | ---
64 | 
65 | *This list is updated regularly. Sponsors at $50+ tiers can submit their logos via [hello@crawl4ai.com](mailto:hello@crawl4ai.com)*


--------------------------------------------------------------------------------
/cliff.toml:
--------------------------------------------------------------------------------
 1 | [changelog]
 2 | # Template format
 3 | header = """
 4 | # Changelog\n
 5 | All notable changes to this project will be documented in this file.\n
 6 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 7 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n
 8 | """
 9 | 
10 | # Organize commits by type
11 | [git]
12 | conventional_commits = true
13 | filter_unconventional = true
14 | commit_parsers = [
15 |     { message = "^feat", group = "Added"},
16 |     { message = "^fix", group = "Fixed"},
17 |     { message = "^doc", group = "Documentation"},
18 |     { message = "^perf", group = "Performance"},
19 |     { message = "^refactor", group = "Changed"},
20 |     { message = "^style", group = "Changed"},
21 |     { message = "^test", group = "Testing"},
22 |     { message = "^chore\\(release\\): prepare for", skip = true},
23 |     { message = "^chore", group = "Miscellaneous Tasks"},
24 | ]
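
As a worked example of these parsers: a commit titled `feat: add regex extraction strategy` is grouped under "Added", `fix: handle empty sitemaps` under "Fixed", and `chore(release): prepare for 0.7.6` is skipped entirely; with `filter_unconventional = true`, commits that do not follow Conventional Commits are dropped from the changelog altogether.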


--------------------------------------------------------------------------------
/crawl4ai/__version__.py:
--------------------------------------------------------------------------------
1 | # crawl4ai/__version__.py
2 | 
3 | # This is the version that will be used for stable releases
4 | __version__ = "0.7.6"
5 | 
6 | # For nightly builds, this gets set during build process
7 | __nightly_version__ = None
8 | 
9 | 


--------------------------------------------------------------------------------
/crawl4ai/crawlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/__init__.py


--------------------------------------------------------------------------------
/crawl4ai/crawlers/amazon_product/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/amazon_product/__init__.py


--------------------------------------------------------------------------------
/crawl4ai/crawlers/amazon_product/crawler.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from crawl4ai.hub import BaseCrawler
 4 | 
 5 | __meta__ = {
 6 |     "version": "1.2.0",
 7 |     "tested_on": ["amazon.com"],
 8 |     "rate_limit": "50 RPM",
 9 |     "schema": {"product": ["name", "price"]}
10 | }
11 | 
12 | class AmazonProductCrawler(BaseCrawler):
13 |     async def run(self, url: str, **kwargs) -> str:
14 |         try:
15 |             self.logger.info(f"Crawling {url}")
16 |             return '{"product": {"name": "Test Amazon Product"}}'
17 |         except Exception as e:
18 |             self.logger.error(f"Crawl failed: {str(e)}")
19 |             return json.dumps({
20 |                 "error": str(e),
21 |                 "metadata": self.meta  # set by CrawlerHub from __meta__
22 |             })


--------------------------------------------------------------------------------
/crawl4ai/crawlers/google_search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/google_search/__init__.py


--------------------------------------------------------------------------------
/crawl4ai/deep_crawling/__init__.py:
--------------------------------------------------------------------------------
 1 | # deep_crawling/__init__.py
 2 | from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
 3 | from .bfs_strategy import BFSDeepCrawlStrategy
 4 | from .bff_strategy import BestFirstCrawlingStrategy
 5 | from .dfs_strategy import DFSDeepCrawlStrategy
 6 | from .filters import (
 7 |     FilterChain,
 8 |     ContentTypeFilter,
 9 |     DomainFilter,
10 |     URLFilter,
11 |     URLPatternFilter,
12 |     FilterStats,
13 |     ContentRelevanceFilter,
14 |     SEOFilter
15 | )
16 | from .scorers import (
17 |     KeywordRelevanceScorer,
18 |     URLScorer,
19 |     CompositeScorer,
20 |     DomainAuthorityScorer,
21 |     FreshnessScorer,
22 |     PathDepthScorer,
23 |     ContentTypeScorer
24 | )
25 | 
26 | __all__ = [
27 |     "DeepCrawlDecorator",
28 |     "DeepCrawlStrategy",
29 |     "BFSDeepCrawlStrategy",
30 |     "BestFirstCrawlingStrategy",
31 |     "DFSDeepCrawlStrategy",
32 |     "FilterChain",
33 |     "ContentTypeFilter",
34 |     "DomainFilter",
35 |     "URLFilter",
36 |     "URLPatternFilter",
37 |     "FilterStats",
38 |     "ContentRelevanceFilter",
39 |     "SEOFilter",
40 |     "KeywordRelevanceScorer",
41 |     "URLScorer",
42 |     "CompositeScorer",
43 |     "DomainAuthorityScorer",
44 |     "FreshnessScorer",
45 |     "PathDepthScorer",
46 |     "ContentTypeScorer",
47 | ]
48 | 
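
The exports above are the public deep-crawl surface: three traversal strategies plus composable filters and scorers. A hedged sketch of wiring them together (the constructor keyword names such as `max_depth`, `filter_chain`, and `url_scorer` are assumptions based on common usage, not confirmed by this file):

    from crawl4ai.deep_crawling import (
        BFSDeepCrawlStrategy, FilterChain, DomainFilter, KeywordRelevanceScorer,
    )

    # Keep the crawl on one domain and steer it toward relevant pages.
    chain = FilterChain([DomainFilter(allowed_domains=["example.com"])])
    scorer = KeywordRelevanceScorer(keywords=["pricing", "docs"])

    strategy = BFSDeepCrawlStrategy(
        max_depth=2,          # assumed kwarg: link hops from the start URL
        filter_chain=chain,   # assumed kwarg: applied to every discovered URL
        url_scorer=scorer,    # assumed kwarg: ranks URLs in the frontier
    )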


--------------------------------------------------------------------------------
/crawl4ai/html2text/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import main
2 | 
3 | main()
4 | 


--------------------------------------------------------------------------------
/crawl4ai/html2text/_typing.py:
--------------------------------------------------------------------------------
1 | class OutCallback:
2 |     def __call__(self, s: str) -> None:
3 |         ...
4 | 


--------------------------------------------------------------------------------
/crawl4ai/html2text/elements.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, Optional
 2 | 
 3 | 
 4 | class AnchorElement:
 5 |     __slots__ = ["attrs", "count", "outcount"]
 6 | 
 7 |     def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
 8 |         self.attrs = attrs
 9 |         self.count = count
10 |         self.outcount = outcount
11 | 
12 | 
13 | class ListElement:
14 |     __slots__ = ["name", "num"]
15 | 
16 |     def __init__(self, name: str, num: int):
17 |         self.name = name
18 |         self.num = num
19 | 


--------------------------------------------------------------------------------
/crawl4ai/hub.py:
--------------------------------------------------------------------------------
 1 | # crawl4ai/hub.py
 2 | from abc import ABC, abstractmethod
 3 | from typing import Dict, Type, Union
 4 | import logging
 5 | import importlib
 6 | from pathlib import Path
 7 | import inspect
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class BaseCrawler(ABC):
13 |     def __init__(self):
14 |         self.logger = logging.getLogger(self.__class__.__name__)
15 |         
16 |     @abstractmethod
17 |     async def run(self, url: str = "", **kwargs) -> str:
18 |         """
19 |         Implement this method to return JSON string.
20 |         Must accept URL + arbitrary kwargs for flexibility.
21 |         """
22 |         pass
23 | 
24 |     def __init_subclass__(cls, **kwargs):
25 |         """Enforce interface validation on subclassing"""
26 |         super().__init_subclass__(**kwargs)
27 |         
28 |         # Verify run method signature
29 |         run_method = cls.run
30 |         if not run_method.__code__.co_argcount >= 2:  # self + url
31 |             raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'")
32 |             
33 |         # Verify async nature
34 |         if not inspect.iscoroutinefunction(run_method):
35 |             raise TypeError(f"{cls.__name__}.run must be async")
36 | 
37 | class CrawlerHub:
38 |     _crawlers: Dict[str, Type[BaseCrawler]] = {}
39 | 
40 |     @classmethod
41 |     def _discover_crawlers(cls):
42 |         """Dynamically load crawlers from /crawlers in 3 lines"""
43 |         base_path = Path(__file__).parent / "crawlers"
44 |         for crawler_dir in base_path.iterdir():
45 |             if crawler_dir.is_dir():
46 |                 try:
47 |                     module = importlib.import_module(
48 |                         f"crawl4ai.crawlers.{crawler_dir.name}.crawler"
49 |                     )
50 |                     for attr in dir(module):
51 |                         cls._maybe_register_crawler(
52 |                             getattr(module, attr), crawler_dir.name
53 |                         )
54 |                 except Exception as e:
55 |                     logger.warning(f"Failed {crawler_dir.name}: {str(e)}")
56 | 
57 |     @classmethod
58 |     def _maybe_register_crawler(cls, obj, name: str):
59 |         """Brilliant one-liner registration"""
60 |         if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler:
61 |             module = importlib.import_module(obj.__module__)
62 |             obj.meta = getattr(module, "__meta__", {})
63 |             cls._crawlers[name] = obj
64 | 
65 |     @classmethod
66 |     def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
67 |         if not cls._crawlers:
68 |             cls._discover_crawlers()
69 |         return cls._crawlers.get(name)


--------------------------------------------------------------------------------
/crawl4ai/js_snippet/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | # Load a JS snippet by name from this script's folder and return its contents as a string
 5 | def load_js_script(script_name):
 6 |     # Get the path of the current script
 7 |     current_script_path = os.path.dirname(os.path.realpath(__file__))
 8 |     # Get the path of the script to load
 9 |     script_path = os.path.join(current_script_path, script_name + ".js")
10 |     # Check if the script exists
11 |     if not os.path.exists(script_path):
12 |         raise ValueError(
13 |             f"Script {script_name} not found in the folder {current_script_path}"
14 |         )
15 |     # Load the content of the script
16 |     with open(script_path, "r") as f:
17 |         script_content = f.read()
18 |     return script_content
19 | 
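
Usage is by bare snippet name; the loader appends the ".js" extension and raises if the file is missing. For example, with the snippets in this folder:

    from crawl4ai.js_snippet import load_js_script

    js = load_js_script("remove_overlay_elements")  # reads remove_overlay_elements.js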


--------------------------------------------------------------------------------
/crawl4ai/js_snippet/navigator_overrider.js:
--------------------------------------------------------------------------------
 1 | // Pass the Permissions Test.
 2 | const originalQuery = window.navigator.permissions.query;
 3 | window.navigator.permissions.query = (parameters) =>
 4 |     parameters.name === "notifications"
 5 |         ? Promise.resolve({ state: Notification.permission })
 6 |         : originalQuery(parameters);
 7 | Object.defineProperty(navigator, "webdriver", {
 8 |     get: () => undefined,
 9 | });
10 | window.navigator.chrome = {
11 |     runtime: {},
12 |     // Add other properties if necessary
13 | };
14 | Object.defineProperty(navigator, "plugins", {
15 |     get: () => [1, 2, 3, 4, 5],
16 | });
17 | Object.defineProperty(navigator, "languages", {
18 |     get: () => ["en-US", "en"],
19 | });
20 | Object.defineProperty(document, "hidden", {
21 |     get: () => false,
22 | });
23 | Object.defineProperty(document, "visibilityState", {
24 |     get: () => "visible",
25 | });
26 | 
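
Snippets like this only work if they run before any page script. A hedged sketch of injecting it by hand with Playwright (illustrative only; this is not a claim about how crawl4ai wires the snippet internally):

    import asyncio
    from playwright.async_api import async_playwright
    from crawl4ai.js_snippet import load_js_script

    async def main():
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            # Runs the overrides before any page JS, on every navigation
            await page.add_init_script(load_js_script("navigator_overrider"))
            await page.goto("https://example.com")
            print(await page.evaluate("navigator.webdriver"))  # undefined -> None
            await browser.close()

    asyncio.run(main())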


--------------------------------------------------------------------------------
/crawl4ai/js_snippet/update_image_dimensions.js:
--------------------------------------------------------------------------------
 1 | () => {
 2 |     return new Promise((resolve) => {
 3 |         const filterImage = (img) => {
 4 |             // Filter out images that are too small
 5 |             if (img.width < 100 && img.height < 100) return false;
 6 | 
 7 |             // Filter out images that are not visible
 8 |             const rect = img.getBoundingClientRect();
 9 |             if (rect.width === 0 || rect.height === 0) return false;
10 | 
11 |             // Filter out images with certain class names (e.g., icons, thumbnails)
12 |             if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false;
13 | 
14 |             // Filter out images with certain patterns in their src (e.g., placeholder images)
15 |             if (img.src.includes("placeholder") || img.src.includes("icon")) return false;
16 | 
17 |             return true;
18 |         };
19 | 
20 |         const images = Array.from(document.querySelectorAll("img")).filter(filterImage);
21 |         let imagesLeft = images.length;
22 | 
23 |         if (imagesLeft === 0) {
24 |             resolve();
25 |             return;
26 |         }
27 | 
28 |         const checkImage = (img) => {
29 |             if (img.complete && img.naturalWidth !== 0) {
30 |                 img.setAttribute("width", img.naturalWidth);
31 |                 img.setAttribute("height", img.naturalHeight);
32 |                 imagesLeft--;
33 |                 if (imagesLeft === 0) resolve();
34 |             }
35 |         };
36 | 
37 |         images.forEach((img) => {
38 |             checkImage(img);
39 |             if (!img.complete) {
40 |                 img.onload = () => {
41 |                     checkImage(img);
42 |                 };
43 |                 img.onerror = () => {
44 |                     imagesLeft--;
45 |                     if (imagesLeft === 0) resolve();
46 |                 };
47 |             }
48 |         });
49 | 
50 |         // Fallback: resolve after 5 seconds even if some images
51 |         // never fire their load/error events
52 |         setTimeout(() => resolve(), 5000);
53 |     });
54 | };
55 | 


--------------------------------------------------------------------------------
/crawl4ai/legacy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/legacy/__init__.py


--------------------------------------------------------------------------------
/crawl4ai/legacy/version_manager.py:
--------------------------------------------------------------------------------
 1 | # version_manager.py
 2 | from pathlib import Path
 3 | from packaging import version
 4 | from . import __version__
 5 | 
 6 | 
 7 | class VersionManager:
 8 |     def __init__(self):
 9 |         self.home_dir = Path.home() / ".crawl4ai"
10 |         self.version_file = self.home_dir / "version.txt"
11 | 
12 |     def get_installed_version(self):
13 |         """Get the version recorded in home directory"""
14 |         if not self.version_file.exists():
15 |             return None
16 |         try:
17 |             return version.parse(self.version_file.read_text().strip())
18 |         except Exception:
19 |             return None
20 | 
21 |     def update_version(self):
22 |         """Update the version file to current library version"""
23 |         self.version_file.write_text(__version__.__version__)
24 | 
25 |     def needs_update(self):
26 |         """Check if database needs update based on version"""
27 |         installed = self.get_installed_version()
28 |         current = version.parse(__version__.__version__)
29 |         return installed is None or installed < current
30 | 
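
A short sketch of the intended startup flow (hypothetical call site; the legacy import path mirrors this file's location):

    from crawl4ai.legacy.version_manager import VersionManager

    vm = VersionManager()
    if vm.needs_update():
        # ... run version-dependent migrations here ...
        vm.update_version()   # record the current library version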


--------------------------------------------------------------------------------
/crawl4ai/script/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | C4A-Script: A domain-specific language for web automation in Crawl4AI
 3 | """
 4 | 
 5 | from .c4a_compile import C4ACompiler, compile, validate, compile_file
 6 | from .c4a_result import (
 7 |     CompilationResult, 
 8 |     ValidationResult, 
 9 |     ErrorDetail, 
10 |     WarningDetail,
11 |     ErrorType, 
12 |     Severity, 
13 |     Suggestion
14 | )
15 | 
16 | __all__ = [
17 |     # Main compiler
18 |     "C4ACompiler",
19 |     
20 |     # Convenience functions
21 |     "compile",
22 |     "validate", 
23 |     "compile_file",
24 |     
25 |     # Result types
26 |     "CompilationResult",
27 |     "ValidationResult",
28 |     "ErrorDetail",
29 |     "WarningDetail",
30 |     
31 |     # Enums
32 |     "ErrorType",
33 |     "Severity",
34 |     "Suggestion"
35 | ]
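
A hedged sketch of the convenience functions exported above. Only the function and result-type names are taken from this file; the C4A statements and the result fields (`success`, `js_code`) are assumptions:

    from crawl4ai.script import compile, validate

    script = "GO https://example.com\nWAIT 2\nCLICK `.load-more`"

    check = validate(script)   # ValidationResult: syntax check only (assumed)
    result = compile(script)   # CompilationResult: C4A -> JS (fields assumed)
    if getattr(result, "success", False):
        print(getattr(result, "js_code", None))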


--------------------------------------------------------------------------------
/deploy/docker/.dockerignore:
--------------------------------------------------------------------------------
 1 | # .dockerignore
 2 | *
 3 | 
 4 | # Allow specific files and directories when using local installation
 5 | !crawl4ai/
 6 | !docs/
 7 | !deploy/docker/
 8 | !setup.py
 9 | !pyproject.toml
10 | !README.md
11 | !LICENSE
12 | !MANIFEST.in
13 | !setup.cfg
14 | !mkdocs.yml
15 | 
16 | .git/
17 | __pycache__/
18 | *.pyc
19 | *.pyo
20 | *.pyd
21 | .DS_Store
22 | .env
23 | .venv
24 | venv/
25 | tests/
26 | coverage.xml
27 | *.log
28 | *.swp
29 | *.egg-info/
30 | dist/
31 | build/


--------------------------------------------------------------------------------
/deploy/docker/.llm.env.example:
--------------------------------------------------------------------------------
 1 | # LLM Provider Keys
 2 | OPENAI_API_KEY=your_openai_key_here
 3 | DEEPSEEK_API_KEY=your_deepseek_key_here
 4 | ANTHROPIC_API_KEY=your_anthropic_key_here
 5 | GROQ_API_KEY=your_groq_key_here
 6 | TOGETHER_API_KEY=your_together_key_here
 7 | MISTRAL_API_KEY=your_mistral_key_here
 8 | GEMINI_API_TOKEN=your_gemini_key_here
 9 | 
10 | # Optional: Override the default LLM provider
11 | # Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
12 | # If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
13 | # LLM_PROVIDER=anthropic/claude-3-opus
14 | 
15 | # Optional: Global LLM temperature setting (0.0-2.0)
16 | # Controls randomness in responses. Lower = more focused, Higher = more creative
17 | # LLM_TEMPERATURE=0.7
18 | 
19 | # Optional: Global custom API base URL
20 | # Use this to point to custom endpoints or proxy servers
21 | # LLM_BASE_URL=https://api.custom.com/v1
22 | 
23 | # Optional: Provider-specific temperature overrides
24 | # These take precedence over the global LLM_TEMPERATURE
25 | # OPENAI_TEMPERATURE=0.5
26 | # ANTHROPIC_TEMPERATURE=0.3
27 | # GROQ_TEMPERATURE=0.8
28 | 
29 | # Optional: Provider-specific base URL overrides
30 | # Use for provider-specific proxy endpoints
31 | # OPENAI_BASE_URL=https://custom-openai.company.com/v1
32 | # GROQ_BASE_URL=https://custom-groq.company.com/v1
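
The precedence described above (provider-specific variable over the global one, global over the config default) can be sketched in Python; `resolve_temperature` is a hypothetical helper for illustration, not part of the server code:

```python
import os

def resolve_temperature(provider: str, default: float = 0.7) -> float:
    # e.g. provider "openai/gpt-4o-mini" -> OPENAI_TEMPERATURE wins over LLM_TEMPERATURE,
    # which in turn wins over the hard-coded default.
    prefix = provider.split("/")[0].upper()
    for var in (f"{prefix}_TEMPERATURE", "LLM_TEMPERATURE"):
        val = os.getenv(var)
        if val is not None:
            return float(val)
    return default
```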


--------------------------------------------------------------------------------
/deploy/docker/crawler_pool.py:
--------------------------------------------------------------------------------
 1 | # crawler_pool.py  (new file)
 2 | import asyncio, json, hashlib, time, psutil
 3 | from contextlib import suppress
 4 | from typing import Dict
 5 | from crawl4ai import AsyncWebCrawler, BrowserConfig
 6 | from utils import load_config
 7 | 
 8 | CONFIG = load_config()
 9 | 
10 | POOL: Dict[str, AsyncWebCrawler] = {}
11 | LAST_USED: Dict[str, float] = {}
12 | LOCK = asyncio.Lock()
13 | 
14 | MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0)   # % RAM – refuse new browsers above this
15 | IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800)   # close if unused for 30 min
16 | 
17 | def _sig(cfg: BrowserConfig) -> str:
18 |     payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
19 |     return hashlib.sha1(payload.encode()).hexdigest()
20 | 
21 | async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
22 |     sig = _sig(cfg)
23 |     async with LOCK:
24 |         # Reuse a live browser whose config hashes to the same signature
25 |         if sig in POOL:
26 |             LAST_USED[sig] = time.time()
27 |             return POOL[sig]
28 |         if psutil.virtual_memory().percent >= MEM_LIMIT:
29 |             raise MemoryError("RAM pressure – new browser denied")
30 |         try:
31 |             crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
32 |             await crawler.start()
33 |         except Exception as e:
34 |             # Startup failed: make sure no half-initialized entry lingers
35 |             POOL.pop(sig, None)
36 |             LAST_USED.pop(sig, None)
37 |             raise RuntimeError(f"Failed to start browser: {e}") from e
38 |         POOL[sig] = crawler
39 |         LAST_USED[sig] = time.time()
40 |         return crawler
41 | 
42 | async def close_all():
43 |     async with LOCK:
44 |         await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
45 |         POOL.clear()
46 |         LAST_USED.clear()
47 | 
48 | async def janitor():
49 |     while True:
50 |         await asyncio.sleep(60)
51 |         now = time.time()
52 |         async with LOCK:
53 |             for sig, crawler in list(POOL.items()):
54 |                 if now - LAST_USED[sig] > IDLE_TTL:
55 |                     with suppress(Exception):
56 |                         await crawler.close()
57 |                     POOL.pop(sig, None)
58 |                     LAST_USED.pop(sig, None)
59 | 

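A minimal usage sketch for the pool above, assuming the module is importable as `crawler_pool` (the server runs from /app per supervisord.conf):

```python
import asyncio
from crawl4ai import BrowserConfig
import crawler_pool

async def main():
    asyncio.create_task(crawler_pool.janitor())          # reap idle browsers in the background
    crawler = await crawler_pool.get_crawler(BrowserConfig(headless=True))
    result = await crawler.arun("https://example.com")   # identical configs reuse the same browser
    print(result.success)
    await crawler_pool.close_all()

asyncio.run(main())
```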

--------------------------------------------------------------------------------
/deploy/docker/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi>=0.115.12
 2 | uvicorn>=0.34.2
 3 | gunicorn>=23.0.0
 4 | slowapi==0.1.9
 5 | prometheus-fastapi-instrumentator>=7.1.0
 6 | redis>=5.2.1
 7 | jwt>=1.3.1
 8 | dnspython>=2.7.0
 9 | email-validator==2.2.0
10 | sse-starlette==2.2.1
11 | pydantic>=2.11
12 | rank-bm25==0.2.2
13 | anyio==4.9.0
14 | PyJWT==2.10.1
15 | mcp>=1.18.0
16 | websockets>=15.0.1
17 | httpx[http2]>=0.27.2
18 | 


--------------------------------------------------------------------------------
/deploy/docker/supervisord.conf:
--------------------------------------------------------------------------------
 1 | [supervisord]
 2 | nodaemon=true                   ; Run supervisord in the foreground
 3 | logfile=/dev/null               ; Log supervisord output to stdout/stderr
 4 | logfile_maxbytes=0
 5 | 
 6 | [program:redis]
 7 | command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
 8 | user=appuser                    ; Run redis as our non-root user
 9 | autorestart=true
10 | priority=10
11 | stdout_logfile=/dev/stdout      ; Redirect redis stdout to container stdout
12 | stdout_logfile_maxbytes=0
13 | stderr_logfile=/dev/stderr      ; Redirect redis stderr to container stderr
14 | stderr_logfile_maxbytes=0
15 | 
16 | [program:gunicorn]
17 | command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
18 | directory=/app                  ; Working directory for the app
19 | user=appuser                    ; Run gunicorn as our non-root user
20 | autorestart=true
21 | priority=20
22 | environment=PYTHONUNBUFFERED=1  ; Ensure Python output is sent straight to logs
23 | stdout_logfile=/dev/stdout      ; Redirect gunicorn stdout to container stdout
24 | stdout_logfile_maxbytes=0
25 | stderr_logfile=/dev/stderr      ; Redirect gunicorn stderr to container stderr
26 | stderr_logfile_maxbytes=0
27 | 
28 | # Optional: Add filebeat or other logging agents here if needed


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.8'
 2 | 
 3 | # Shared configuration for all environments
 4 | x-base-config: &base-config
 5 |   ports:
 6 |     - "11235:11235"  # Gunicorn port
 7 |   env_file:
 8 |     - .llm.env       # API keys (create from .llm.env.example)
 9 |   environment:
10 |     - OPENAI_API_KEY=${OPENAI_API_KEY:-}
11 |     - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
12 |     - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
13 |     - GROQ_API_KEY=${GROQ_API_KEY:-}
14 |     - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
15 |     - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
16 |     - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
17 |     - LLM_PROVIDER=${LLM_PROVIDER:-}  # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
18 |   volumes:
19 |     - /dev/shm:/dev/shm  # Chromium performance
20 |   deploy:
21 |     resources:
22 |       limits:
23 |         memory: 4G
24 |       reservations:
25 |         memory: 1G
26 |   restart: unless-stopped
27 |   healthcheck:
28 |     test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
29 |     interval: 30s
30 |     timeout: 10s
31 |     retries: 3
32 |     start_period: 40s
33 |   user: "appuser"
34 | 
35 | services:
36 |   crawl4ai:
37 |     # 1. Default: Pull multi-platform test image from Docker Hub
38 |     # 2. Override with local image via: IMAGE=local-test docker compose up
39 |     image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}}
40 |     
41 |     # Local build config (used with --build)
42 |     build:
43 |       context: .
44 |       dockerfile: Dockerfile
45 |       args:
46 |         INSTALL_TYPE: ${INSTALL_TYPE:-default}
47 |         ENABLE_GPU: ${ENABLE_GPU:-false}
48 |     
49 |     # Inherit shared config
50 |     <<: *base-config


--------------------------------------------------------------------------------
/docs/apps/linkdin/schemas/company_card.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "LinkedIn Company Search Result Card",
 3 |   "baseSelector": "div[data-chameleon-result-urn][data-view-name=\"search-entity-result-universal-template\"]",
 4 |   "baseFields": [
 5 |     {
 6 |       "name": "chameleon_result_urn",
 7 |       "type": "attribute",
 8 |       "attribute": "data-chameleon-result-urn"
 9 |     },
10 |     {
11 |       "name": "view_name",
12 |       "type": "attribute",
13 |       "attribute": "data-view-name"
14 |     }
15 |   ],
16 |   "fields": [
17 |     {
18 |       "name": "handle",
19 |       "selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
20 |       "type": "attribute",
21 |       "attribute": "href"
22 |     },
23 |     {
24 |       "name": "profile_image",
25 |       "selector": "div.ivm-image-view-model img",
26 |       "type": "attribute",
27 |       "attribute": "src"
28 |     },
29 |     {
30 |       "name": "name",
31 |       "selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
32 |       "type": "text"
33 |     },
34 |     {
35 |       "name": "descriptor",
36 |       "selector": "div.mb1 > div[class*=\"t-14 t-black\"]",
37 |       "type": "text"
38 |     },
39 |     {
40 |       "name": "about",
41 |       "selector": "p.entity-result__summary--2-lines",
42 |       "type": "text"
43 |     },
44 |     {
45 |       "name": "followers",
46 |       "selector": "div.mb1 > div:nth-of-type(3)",
47 |       "type": "regex",
48 |       "pattern": "(\\d+[KM]?) followers"
49 |     }
50 |   ]
51 | }
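
A schema like this is meant to be fed to Crawl4AI's CSS extraction strategy. A minimal sketch; the search URL is illustrative, and LinkedIn results generally require an authenticated browser session:

```python
import asyncio, json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, JsonCssExtractionStrategy

async def main():
    with open("docs/apps/linkdin/schemas/company_card.json") as f:
        schema = json.load(f)
    config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://www.linkedin.com/search/results/companies/?keywords=ai",
            config=config,
        )
        print(result.extracted_content)  # JSON array of company cards

asyncio.run(main())
```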


--------------------------------------------------------------------------------
/docs/apps/linkdin/schemas/people_card.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "LinkedIn People Profile Card",
 3 |   "baseSelector": "li.org-people-profile-card__profile-card-spacing",
 4 |   "baseFields": [],
 5 |   "fields": [
 6 |     {
 7 |       "name": "profile_url",
 8 |       "selector": "div.artdeco-entity-lockup__title a[data-test-app-aware-link]",
 9 |       "type": "attribute",
10 |       "attribute": "href"
11 |     },
12 |     {
13 |       "name": "avatar_url",
14 |       "selector": "div.artdeco-entity-lockup__image img",
15 |       "type": "attribute",
16 |       "attribute": "src"
17 |     },
18 |     {
19 |       "name": "name",
20 |       "selector": "div.artdeco-entity-lockup__title a div.lt-line-clamp--single-line",
21 |       "type": "text"
22 |     },
23 |     {
24 |       "name": "headline",
25 |       "selector": "div.artdeco-entity-lockup__subtitle div.lt-line-clamp--multi-line",
26 |       "type": "text"
27 |     },
28 |     {
29 |       "name": "followers",
30 |       "selector": "span.text-align-center span.lt-line-clamp--multi-line",
31 |       "type": "regex",
32 |       "pattern": "(\\d+)"
33 |     },
34 |     {
35 |       "name": "connection_degree",
36 |       "selector": "span.artdeco-entity-lockup__degree",
37 |       "type": "regex",
38 |       "pattern": "(\\d+\\w+)"
39 |     }
40 |   ]
41 | }


--------------------------------------------------------------------------------
/docs/apps/linkdin/templates/ai.js:
--------------------------------------------------------------------------------
 1 | // ==== File: ai.js ====
 2 | 
 3 | class ApiHandler {
 4 |     constructor(apiKey = null) {
 5 |       this.apiKey = apiKey || localStorage.getItem("openai_api_key") || "";
 6 |       console.log("ApiHandler ready");
 7 |     }
 8 |   
 9 |     setApiKey(k) {
10 |       this.apiKey = k.trim();
11 |       if (this.apiKey) localStorage.setItem("openai_api_key", this.apiKey);
12 |     }
13 |   
14 |     async *chatStream(messages, {model = "gpt-4o", temperature = 0.7} = {}) {
15 |       if (!this.apiKey) throw new Error("OpenAI API key missing");
16 |       const payload = {model, messages, stream: true, max_tokens: 1024};
17 |       const controller = new AbortController();
18 |   
19 |       const res = await fetch("https://api.openai.com/v1/chat/completions", {
20 |         method: "POST",
21 |         headers: {
22 |           "Content-Type": "application/json",
23 |           Authorization: `Bearer ${this.apiKey}`,
24 |         },
25 |         body: JSON.stringify(payload),
26 |         signal: controller.signal,
27 |       });
28 |       if (!res.ok) throw new Error(`OpenAI: ${res.statusText}`);
29 |       const reader = res.body.getReader();
30 |       const dec = new TextDecoder();
31 |   
32 |       let buf = "";
33 |       while (true) {
34 |         const {done, value} = await reader.read();
35 |         if (done) break;
36 |         buf += dec.decode(value, {stream: true});
37 |         const lines = buf.split("\n");
38 |         buf = lines.pop(); // retain the trailing partial line for the next chunk
39 |         for (const line of lines) {
40 |           if (!line.startsWith("data: ")) continue;
41 |           if (line.includes("[DONE]")) return;
42 |           const json = JSON.parse(line.slice(6));
43 |           const delta = json.choices?.[0]?.delta?.content;
44 |           if (delta) yield delta;
45 |         }
46 |       }
47 |     }
48 |   }
49 |   
50 |   window.API = new ApiHandler();
51 |   


--------------------------------------------------------------------------------
/docs/assets/pitch-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/assets/pitch-dark.png


--------------------------------------------------------------------------------
/docs/assets/powered-by-dark.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
 2 |   <!-- Dark Theme -->
 3 |   <g>
 4 |     <defs>
 5 |       <pattern id="halftoneDark" width="4" height="4" patternUnits="userSpaceOnUse">
 6 |         <circle cx="2" cy="2" r="1" fill="#eee" opacity="0.1"/>
 7 |       </pattern>
 8 |       <pattern id="halftoneTextDark" width="3" height="3" patternUnits="userSpaceOnUse">
 9 |         <circle cx="1.5" cy="1.5" r="2" fill="#aaa" opacity="0.2"/>
10 |       </pattern>
11 |     </defs>
12 |     <!-- White border - added as outer rectangle -->
13 |     <rect width="120" height="35" rx="5" fill="#111"/>
14 |     <!-- Dark background slightly smaller to show thicker border -->
15 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="#1a1a1a"/>
16 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneDark)"/>
17 |     
18 |     <!-- Logo with halftone -->
19 |     <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#eee" stroke-width="2"/>
20 |     <path d="M18 17.5 L27 17.5" stroke="#eee" stroke-width="2"/>
21 |     <circle cx="22.5" cy="17.5" r="2" fill="#eee"/>
22 |     
23 |     <text x="40" y="23" fill="#eee" font-family="Arial, sans-serif" font-weight="500" font-size="14">Crawl4AI</text>
24 |   </g>
25 | </svg>


--------------------------------------------------------------------------------
/docs/assets/powered-by-disco.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
 2 |   <g>
 3 |     <defs>
 4 |       <pattern id="cyberdots" width="4" height="4" patternUnits="userSpaceOnUse">
 5 |         <circle cx="2" cy="2" r="1">
 6 |           <animate attributeName="fill" 
 7 |                    values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4" 
 8 |                    dur="6s" 
 9 |                    repeatCount="indefinite"/>
10 |           <animate attributeName="opacity" 
11 |                    values="0.2;0.4;0.2" 
12 |                    dur="4s" 
13 |                    repeatCount="indefinite"/>
14 |         </circle>
15 |       </pattern>
16 |       <filter id="neonGlow" x="-20%" y="-20%" width="140%" height="140%">
17 |         <feGaussianBlur stdDeviation="1" result="blur"/>
18 |         <feFlood flood-color="#FF2EC4" flood-opacity="0.2">
19 |           <animate attributeName="flood-color"
20 |                    values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
21 |                    dur="8s"
22 |                    repeatCount="indefinite"/>
23 |         </feFlood>
24 |         <feComposite in2="blur" operator="in"/>
25 |         <feMerge>
26 |           <feMergeNode/>
27 |           <feMergeNode in="SourceGraphic"/>
28 |         </feMerge>
29 |       </filter>
30 |     </defs>
31 |     
32 |     <rect width="120" height="35" rx="5" fill="#0A0A0F"/>
33 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="#16161E"/>
34 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#cyberdots)"/>
35 |     
36 |     <!-- Logo with animated neon -->
37 |     <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)">
38 |       <animate attributeName="stroke"
39 |                values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
40 |                dur="8s"
41 |                repeatCount="indefinite"/>
42 |     </path>
43 |     <path d="M18 17.5 L27 17.5" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)">
44 |       <animate attributeName="stroke"
45 |                values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
46 |                dur="8s"
47 |                repeatCount="indefinite"/>
48 |     </path>
49 |     <circle cx="22.5" cy="17.5" r="2" fill="#0BC5EA">
50 |       <animate attributeName="fill" 
51 |                values="#0BC5EA;#FF2EC4;#8B5CF6;#0BC5EA" 
52 |                dur="8s" 
53 |                repeatCount="indefinite"/>
54 |     </circle>
55 |     
56 |     <text x="40" y="23" font-family="Arial, sans-serif" font-weight="500" font-size="14" filter="url(#neonGlow)">
57 |       <animate attributeName="fill"
58 |                values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
59 |                dur="8s"
60 |                repeatCount="indefinite"/>
61 |       Crawl4AI
62 |     </text>
63 |   </g>
64 | </svg>


--------------------------------------------------------------------------------
/docs/assets/powered-by-light.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
 2 |   <g>
 3 |     <defs>
 4 |       <pattern id="halftoneLight" width="4" height="4" patternUnits="userSpaceOnUse">
 5 |         <circle cx="2" cy="2" r="1" fill="#111" opacity="0.1"/>
 6 |       </pattern>
 7 |     </defs>
 8 |     <!-- Dark border -->
 9 |     <rect width="120" height="35" rx="5" fill="#DDD"/>
10 |     <!-- Light background -->
11 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="#fff"/>
12 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneLight)"/>
13 |     
14 |     <!-- Logo -->
15 |     <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#111" stroke-width="2"/>
16 |     <path d="M18 17.5 L27 17.5" stroke="#111" stroke-width="2"/>
17 |     <circle cx="22.5" cy="17.5" r="2" fill="#111"/>
18 |     
19 |     <text x="40" y="23" fill="#111" font-family="Arial, sans-serif" font-weight="500" font-size="14">Crawl4AI</text>
20 |   </g>
21 | </svg>


--------------------------------------------------------------------------------
/docs/assets/powered-by-night.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
 2 |   <g>
 3 |     <defs>
 4 |       <pattern id="halftoneDark" width="4" height="4" patternUnits="userSpaceOnUse">
 5 |         <circle cx="2" cy="2" r="1" fill="#8B5CF6" opacity="0.1"/>
 6 |       </pattern>
 7 |       <filter id="neonGlow" x="-20%" y="-20%" width="140%" height="140%">
 8 |         <feGaussianBlur stdDeviation="1" result="blur"/>
 9 |         <feFlood flood-color="#8B5CF6" flood-opacity="0.2"/>
10 |         <feComposite in2="blur" operator="in"/>
11 |         <feMerge>
12 |           <feMergeNode/>
13 |           <feMergeNode in="SourceGraphic"/>
14 |         </feMerge>
15 |       </filter>
16 |     </defs>
17 |     <rect width="120" height="35" rx="5" fill="#0A0A0F"/>
18 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="#16161E"/>
19 |     <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneDark)"/>
20 |     
21 |     <!-- Logo with neon glow -->
22 |     <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)"/>
23 |     <path d="M18 17.5 L27 17.5" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)"/>
24 |     <circle cx="22.5" cy="17.5" r="2" fill="#8B5CF6"/>
25 |     
26 |     <text x="40" y="23" fill="#fff" font-family="Arial, sans-serif" font-weight="500" font-size="14" filter="url(#neonGlow)">Crawl4AI</text>
27 |   </g>
28 | </svg>


--------------------------------------------------------------------------------
/docs/blog/release-v0.7.1.md:
--------------------------------------------------------------------------------
 1 | # 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
 2 | 
 3 | *July 17, 2025 • 2 min read*
 4 | 
 5 | ---
 6 | 
 7 | A small maintenance release that removes unused code and improves documentation.
 8 | 
 9 | ## 🎯 What's Changed
10 | 
11 | - **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
12 | - **Updated documentation** with better examples and parameter explanations
13 | - **Fixed virtual scroll configuration** examples in docs
14 | 
15 | ## 🧹 Code Cleanup
16 | 
 17 | Removed the `StealthConfig` import and configuration, which weren't used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
18 | 
19 | ```python
20 | # Removed unused code:
21 | from playwright_stealth import StealthConfig
22 | stealth_config = StealthConfig(...)  # This was never used
23 | ```
24 | 
25 | ## 📖 Documentation Updates
26 | 
27 | - Fixed adaptive crawling parameter examples
28 | - Updated session management documentation
29 | - Corrected virtual scroll configuration examples
30 | 
31 | ## 🚀 Installation
32 | 
33 | ```bash
34 | pip install crawl4ai==0.7.1
35 | ```
36 | 
37 | No breaking changes - upgrade directly from v0.7.0.
38 | 
39 | ---
40 | 
41 | Questions? Issues? 
42 | - GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
43 | - Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)


--------------------------------------------------------------------------------
/docs/examples/assets/audio.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/audio.mp3


--------------------------------------------------------------------------------
/docs/examples/assets/basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/basic.png


--------------------------------------------------------------------------------
/docs/examples/assets/cosine_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/cosine_extraction.png


--------------------------------------------------------------------------------
/docs/examples/assets/css_js.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/css_js.png


--------------------------------------------------------------------------------
/docs/examples/assets/css_selector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/css_selector.png


--------------------------------------------------------------------------------
/docs/examples/assets/exec_script.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/exec_script.png


--------------------------------------------------------------------------------
/docs/examples/assets/instagram_grid_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/instagram_grid_result.png


--------------------------------------------------------------------------------
/docs/examples/assets/llm_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/llm_extraction.png


--------------------------------------------------------------------------------
/docs/examples/assets/semantic_extraction_cosine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/semantic_extraction_cosine.png


--------------------------------------------------------------------------------
/docs/examples/assets/semantic_extraction_llm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/semantic_extraction_llm.png


--------------------------------------------------------------------------------
/docs/examples/async_webcrawler_multiple_urls_example.py:
--------------------------------------------------------------------------------
 1 | # File: async_webcrawler_multiple_urls_example.py
 2 | import os, sys
 3 | 
 4 | # append 2 parent directories to sys.path to import crawl4ai
 5 | parent_dir = os.path.dirname(
 6 |     os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 7 | )
 8 | sys.path.append(parent_dir)
 9 | 
10 | import asyncio
11 | from crawl4ai import AsyncWebCrawler
12 | 
13 | 
14 | async def main():
15 |     # Initialize the AsyncWebCrawler
16 |     async with AsyncWebCrawler(verbose=True) as crawler:
17 |         # List of URLs to crawl
18 |         urls = [
19 |             "https://example.com",
20 |             "https://python.org",
21 |             "https://github.com",
22 |             "https://stackoverflow.com",
23 |             "https://news.ycombinator.com",
24 |         ]
25 | 
26 |         # Set up crawling parameters
27 |         word_count_threshold = 100
28 | 
29 |         # Run the crawling process for multiple URLs
30 |         results = await crawler.arun_many(
31 |             urls=urls,
32 |             word_count_threshold=word_count_threshold,
33 |             bypass_cache=True,
34 |             verbose=True,
35 |         )
36 | 
37 |         # Process the results
38 |         for result in results:
39 |             if result.success:
40 |                 print(f"Successfully crawled: {result.url}")
41 |                 print(f"Title: {result.metadata.get('title', 'N/A')}")
42 |                 print(f"Word count: {len(result.markdown.split())}")
43 |                 print(
44 |                     f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
45 |                 )
46 |                 print(f"Number of images: {len(result.media.get('images', []))}")
47 |                 print("---")
48 |             else:
49 |                 print(f"Failed to crawl: {result.url}")
50 |                 print(f"Error: {result.error_message}")
51 |                 print("---")
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     asyncio.run(main())
56 | 


--------------------------------------------------------------------------------
/docs/examples/c4a_script/amazon_example/generated_product_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Amazon Product Search Results",
 3 |   "baseSelector": "div[data-component-type='s-impression-counter']",
 4 |   "fields": [
 5 |     {
 6 |       "name": "title",
 7 |       "selector": "h2.a-size-base-plus.a-spacing-none.a-color-base.a-text-normal span",
 8 |       "type": "text"
 9 |     },
10 |     {
11 |       "name": "price",
12 |       "selector": "span.a-price > span.a-offscreen",
13 |       "type": "text"
14 |     },
15 |     {
16 |       "name": "rating",
17 |       "selector": "i.a-icon-star-small span.a-icon-alt",
18 |       "type": "text"
19 |     },
20 |     {
21 |       "name": "number_of_reviews",
22 |       "selector": "a.a-link-normal.s-underline-text span.a-size-base",
23 |       "type": "text"
24 |     },
25 |     {
26 |       "name": "delivery_info",
27 |       "selector": "div[data-cy='delivery-recipe'] span.a-color-base",
28 |       "type": "text"
29 |     },
30 |     {
31 |       "name": "product_url",
32 |       "selector": "a.a-link-normal.s-no-outline",
33 |       "type": "attribute",
34 |       "attribute": "href"
35 |     },
36 |     {
37 |       "name": "sponsored",
38 |       "selector": "span.puis-label-popover-default span.a-color-secondary",
39 |       "type": "text"
40 |     },
41 |     {
42 |       "name": "small_business_badge",
43 |       "selector": "span.a-size-base.a-color-base",
44 |       "type": "text"
45 |     }
46 |   ]
47 | }


--------------------------------------------------------------------------------
/docs/examples/c4a_script/amazon_example/generated_search_script.js:
--------------------------------------------------------------------------------
1 | const searchBox = document.querySelector('#twotabsearchtextbox');
2 | const searchButton = document.querySelector('#nav-search-submit-button');
3 | 
4 | if (searchBox && searchButton) {
5 |   searchBox.focus();
6 |   searchBox.value = '';
7 |   searchBox.value = 'r2d2';
8 |   searchButton.click();
9 | }


--------------------------------------------------------------------------------
/docs/examples/c4a_script/c4a_script_hello_world.py:
--------------------------------------------------------------------------------
 1 | """
 2 | C4A-Script Hello World
 3 | A concise example showing how to use the C4A-Script compiler
 4 | """
 5 | 
 6 | from crawl4ai.script.c4a_compile import compile
 7 | 
 8 | # Define your C4A-Script
 9 | script = """
10 | GO https://example.com
11 | WAIT `#content` 5
12 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
13 | CLICK `button.submit`
14 | """
15 | 
16 | # Compile the script
17 | result = compile(script)
18 | 
19 | # Check if compilation was successful
20 | if result.success:
21 |     # Success! Use the generated JavaScript
22 |     print("✅ Compilation successful!")
23 |     print(f"Generated {len(result.js_code)} JavaScript statements:\n")
24 |     
25 |     for i, js in enumerate(result.js_code, 1):
26 |         print(f"{i}. {js}\n")
27 |     
28 |     # In real usage, you'd pass result.js_code to Crawl4AI:
29 |     # config = CrawlerRunConfig(js_code=result.js_code)
30 |     
31 | else:
32 |     # Error! Handle the compilation error
33 |     print("❌ Compilation failed!")
34 |     
35 |     # Get the first error (there might be multiple)
36 |     error = result.first_error
37 |     
38 |     # Show error details
39 |     print(f"Error at line {error.line}, column {error.column}")
40 |     print(f"Message: {error.message}")
41 |     
42 |     # Show the problematic code
43 |     print(f"\nCode: {error.source_line}")
44 |     print(" " * (6 + error.column) + "^")
45 |     
46 |     # Show suggestions if available
47 |     if error.suggestions:
48 |         print("\n💡 How to fix:")
49 |         for suggestion in error.suggestions:
50 |             print(f"   {suggestion.message}")
51 |     
52 |     # For debugging or logging, you can also get JSON
53 |     # error_json = result.to_json()
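
To close the loop on the comment above, a minimal sketch of feeding the compiled statements into a crawl (assuming the compilation succeeded):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def run(js_statements):
    config = CrawlerRunConfig(js_code=js_statements)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        print(result.success)

# asyncio.run(run(result.js_code))
```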


--------------------------------------------------------------------------------
/docs/examples/c4a_script/c4a_script_hello_world_error.py:
--------------------------------------------------------------------------------
 1 | """
 2 | C4A-Script Hello World - Error Example
 3 | Shows how error handling works
 4 | """
 5 | 
 6 | from crawl4ai.script.c4a_compile import compile
 7 | 
 8 | # Define a script with an error (missing THEN)
 9 | script = """
10 | GO https://example.com
11 | WAIT `#content` 5
12 | IF (EXISTS `.cookie-banner`) CLICK `.accept`
13 | CLICK `button.submit`
14 | """
15 | 
16 | # Compile the script
17 | result = compile(script)
18 | 
19 | # Check if compilation was successful
20 | if result.success:
21 |     # Success! Use the generated JavaScript
22 |     print("✅ Compilation successful!")
23 |     print(f"Generated {len(result.js_code)} JavaScript statements:\n")
24 |     
25 |     for i, js in enumerate(result.js_code, 1):
26 |         print(f"{i}. {js}\n")
27 |     
28 |     # In real usage, you'd pass result.js_code to Crawl4AI:
29 |     # config = CrawlerRunConfig(js_code=result.js_code)
30 |     
31 | else:
32 |     # Error! Handle the compilation error
33 |     print("❌ Compilation failed!")
34 |     
35 |     # Get the first error (there might be multiple)
36 |     error = result.first_error
37 |     
38 |     # Show error details
39 |     print(f"Error at line {error.line}, column {error.column}")
40 |     print(f"Message: {error.message}")
41 |     
42 |     # Show the problematic code
43 |     print(f"\nCode: {error.source_line}")
44 |     print(" " * (6 + error.column) + "^")
45 |     
46 |     # Show suggestions if available
47 |     if error.suggestions:
48 |         print("\n💡 How to fix:")
49 |         for suggestion in error.suggestions:
50 |             print(f"   {suggestion.message}")
51 |     
52 |     # For debugging or logging, you can also get JSON
53 |     # error_json = result.to_json()


--------------------------------------------------------------------------------
/docs/examples/c4a_script/generate_script_hello_world.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Hello World Example: LLM-Generated C4A-Script
 4 | 
 5 | This example shows how to use the new generate_script() function to automatically
 6 | create C4A-Script automation from natural language descriptions and HTML.
 7 | """
 8 | 
 9 | from crawl4ai.script.c4a_compile import C4ACompiler
10 | 
11 | def main():
12 |     print("🤖 C4A-Script Generation Hello World")
13 |     print("=" * 50)
14 |     
15 |     # Example 1: Simple login form
16 |     html = """
17 |     <html>
18 |     <body>
19 |         <form id="login">
20 |             <input id="email" type="email" placeholder="Email">
21 |             <input id="password" type="password" placeholder="Password">
22 |             <button id="submit">Login</button>
23 |         </form>
24 |     </body>
25 |     </html>
26 |     """
27 |     
28 |     goal = "Fill in email 'user@example.com', password 'secret123', and submit the form"
29 |     
30 |     print("📝 Goal:", goal)
31 |     print("🌐 HTML: Simple login form")
32 |     print()
33 |     
34 |     # Generate C4A-Script
35 |     print("🔧 Generated C4A-Script:")
36 |     print("-" * 30)
37 |     c4a_script = C4ACompiler.generate_script(
38 |         html=html,
39 |         query=goal,
40 |         mode="c4a"
41 |     )
42 |     print(c4a_script)
43 |     print()
44 |     
45 |     # Generate JavaScript
46 |     print("🔧 Generated JavaScript:")
47 |     print("-" * 30)
48 |     js_script = C4ACompiler.generate_script(
49 |         html=html,
50 |         query=goal,
51 |         mode="js"
52 |     )
53 |     print(js_script)
54 |     print()
55 |     
56 |     # Example 2: Simple button click
57 |     html2 = """
58 |     <html>
59 |     <body>
60 |         <div class="content">
61 |             <h1>Welcome!</h1>
62 |             <button id="start-btn" class="primary">Get Started</button>
63 |         </div>
64 |     </body>
65 |     </html>
66 |     """
67 |     
68 |     goal2 = "Click the 'Get Started' button"
69 |     
70 |     print("=" * 50)
71 |     print("📝 Goal:", goal2)
72 |     print("🌐 HTML: Simple button")
73 |     print()
74 |     
75 |     print("🔧 Generated C4A-Script:")
76 |     print("-" * 30)
77 |     c4a_script2 = C4ACompiler.generate_script(
78 |         html=html2,
79 |         query=goal2,
80 |         mode="c4a"
81 |     )
82 |     print(c4a_script2)
83 |     print()
84 |     
85 |     print("✅ Done! The LLM automatically converted natural language goals")
86 |     print("   into executable automation scripts.")
87 | 
88 | if __name__ == "__main__":
89 |     main()


--------------------------------------------------------------------------------
/docs/examples/c4a_script/github_search/generated_result_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "GitHub Repository Cards",
 3 |   "baseSelector": "div.Box-sc-g0xbh4-0.iwUbcA",
 4 |   "fields": [
 5 |     {
 6 |       "name": "repository_name",
 7 |       "selector": "div.search-title a span",
 8 |       "type": "text",
 9 |       "transform": "strip"
10 |     },
11 |     {
12 |       "name": "repository_owner",
13 |       "selector": "div.search-title a span",
14 |       "type": "text",
15 |       "transform": "split",
16 |       "pattern": "/"
17 |     },
18 |     {
19 |       "name": "repository_url",
20 |       "selector": "div.search-title a",
21 |       "type": "attribute",
22 |       "attribute": "href",
23 |       "transform": "prepend",
24 |       "pattern": "https://github.com"
25 |     },
26 |     {
27 |       "name": "description",
28 |       "selector": "div.dcdlju span",
29 |       "type": "text"
30 |     },
31 |     {
32 |       "name": "primary_language",
33 |       "selector": "ul.bZkODq li span[aria-label]",
34 |       "type": "text"
35 |     },
36 |     {
37 |       "name": "star_count",
38 |       "selector": "ul.bZkODq li a[href*='stargazers'] span",
39 |       "type": "text",
40 |       "transform": "strip"
41 |     },
42 |     {
43 |       "name": "topics",
44 |       "type": "list",
45 |       "selector": "div.jgRnBg div a",
46 |       "fields": [
47 |         {
48 |           "name": "topic_name",
49 |           "selector": "a",
50 |           "type": "text"
51 |         }
52 |       ]
53 |     },
54 |     {
55 |       "name": "last_updated",
56 |       "selector": "ul.bZkODq li span[title]",
57 |       "type": "text"
58 |     },
59 |     {
60 |       "name": "has_sponsor_button",
61 |       "selector": "button[aria-label*='Sponsor']",
62 |       "type": "text",
63 |       "transform": "exists"
64 |     }
65 |   ]
66 | }


--------------------------------------------------------------------------------
/docs/examples/c4a_script/github_search/generated_search_script.js:
--------------------------------------------------------------------------------
 1 | (async () => {
 2 |   const waitForElement = (selector, timeout = 10000) => new Promise((resolve, reject) => {
 3 |     const el = document.querySelector(selector);
 4 |     if (el) return resolve(el);
 5 |     const observer = new MutationObserver(() => {
 6 |       const el = document.querySelector(selector);
 7 |       if (el) {
 8 |         observer.disconnect();
 9 |         resolve(el);
10 |       }
11 |     });
12 |     observer.observe(document.body, { childList: true, subtree: true });
13 |     setTimeout(() => {
14 |       observer.disconnect();
15 |       reject(new Error(`Timeout waiting for ${selector}`));
16 |     }, timeout);
17 |   });
18 | 
19 |   try {
20 |     const searchInput = await waitForElement('#adv_code_search input[type="text"]');
21 |     searchInput.value = 'crawl4AI';
22 |     searchInput.dispatchEvent(new Event('input', { bubbles: true }));
23 | 
24 |     const languageSelect = await waitForElement('#search_language');
25 |     languageSelect.value = 'Python';
26 |     languageSelect.dispatchEvent(new Event('change', { bubbles: true }));
27 | 
28 |     const starsInput = await waitForElement('#search_stars');
29 |     starsInput.value = '>10000';
30 |     starsInput.dispatchEvent(new Event('input', { bubbles: true }));
31 | 
32 |     const searchButton = await waitForElement('#adv_code_search button[type="submit"]');
33 |     searchButton.click();
34 | 
35 |     await waitForElement('.codesearch-results, #search-results');
36 |   } catch (e) {
37 |     console.error('Search script failed:', e.message);
38 |   }
39 | })();


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/add_to_cart.c4a:
--------------------------------------------------------------------------------
1 | GO https://store.example.com/product/laptop
2 | WAIT `.product-details` 8
3 | CLICK `button.add-to-cart`
4 | WAIT `.cart-notification` 3
5 | CLICK `.cart-icon`
6 | WAIT `.checkout-btn` 5
7 | CLICK `.checkout-btn`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/advanced_control_flow.c4a:
--------------------------------------------------------------------------------
 1 | # Advanced control flow with IF, EXISTS, and REPEAT
 2 | 
 3 | # Define reusable procedures
 4 | PROC handle_cookie_banner
 5 |   IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`
 6 |   IF (EXISTS `.privacy-notice`) THEN CLICK `.dismiss-privacy`
 7 | ENDPROC
 8 | 
 9 | PROC scroll_to_load
10 |   SCROLL DOWN 500
11 |   WAIT 0.5
12 | ENDPROC
13 | 
14 | PROC try_login
15 |   CLICK `#email`
16 |   TYPE "user@example.com"
17 |   CLICK `#password`
18 |   TYPE "secure123"
19 |   CLICK `button[type="submit"]`
20 |   WAIT 2
21 | ENDPROC
22 | 
23 | # Main script
24 | GO https://example.com
25 | WAIT 2
26 | 
27 | # Handle popups
28 | handle_cookie_banner
29 | 
30 | # Conditional navigation based on login state
31 | IF (EXISTS `.user-menu`) THEN CLICK `.dashboard-link` ELSE try_login
32 | 
33 | # Repeat scrolling based on content count
34 | REPEAT (scroll_to_load, 5)
35 | 
36 | # Load more content while button exists
37 | REPEAT (CLICK `.load-more`, `document.querySelector('.load-more') && !document.querySelector('.no-more-content')`)
38 | 
39 | # Process items conditionally
40 | IF (`document.querySelectorAll('.item').length > 10`) THEN EVAL `console.log('Found ' + document.querySelectorAll('.item').length + ' items')`
41 | 
42 | # Complex condition with viewport check
43 | IF (`window.innerWidth < 768 && document.querySelector('.mobile-menu')`) THEN CLICK `.mobile-menu-toggle`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/conditional_login.c4a:
--------------------------------------------------------------------------------
1 | GO https://myapp.com
2 | WAIT 2
3 | IF (EXISTS `.user-avatar`) THEN CLICK `.logout` ELSE CLICK `.login`
4 | WAIT `#auth-form` 5
5 | IF (EXISTS `#auth-form`) THEN TYPE "user@example.com"
6 | IF (EXISTS `#auth-form`) THEN PRESS Tab
7 | IF (EXISTS `#auth-form`) THEN TYPE "password123"
8 | IF (EXISTS `#auth-form`) THEN CLICK `button[type="submit"]`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/data_extraction.c4a:
--------------------------------------------------------------------------------
 1 | # Data extraction example
 2 | # Scrapes product information from an e-commerce site
 3 | 
 4 | # Navigate to products page
 5 | GO https://shop.example.com/products
 6 | WAIT `.product-list` 10
 7 | 
 8 | # Scroll to load lazy-loaded content
 9 | SCROLL DOWN 500
10 | WAIT 1
11 | SCROLL DOWN 500
12 | WAIT 1
13 | SCROLL DOWN 500
14 | WAIT 2
15 | 
16 | # Extract product data
17 | EVAL `
18 |   // Extract all product information
19 |   const products = Array.from(document.querySelectorAll('.product-card')).map((card, index) => {
20 |     return {
21 |       id: index + 1,
22 |       name: card.querySelector('.product-title')?.textContent?.trim() || 'N/A',
23 |       price: card.querySelector('.price')?.textContent?.trim() || 'N/A',
24 |       rating: card.querySelector('.rating')?.textContent?.trim() || 'N/A',
25 |       availability: card.querySelector('.in-stock') ? 'In Stock' : 'Out of Stock',
26 |       image: card.querySelector('img')?.src || 'N/A'
27 |     };
28 |   });
29 | 
30 |   // Log results
31 |   console.log('=== Product Extraction Results ===');
32 |   console.log('Total products found:', products.length);
33 |   console.log(JSON.stringify(products, null, 2));
34 |   
35 |   // Save to localStorage for retrieval
36 |   localStorage.setItem('scraped_products', JSON.stringify(products));
37 | `
38 | 
39 | # Optional: Click on first product for details
40 | CLICK `.product-card:first-child`
41 | WAIT `.product-details` 5
42 | 
43 | # Extract detailed information
44 | EVAL `
45 |   const details = {
46 |     description: document.querySelector('.product-description')?.textContent?.trim(),
47 |     specifications: Array.from(document.querySelectorAll('.spec-item')).map(spec => ({
48 |       label: spec.querySelector('.spec-label')?.textContent,
49 |       value: spec.querySelector('.spec-value')?.textContent
50 |     })),
51 |     reviews: document.querySelector('.review-count')?.textContent
52 |   };
53 |   
54 |   console.log('=== Product Details ===');
55 |   console.log(JSON.stringify(details, null, 2));
56 | `


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/fill_contact.c4a:
--------------------------------------------------------------------------------
1 | GO https://company.com/contact
2 | WAIT `form#contact` 10
3 | TYPE "John Smith"
4 | PRESS Tab
5 | TYPE "john@email.com"
6 | PRESS Tab
7 | TYPE "Need help with my order"
8 | CLICK `button[type="submit"]`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/load_more_content.c4a:
--------------------------------------------------------------------------------
1 | GO https://news.example.com
2 | WAIT `.article-list` 5
3 | REPEAT (SCROLL DOWN 500, 3)
4 | WAIT 1
5 | REPEAT (CLICK `.load-more`, `document.querySelector('.load-more') !== null`)
6 | WAIT 2
7 | IF (`document.querySelectorAll('.article').length > 20`) THEN EVAL `console.log('Loaded enough articles')`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/login_flow.c4a:
--------------------------------------------------------------------------------
 1 | # Login flow with error handling
 2 | # Demonstrates procedures, variables, and conditional checks
 3 | 
 4 | # Define login procedure
 5 | PROC perform_login
 6 |   CLICK `input#email`
 7 |   TYPE $email
 8 |   CLICK `input#password`
 9 |   TYPE $password
10 |   CLICK `button.login-submit`
11 | ENDPROC
12 | 
13 | # Set credentials
14 | SET email = "user@example.com"
15 | SET password = "securePassword123"
16 | 
17 | # Navigate to login page
18 | GO https://app.example.com/login
19 | WAIT `.login-container` 15
20 | 
21 | # Attempt login
22 | perform_login
23 | 
24 | # Wait for page to load
25 | WAIT 3
26 | 
27 | # Check if login was successful
28 | EVAL `
29 |   if (document.querySelector('.dashboard')) {
30 |     console.log('Login successful - on dashboard');
31 |   } else if (document.querySelector('.error-message')) {
32 |     console.log('Login failed:', document.querySelector('.error-message').textContent);
33 |   } else {
34 |     console.log('Unknown state after login');
35 |   }
36 | `


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/navigate_tabs.c4a:
--------------------------------------------------------------------------------
1 | GO https://app.example.com
2 | WAIT `.nav-menu` 8
3 | CLICK `a[href="/products"]`
4 | WAIT 2
5 | CLICK `a[href="/about"]`
6 | WAIT 2
7 | BACK
8 | WAIT 1


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/quick_login.c4a:
--------------------------------------------------------------------------------
1 | GO https://myapp.com/login
2 | WAIT `input#email` 5
3 | CLICK `input#email`
4 | TYPE "user@example.com"
5 | PRESS Tab
6 | TYPE "password123"
7 | CLICK `button.login-btn`
8 | WAIT `.dashboard` 10


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/responsive_actions.c4a:
--------------------------------------------------------------------------------
1 | GO https://responsive.site.com
2 | WAIT 2
3 | IF (`window.innerWidth < 768`) THEN CLICK `.mobile-menu`
4 | IF (`window.innerWidth < 768`) THEN WAIT `.mobile-nav` 3
5 | IF (`window.innerWidth >= 768`) THEN CLICK `.desktop-menu li:nth-child(2)`
6 | REPEAT (CLICK `.next-slide`, 5)
7 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/scroll_and_click.c4a:
--------------------------------------------------------------------------------
1 | GO https://news.site.com
2 | WAIT `.article-list` 10
3 | SCROLL DOWN 500
4 | WAIT 1
5 | SCROLL DOWN 500
6 | WAIT 1
7 | CLICK `.article:nth-child(5)`
8 | WAIT `.article-content` 5


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/search_product.c4a:
--------------------------------------------------------------------------------
1 | GO https://shop.example.com
2 | WAIT `.search-bar` 10
3 | CLICK `.search-bar`
4 | TYPE "wireless headphones"
5 | PRESS Enter
6 | WAIT `.results` 5
7 | CLICK `.product-card:first-child`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/simple_form.c4a:
--------------------------------------------------------------------------------
 1 | # Simple form submission example
 2 | # This script fills out a contact form and submits it
 3 | 
 4 | GO https://example.com/contact
 5 | WAIT `form#contact-form` 10
 6 | 
 7 | # Fill out the form fields
 8 | CLICK `input[name="name"]`
 9 | TYPE "Alice Smith"
10 | PRESS Tab
11 | TYPE "alice@example.com"
12 | PRESS Tab
13 | TYPE "I'd like to learn more about your services"
14 | 
15 | # Submit the form
16 | CLICK `button[type="submit"]`
17 | 
18 | # Wait for success message
19 | WAIT "Thank you for your message" 5


--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/smart_form_fill.c4a:
--------------------------------------------------------------------------------
 1 | PROC fill_field
 2 |   TYPE "test@example.com"
 3 |   PRESS Tab
 4 | ENDPROC
 5 | 
 6 | GO https://forms.example.com
 7 | WAIT `form` 5
 8 | IF (EXISTS `input[type="email"]`) THEN CLICK `input[type="email"]`
 9 | IF (EXISTS `input[type="email"]`) THEN fill_field
10 | REPEAT (PRESS Tab, `document.activeElement.type !== 'submit'`)
11 | CLICK `button[type="submit"]`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Bold.woff2


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Italic.woff2


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Regular.woff2


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/blockly-demo.c4a:
--------------------------------------------------------------------------------
 1 | # Demo: Login Flow with Blockly
 2 | # This script can be created visually using Blockly blocks
 3 | 
 4 | GO https://example.com/login
 5 | WAIT `#login-form` 5
 6 | 
 7 | # Check if already logged in
 8 | IF (EXISTS `.user-avatar`) THEN GO https://example.com/dashboard
 9 | 
10 | # Fill login form
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | CLICK `#password`
14 | TYPE "password123"
15 | 
16 | # Submit form
17 | CLICK `button[type="submit"]`
18 | WAIT `.dashboard` 10
19 | 
20 | # Success message
21 | EVAL `console.log('Login successful!')`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/requirements.txt:
--------------------------------------------------------------------------------
1 | flask>=2.3.0
2 | flask-cors>=4.0.0


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/01-basic-interaction.c4a:
--------------------------------------------------------------------------------
 1 | # Basic Page Interaction
 2 | # This script demonstrates basic C4A commands
 3 | 
 4 | # Navigate to the playground
 5 | GO http://127.0.0.1:8080/playground/
 6 | 
 7 | # Wait for page to load
 8 | WAIT `body` 2
 9 | 
10 | # Handle cookie banner if present
11 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
12 | 
13 | # Close newsletter popup if it appears
14 | WAIT 3
15 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
16 | 
17 | # Click the start tutorial button
18 | CLICK `#start-tutorial`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/02-login-flow.c4a:
--------------------------------------------------------------------------------
 1 | # Complete Login Flow
 2 | # Demonstrates form interaction and authentication
 3 | 
 4 | # Click login button
 5 | CLICK `#login-btn`
 6 | 
 7 | # Wait for login modal
 8 | WAIT `.login-form` 3
 9 | 
10 | # Fill in credentials
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | 
14 | CLICK `#password`
15 | TYPE "demo123"
16 | 
17 | # Check remember me
18 | IF (EXISTS `#remember-me`) THEN CLICK `#remember-me`
19 | 
20 | # Submit form
21 | CLICK `button[type="submit"]`
22 | 
23 | # Wait for success
24 | WAIT `.welcome-message` 5
25 | 
26 | # Verify login succeeded
27 | IF (EXISTS `.user-info`) THEN EVAL `console.log('✅ Login successful!')`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/03-infinite-scroll.c4a:
--------------------------------------------------------------------------------
 1 | # Infinite Scroll Product Loading
 2 | # Load all products using scroll automation
 3 | 
 4 | # Navigate to catalog
 5 | CLICK `#catalog-link`
 6 | WAIT `.product-grid` 3
 7 | 
 8 | # Switch to infinite scroll mode
 9 | CLICK `#infinite-scroll-btn`
10 | 
11 | # Define scroll procedure
12 | PROC load_more_products
13 |   # Get current product count
14 |   EVAL `window.initialCount = document.querySelectorAll('.product-card').length`
15 |   
16 |   # Scroll down
17 |   SCROLL DOWN 1000
18 |   WAIT 2
19 |   
20 |   # Check if more products loaded
21 |   EVAL `
22 |     const newCount = document.querySelectorAll('.product-card').length;
23 |     console.log('Products loaded: ' + newCount);
24 |     window.moreLoaded = newCount > window.initialCount;
25 |   `
26 | ENDPROC
27 | 
28 | # Load products until no more
29 | REPEAT (load_more_products, `window.moreLoaded !== false`)
30 | 
31 | # Final count
32 | EVAL `console.log('✅ Total products: ' + document.querySelectorAll('.product-card').length)`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/04-multi-step-form.c4a:
--------------------------------------------------------------------------------
 1 | # Multi-step Form Wizard
 2 | # Complete a complex form with multiple steps
 3 | 
 4 | # Navigate to forms section
 5 | CLICK `a[href="#forms"]`
 6 | WAIT `#survey-form` 2
 7 | 
 8 | # Step 1: Basic Information
 9 | CLICK `#full-name`
10 | TYPE "John Doe"
11 | 
12 | CLICK `#survey-email`  
13 | TYPE "john.doe@example.com"
14 | 
15 | # Go to next step
16 | CLICK `.next-step`
17 | WAIT 1
18 | 
19 | # Step 2: Select Interests
20 | # Select multiple options
21 | CLICK `#interests`
22 | CLICK `option[value="tech"]`
23 | CLICK `option[value="music"]`
24 | CLICK `option[value="travel"]`
25 | 
26 | # Continue to final step
27 | CLICK `.next-step`
28 | WAIT 1
29 | 
30 | # Step 3: Review and Submit
31 | # Verify we're on the last step
32 | IF (EXISTS `#submit-survey`) THEN EVAL `console.log('📋 On final step')`
33 | 
34 | # Submit the form
35 | CLICK `#submit-survey`
36 | 
37 | # Wait for success message
38 | WAIT `.success-message` 5
39 | 
40 | # Verify submission
41 | IF (EXISTS `.success-message`) THEN EVAL `console.log('✅ Survey submitted successfully!')`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/05-complex-workflow.c4a:
--------------------------------------------------------------------------------
 1 | # Complete E-commerce Workflow
 2 | # Login, browse products, and interact with various elements
 3 | 
 4 | # Define reusable procedures
 5 | PROC handle_popups
 6 |   IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
 7 |   IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
 8 | ENDPROC
 9 | 
10 | PROC login_user
11 |   CLICK `#login-btn`
12 |   WAIT `.login-form` 2
13 |   CLICK `#email`
14 |   TYPE "demo@example.com"
15 |   CLICK `#password`
16 |   TYPE "demo123"
17 |   CLICK `button[type="submit"]`
18 |   WAIT `.welcome-message` 5
19 | ENDPROC
20 | 
21 | PROC browse_products
22 |   # Go to catalog
23 |   CLICK `#catalog-link`
24 |   WAIT `.product-grid` 3
25 |   
26 |   # Apply filters
27 |   CLICK `.collapsible`
28 |   WAIT 0.5
29 |   CLICK `input[type="checkbox"]`
30 |   
31 |   # Load some products
32 |   SCROLL DOWN 500
33 |   WAIT 1
34 |   SCROLL DOWN 500
35 |   WAIT 1
36 | ENDPROC
37 | 
38 | # Main workflow
39 | GO http://127.0.0.1:8080/playground/
40 | WAIT `body` 2
41 | 
42 | # Handle initial popups
43 | handle_popups
44 | 
45 | # Login if not already
46 | IF (NOT EXISTS `.user-info`) THEN login_user
47 | 
48 | # Browse products
49 | browse_products
50 | 
51 | # Navigate to tabs demo
52 | CLICK `a[href="#tabs"]`
53 | WAIT `.tabs-container` 2
54 | 
55 | # Interact with tabs
56 | CLICK `button[data-tab="reviews"]`
57 | WAIT 1
58 | 
59 | # Load comments
60 | IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
61 | WAIT `.comments-section` 2
62 | 
63 | # Check specifications
64 | CLICK `button[data-tab="specs"]`
65 | WAIT 1
66 | 
67 | # Final navigation to data tables
68 | CLICK `a[href="#data"]`
69 | WAIT `.data-table` 2
70 | 
71 | # Search in table
72 | CLICK `.search-input`
73 | TYPE "User"
74 | 
75 | # Load more rows
76 | CLICK `.load-more-rows`
77 | WAIT 1
78 | 
79 | # Export data
80 | CLICK `#export-btn`
81 | 
82 | EVAL `console.log('✅ Workflow completed successfully!')`


--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/test_blockly.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Blockly Test</title>
 7 |     <style>
 8 |         body {
 9 |             margin: 0;
10 |             padding: 20px;
11 |             background: #0e0e10;
12 |             color: #e0e0e0;
13 |             font-family: monospace;
14 |         }
15 |         #blocklyDiv {
16 |             height: 600px;
17 |             width: 100%;
18 |             border: 1px solid #2a2a2c;
19 |         }
20 |         #output {
21 |             margin-top: 20px;
22 |             padding: 15px;
23 |             background: #1a1a1b;
24 |             border: 1px solid #2a2a2c;
25 |             white-space: pre-wrap;
26 |         }
27 |     </style>
28 | </head>
29 | <body>
30 |     <h1>C4A-Script Blockly Test</h1>
31 |     <div id="blocklyDiv"></div>
32 |     <div id="output">
33 |         <h3>Generated C4A-Script:</h3>
34 |         <pre id="code-output"></pre>
35 |     </div>
36 |     
37 |     <script src="https://unpkg.com/blockly/blockly.min.js"></script>
38 |     <script src="assets/c4a-blocks.js"></script>
39 |     <script>
40 |         // Simple test
41 |         const workspace = Blockly.inject('blocklyDiv', {
42 |             toolbox: `
43 |                 <xml>
44 |                     <category name="Test" colour="#1E88E5">
45 |                         <block type="c4a_go"></block>
46 |                         <block type="c4a_wait_time"></block>
47 |                         <block type="c4a_click"></block>
48 |                     </category>
49 |                 </xml>
50 |             `,
51 |             theme: Blockly.Theme.defineTheme('dark', {
52 |                 'base': Blockly.Themes.Classic,
53 |                 'componentStyles': {
54 |                     'workspaceBackgroundColour': '#0e0e10',
55 |                     'toolboxBackgroundColour': '#1a1a1b',
56 |                     'toolboxForegroundColour': '#e0e0e0',
57 |                     'flyoutBackgroundColour': '#1a1a1b',
58 |                     'flyoutForegroundColour': '#e0e0e0',
59 |                 }
60 |             })
61 |         });
62 |         
63 |         workspace.addChangeListener((event) => {
64 |             const code = Blockly.JavaScript.workspaceToCode(workspace);
65 |             document.getElementById('code-output').textContent = code;
66 |         });
67 |     </script>
68 | </body>
69 | </html>


--------------------------------------------------------------------------------
/docs/examples/chainlit.md:
--------------------------------------------------------------------------------
1 | # Welcome to Crawl4AI! 🚀🤖
2 | 
3 | Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages will be used as context.
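4 | 
5 | Below is a minimal sketch of such a pipeline, assuming Chainlit's `@cl.on_message` hook and Crawl4AI's `AsyncWebCrawler`. The URL-detection regex and the way crawled markdown is folded into the reply are illustrative choices, not a fixed API.
6 | 
7 | ```python
8 | import re
9 | 
10 | import chainlit as cl
11 | from crawl4ai import AsyncWebCrawler
12 | 
13 | URL_RE = re.compile(r"https?://\S+")
14 | 
15 | @cl.on_message
16 | async def on_message(message: cl.Message):
17 |     urls = URL_RE.findall(message.content)
18 |     context = ""
19 |     if urls:
20 |         # Crawl every URL shared in the conversation and collect its markdown
21 |         async with AsyncWebCrawler() as crawler:
22 |             for url in urls:
23 |                 result = await crawler.arun(url=url)
24 |                 if result.success:
25 |                     context += f"\n\n# {url}\n{result.markdown.raw_markdown}"
26 |     # Pass `context` together with the user's question to your LLM of choice here
27 |     await cl.Message(content=f"Gathered context from {len(urls)} page(s).").send()
28 | ```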


--------------------------------------------------------------------------------
/docs/examples/cli/browser.yml:
--------------------------------------------------------------------------------
 1 | browser_type: "chromium"
 2 | headless: true
 3 | viewport_width: 1280
 4 | viewport_height: 800
 5 | user_agent_mode: "random"
 6 | verbose: true
 7 | text_mode: false
 8 | light_mode: false
 9 | ignore_https_errors: true
10 | java_script_enabled: true
11 | extra_args:
12 |   - "--disable-gpu"
13 |   - "--no-sandbox"


--------------------------------------------------------------------------------
/docs/examples/cli/crawler.yml:
--------------------------------------------------------------------------------
 1 | cache_mode: "bypass"
 2 | wait_until: "networkidle"
 3 | page_timeout: 30000
 4 | delay_before_return_html: 0.5
 5 | word_count_threshold: 100
 6 | scan_full_page: true
 7 | scroll_delay: 0.3
 8 | process_iframes: false
 9 | remove_overlay_elements: true
10 | magic: true
11 | verbose: true
12 | exclude_external_links: true
13 | exclude_social_media_links: true


--------------------------------------------------------------------------------
/docs/examples/cli/css_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ArticleExtractor",
 3 |   "baseSelector": ".cards[data-tax=news] .card__data",
 4 |   "fields": [
 5 |     {
 6 |       "name": "title",
 7 |       "selector": "h4.card__title",
 8 |       "type": "text"
 9 |     },
10 |     {
11 |       "name": "link",
12 |       "selector": "h4.card__title a", 
13 |       "type": "attribute",
14 |       "attribute": "href"
15 |     },
16 |     {
17 |       "name": "details",
18 |       "selector": ".card__details",
19 |       "type": "text"
20 |     },
21 |     {
22 |       "name": "topics",
23 |       "selector": ".card__topics.topics",
24 |       "type": "text"
25 |     }
26 |   ]
27 | }


--------------------------------------------------------------------------------
/docs/examples/cli/extract.yml:
--------------------------------------------------------------------------------
 1 | type: "llm"
 2 | provider: "openai/gpt-4o-mini"
 3 | api_token: "env:OPENAI_API_KEY"
 4 | instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
 5 | params:
 6 |   chunk_token_threshold: 4096
 7 |   overlap_rate: 0.1
 8 |   word_token_rate: 0.75
 9 |   temperature: 0.3
10 |   max_tokens: 1000
11 |   verbose: true
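12 | 
13 | # Presumed CLI usage (flag names assumed from the crwl CLI docs; verify against
14 | # your installed version):
15 | #   crwl https://example.com -e extract.yml -s llm_schema.json -o json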


--------------------------------------------------------------------------------
/docs/examples/cli/extract_css.yml:
--------------------------------------------------------------------------------
1 | type: "json-css"
2 | params:
3 |   verbose: true 


--------------------------------------------------------------------------------
/docs/examples/cli/llm_schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": "NewsArticle",
 3 |   "type": "object",
 4 |   "properties": {
 5 |     "title": {
 6 |       "type": "string",
 7 |       "description": "The title/headline of the news article"
 8 |     },
 9 |     "link": {
10 |       "type": "string",
11 |       "description": "The URL or link to the full article"
12 |     },
13 |     "details": {
14 |       "type": "string", 
15 |       "description": "Brief summary or details about the article content"
16 |     },
17 |     "topics": {
18 |       "type": "array",
19 |       "items": {
20 |         "type": "string"
21 |       },
22 |       "description": "List of topics or categories associated with the article"
23 |     }
24 |   },
25 |   "required": ["title", "details"]
26 | }


--------------------------------------------------------------------------------
/docs/examples/docker_python_sdk.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai.docker_client import Crawl4aiDockerClient
 3 | from crawl4ai import (
 4 |     BrowserConfig,
 5 |     CrawlerRunConfig
 6 | )
 7 | 
 8 | async def main():
 9 |     async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
10 |         # If jwt is enabled, authenticate first
11 |         # await client.authenticate("test@example.com")
12 |         
13 |         # Non-streaming crawl
14 |         results = await client.crawl(
15 |             ["https://example.com", "https://python.org"],
16 |             browser_config=BrowserConfig(headless=True),
17 |             crawler_config=CrawlerRunConfig()
18 |         )
19 |         print(f"Non-streaming results: {results}")
20 |         
21 |         # Streaming crawl
22 |         crawler_config = CrawlerRunConfig(stream=True)
23 |         async for result in await client.crawl(
24 |             ["https://example.com", "https://python.org"],
25 |             browser_config=BrowserConfig(headless=True),
26 |             crawler_config=crawler_config
27 |         ):
28 |             print(f"Streamed result: {result}")
29 |         
30 |         # Get schema
31 |         schema = await client.get_schema()
32 |         print(f"Schema: {schema}")
33 | 
34 | if __name__ == "__main__":
35 |     asyncio.run(main())


--------------------------------------------------------------------------------
/docs/examples/hello_world.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import (
 3 |     AsyncWebCrawler,
 4 |     BrowserConfig,
 5 |     CrawlerRunConfig,
 6 |     DefaultMarkdownGenerator,
 7 |     PruningContentFilter,
 8 |     CrawlResult
 9 | )
10 | 
11 | 
12 | async def main():
13 |     browser_config = BrowserConfig(
14 |         headless=False,
15 |         verbose=True,
16 |     )
17 |     async with AsyncWebCrawler(config=browser_config) as crawler:
18 |         crawler_config = CrawlerRunConfig(
19 |             markdown_generator=DefaultMarkdownGenerator(
20 |                 content_filter=PruningContentFilter()
21 |             ),
22 |         )
23 |         result: CrawlResult = await crawler.arun(
24 |             url="https://www.helloworld.org", config=crawler_config
25 |         )
26 |         print(result.markdown.raw_markdown[:500])
27 | 
28 | if __name__ == "__main__":
29 |     asyncio.run(main())
30 | 


--------------------------------------------------------------------------------
/docs/examples/hello_world_undetected.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import (
 3 |     AsyncWebCrawler,
 4 |     BrowserConfig,
 5 |     CrawlerRunConfig,
 6 |     DefaultMarkdownGenerator,
 7 |     PruningContentFilter,
 8 |     CrawlResult,
 9 |     UndetectedAdapter
10 | )
11 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
12 | 
13 | 
14 | async def main():
15 |     # Create browser config
16 |     browser_config = BrowserConfig(
17 |         headless=False,
18 |         verbose=True,
19 |     )
20 |     
21 |     # Create the undetected adapter
22 |     undetected_adapter = UndetectedAdapter()
23 |     
24 |     # Create the crawler strategy with the undetected adapter
25 |     crawler_strategy = AsyncPlaywrightCrawlerStrategy(
26 |         browser_config=browser_config,
27 |         browser_adapter=undetected_adapter
28 |     )
29 |     
30 |     # Create the crawler with our custom strategy
31 |     async with AsyncWebCrawler(
32 |         crawler_strategy=crawler_strategy,
33 |         config=browser_config
34 |     ) as crawler:
35 |         # Configure the crawl
36 |         crawler_config = CrawlerRunConfig(
37 |             markdown_generator=DefaultMarkdownGenerator(
38 |                 content_filter=PruningContentFilter()
39 |             ),
40 |             capture_console_messages=True,  # Enable console capture to test adapter
41 |         )
42 |         
43 |         # Test on a site that typically detects bots
44 |         print("Testing undetected adapter...")
45 |         result: CrawlResult = await crawler.arun(
46 |             url="https://www.helloworld.org", 
47 |             config=crawler_config
48 |         )
49 |         
50 |         print(f"Status: {result.status_code}")
51 |         print(f"Success: {result.success}")
52 |         print(f"Console messages captured: {len(result.console_messages or [])}")
53 |         print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     asyncio.run(main())


--------------------------------------------------------------------------------
/docs/examples/language_support_example.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
 3 | 
 4 | 
 5 | async def main():
 6 |     # Example 1: Setting language when creating the crawler
 7 |     crawler1 = AsyncWebCrawler(
 8 |         crawler_strategy=AsyncPlaywrightCrawlerStrategy(
 9 |             headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
10 |         )
11 |     )
12 |     result1 = await crawler1.arun("https://www.example.com")
13 |     print(
14 |         "Example 1 result:", result1.extracted_content[:100]
15 |     )  # Print first 100 characters
16 | 
17 |     # Example 2: Setting language before crawling
18 |     crawler2 = AsyncWebCrawler()
19 |     crawler2.crawler_strategy.headers[
20 |         "Accept-Language"
21 |     ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
22 |     result2 = await crawler2.arun("https://www.example.com")
23 |     print("Example 2 result:", result2.extracted_content[:100])
24 | 
25 |     # Example 3: Setting language when calling arun method
26 |     crawler3 = AsyncWebCrawler()
27 |     result3 = await crawler3.arun(
28 |         "https://www.example.com",
29 |         headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
30 |     )
31 |     print("Example 3 result:", result3.extracted_content[:100])
32 | 
33 |     # Example 4: Crawling multiple pages with different languages
34 |     urls = [
35 |         ("https://www.example.com", "fr-FR,fr;q=0.9"),
36 |         ("https://www.example.org", "es-ES,es;q=0.9"),
37 |         ("https://www.example.net", "de-DE,de;q=0.9"),
38 |     ]
39 | 
40 |     crawler4 = AsyncWebCrawler()
41 |     results = await asyncio.gather(
42 |         *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls]
43 |     )
44 | 
45 |     for url, result in zip([u for u, _ in urls], results):
46 |         print(f"Result for {url}:", result.extracted_content[:100])
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     asyncio.run(main())
51 | 


--------------------------------------------------------------------------------
/docs/examples/llm_extraction_openai_pricing.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from pydantic import BaseModel, Field
 3 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
 4 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
 5 | from typing import Dict
 6 | import os
 7 | 
 8 | 
 9 | class OpenAIModelFee(BaseModel):
10 |     model_name: str = Field(..., description="Name of the OpenAI model.")
11 |     input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
12 |     output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
13 | 
14 | 
15 | async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
16 |     print(f"\n--- Extracting Structured Data with {provider} ---")
17 | 
18 |     if api_token is None and provider != "ollama":
19 |         print(f"API token is required for {provider}. Skipping this example.")
20 |         return
21 | 
22 |     browser_config = BrowserConfig(headless=True)
23 | 
24 |     extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
25 |     if extra_headers:
26 |         extra_args["extra_headers"] = extra_headers
27 | 
28 |     crawler_config = CrawlerRunConfig(
29 |         cache_mode=CacheMode.BYPASS,
30 |         word_count_threshold=1,
31 |         page_timeout=80000,
32 |         extraction_strategy=LLMExtractionStrategy(
33 |             llm_config=LLMConfig(provider=provider, api_token=api_token),
34 |             schema=OpenAIModelFee.model_json_schema(),
35 |             extraction_type="schema",
36 |             instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
37 |             Do not miss any models in the entire content.""",
38 |             extra_args=extra_args,
39 |         ),
40 |     )
41 | 
42 |     async with AsyncWebCrawler(config=browser_config) as crawler:
43 |         result = await crawler.arun(
44 |             url="https://openai.com/api/pricing/", 
45 |             config=crawler_config
46 |         )
47 |         print(result.extracted_content)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     asyncio.run(
52 |         extract_structured_data_using_llm(
53 |             provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
54 |         )
55 |     )
56 | 


--------------------------------------------------------------------------------
/docs/examples/markdown/content_source_short_example.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
 3 | """
 4 | 
 5 | import asyncio
 6 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
 7 | 
 8 | async def demo_markdown_source_config():
 9 |     print("\n=== Demo: Configuring Markdown Source ===")
10 | 
11 |     # Example 1: Generate markdown from cleaned HTML (default behavior)
12 |     cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
13 |     config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
14 | 
15 |     async with AsyncWebCrawler() as crawler:
16 |         result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
17 |         print("Markdown from Cleaned HTML (default):")
18 |         print(f"  Length: {len(result_cleaned.markdown.raw_markdown)}")
19 |         print(f"  Start: {result_cleaned.markdown.raw_markdown[:100]}...")
20 | 
21 |     # Example 2: Generate markdown directly from raw HTML
22 |     raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
23 |     config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
24 | 
25 |     async with AsyncWebCrawler() as crawler:
26 |         result_raw = await crawler.arun(url="https://example.com", config=config_raw)
27 |         print("\nMarkdown from Raw HTML:")
28 |         print(f"  Length: {len(result_raw.markdown.raw_markdown)}")
29 |         print(f"  Start: {result_raw.markdown.raw_markdown[:100]}...")
30 | 
31 |     # Example 3: Generate markdown from preprocessed 'fit' HTML
32 |     fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
33 |     config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
34 | 
35 |     async with AsyncWebCrawler() as crawler:
36 |         result_fit = await crawler.arun(url="https://example.com", config=config_fit)
37 |         print("\nMarkdown from Fit HTML:")
38 |         print(f"  Length: {len(result_fit.markdown.raw_markdown)}")
39 |         print(f"  Start: {result_fit.markdown.raw_markdown[:100]}...")
40 | 
41 | if __name__ == "__main__":
42 |     asyncio.run(demo_markdown_source_config())


--------------------------------------------------------------------------------
/docs/examples/rest_call.py:
--------------------------------------------------------------------------------
 1 | import requests, base64, os
 2 | 
 3 | data = {
 4 |     "urls": ["https://www.nbcnews.com/business"],
 5 |     "screenshot": True,
 6 | }
 7 | 
 8 | response = requests.post("https://crawl4ai.com/crawl", json=data)
 9 | result = response.json()["results"][0]
10 | print(result.keys())
11 | # dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
12 | # 'links', 'screenshot', 'markdown', 'extracted_content',
13 | # 'metadata', 'error_message'])
14 | with open("screenshot.png", "wb") as f:
15 |     f.write(base64.b64decode(result["screenshot"]))
16 | 
17 | # Example of filtering the content using CSS selectors
18 | data = {
19 |     "urls": ["https://www.nbcnews.com/business"],
20 |     "css_selector": "article",
21 |     "screenshot": True,
22 | }
23 | 
24 | # Example of executing a JS script on the page before extracting the content
25 | data = {
26 |     "urls": ["https://www.nbcnews.com/business"],
27 |     "screenshot": True,
28 |     "js": [
29 |         """
30 |     const loadMoreButton = Array.from(document.querySelectorAll('button')).
31 |     find(button => button.textContent.includes('Load More'));
32 |     loadMoreButton && loadMoreButton.click();
33 |     """
34 |     ],
35 | }
36 | 
37 | # Example of using a custom extraction strategy
38 | data = {
39 |     "urls": ["https://www.nbcnews.com/business"],
40 |     "extraction_strategy": "CosineStrategy",
41 |     "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
42 | }
43 | 
44 | # Example of using LLM to extract content
45 | data = {
46 |     "urls": ["https://www.nbcnews.com/business"],
47 |     "extraction_strategy": "LLMExtractionStrategy",
48 |     "extraction_strategy_args": {
49 |         "provider": "groq/llama3-8b-8192",
50 |         "api_token": os.environ.get("GROQ_API_KEY"),
51 |         "instruction": """I am interested in only financial news, 
52 |         and translate them in French.""",
53 |     },
54 | }
55 | 
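56 | # Each of the payload variants above is submitted the same way as the first request:
57 | # response = requests.post("https://crawl4ai.com/crawl", json=data)
58 | # result = response.json()["results"][0]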


--------------------------------------------------------------------------------
/docs/examples/session_id_example.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import (
 3 |     AsyncWebCrawler,
 4 |     BrowserConfig,
 5 |     CrawlerRunConfig,
 6 |     DefaultMarkdownGenerator,
 7 |     PruningContentFilter,
 8 |     CrawlResult
 9 | )
10 | 
11 | 
12 | 
13 | async def main():    
14 |     browser_config = BrowserConfig(
15 |         headless=False, 
16 |         verbose=True,
17 |     )
18 |     async with AsyncWebCrawler(config=browser_config) as crawler:
19 |         crawler_config = CrawlerRunConfig(
20 |             session_id="hello_world",  # This lets us reuse the same page across calls
21 |         )
22 |         result: CrawlResult = await crawler.arun(
23 |             url="https://www.helloworld.org", config=crawler_config
24 |         )
25 |         # Add a breakpoint here and you will see the page is still open; the browser has not closed
26 |         print(result.markdown.raw_markdown[:500])
27 | 
28 |         new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True)
29 |         result: CrawlResult = await crawler.arun(  # No new fetch this time; only the JS runs in the already-open page
30 |             url="https://www.helloworld.org", config=new_config
31 |         )
32 |         print(result.js_execution_result)  # You should see {'data': 'hello'}
33 | 
34 |         # Get direct access to the Playwright page object. This works only if you reuse the same session_id and pass the same config
35 |         page, context = crawler.crawler_strategy.get_page(new_config)
36 | 
37 | if __name__ == "__main__":
38 |     asyncio.run(main())
39 | 


--------------------------------------------------------------------------------
/docs/examples/simple_anti_bot_examples.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
 3 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
 4 | 
 5 | # Example 1: Stealth Mode
 6 | async def stealth_mode_example():
 7 |     browser_config = BrowserConfig(
 8 |         enable_stealth=True,
 9 |         headless=False
10 |     )
11 |     
12 |     async with AsyncWebCrawler(config=browser_config) as crawler:
13 |         result = await crawler.arun("https://example.com")
14 |         return result.html[:500]
15 | 
16 | # Example 2: Undetected Browser
17 | async def undetected_browser_example():
18 |     browser_config = BrowserConfig(
19 |         headless=False
20 |     )
21 |     
22 |     adapter = UndetectedAdapter()
23 |     strategy = AsyncPlaywrightCrawlerStrategy(
24 |         browser_config=browser_config,
25 |         browser_adapter=adapter
26 |     )
27 |     
28 |     async with AsyncWebCrawler(
29 |         crawler_strategy=strategy,
30 |         config=browser_config
31 |     ) as crawler:
32 |         result = await crawler.arun("https://example.com")
33 |         return result.html[:500]
34 | 
35 | # Example 3: Both Combined
36 | async def combined_example():
37 |     browser_config = BrowserConfig(
38 |         enable_stealth=True,
39 |         headless=False
40 |     )
41 |     
42 |     adapter = UndetectedAdapter()
43 |     strategy = AsyncPlaywrightCrawlerStrategy(
44 |         browser_config=browser_config,
45 |         browser_adapter=adapter
46 |     )
47 |     
48 |     async with AsyncWebCrawler(
49 |         crawler_strategy=strategy,
50 |         config=browser_config
51 |     ) as crawler:
52 |         result = await crawler.arun("https://example.com")
53 |         return result.html[:500]
54 | 
55 | # Run examples
56 | if __name__ == "__main__":
57 |     asyncio.run(stealth_mode_example())
58 |     asyncio.run(undetected_browser_example())
59 |     asyncio.run(combined_example())


--------------------------------------------------------------------------------
/docs/examples/ssl_example.py:
--------------------------------------------------------------------------------
 1 | """Example showing how to work with SSL certificates in Crawl4AI."""
 2 | 
 3 | import asyncio
 4 | import os
 5 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
 6 | 
 7 | # Create tmp directory if it doesn't exist
 8 | parent_dir = os.path.dirname(
 9 |     os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10 | )
11 | tmp_dir = os.path.join(parent_dir, "tmp")
12 | os.makedirs(tmp_dir, exist_ok=True)
13 | 
14 | 
15 | async def main():
16 |     # Configure crawler to fetch SSL certificate
17 |     config = CrawlerRunConfig(
18 |         fetch_ssl_certificate=True,
19 |         cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
20 |     )
21 | 
22 |     async with AsyncWebCrawler() as crawler:
23 |         result = await crawler.arun(url="https://example.com", config=config)
24 | 
25 |         if result.success and result.ssl_certificate:
26 |             cert = result.ssl_certificate
27 | 
28 |             # 1. Access certificate properties directly
29 |             print("\nCertificate Information:")
30 |             print(f"Issuer: {cert.issuer.get('CN', '')}")
31 |             print(f"Valid until: {cert.valid_until}")
32 |             print(f"Fingerprint: {cert.fingerprint}")
33 | 
34 |             # 2. Export certificate in different formats
35 |             cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
36 |             print("\nCertificate exported to:")
37 |             print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
38 | 
39 |             pem_data = cert.to_pem(
40 |                 os.path.join(tmp_dir, "certificate.pem")
41 |             )  # For web servers
42 |             print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
43 | 
44 |             der_data = cert.to_der(
45 |                 os.path.join(tmp_dir, "certificate.der")
46 |             )  # For Java apps
47 |             print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     asyncio.run(main())
52 | 


--------------------------------------------------------------------------------
/docs/examples/stealth_test_simple.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple test to verify stealth mode is working
 3 | """
 4 | 
 5 | import asyncio
 6 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 7 | 
 8 | 
 9 | async def test_stealth():
10 |     """Test stealth mode effectiveness"""
11 |     
12 |     # Test WITHOUT stealth
13 |     print("=== WITHOUT Stealth ===")
14 |     config1 = BrowserConfig(
15 |         headless=False,
16 |         enable_stealth=False
17 |     )
18 |     
19 |     async with AsyncWebCrawler(config=config1) as crawler:
20 |         result = await crawler.arun(
21 |             url="https://bot.sannysoft.com",
22 |             config=CrawlerRunConfig(
23 |                 wait_until="networkidle",
24 |                 screenshot=True
25 |             )
26 |         )
27 |         print(f"Success: {result.success}")
28 |         # Take screenshot
29 |         if result.screenshot:
30 |             with open("without_stealth.png", "wb") as f:
31 |                 import base64
32 |                 f.write(base64.b64decode(result.screenshot))
33 |             print("Screenshot saved: without_stealth.png")
34 |     
35 |     # Test WITH stealth
36 |     print("\n=== WITH Stealth ===")
37 |     config2 = BrowserConfig(
38 |         headless=False,
39 |         enable_stealth=True
40 |     )
41 |     
42 |     async with AsyncWebCrawler(config=config2) as crawler:
43 |         result = await crawler.arun(
44 |             url="https://bot.sannysoft.com",
45 |             config=CrawlerRunConfig(
46 |                 wait_until="networkidle",
47 |                 screenshot=True
48 |             )
49 |         )
50 |         print(f"Success: {result.success}")
51 |         # Take screenshot
52 |         if result.screenshot:
53 |             with open("with_stealth.png", "wb") as f:
54 |                 import base64
55 |                 f.write(base64.b64decode(result.screenshot))
56 |             print("Screenshot saved: with_stealth.png")
57 |     
58 |     print("\nCheck the screenshots to see the difference in bot detection results!")
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     asyncio.run(test_stealth())


--------------------------------------------------------------------------------
/docs/examples/summarize_page.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from crawl4ai.legacy.web_crawler import WebCrawler  # WebCrawler now lives in crawl4ai.legacy
 4 | from crawl4ai.chunking_strategy import *
 5 | from crawl4ai import *
 6 | from crawl4ai.legacy.crawler_strategy import *
 7 | 
 8 | url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
 9 | 
10 | crawler = WebCrawler()
11 | crawler.warmup()
12 | 
13 | from pydantic import BaseModel, Field
14 | 
15 | 
16 | class PageSummary(BaseModel):
17 |     title: str = Field(..., description="Title of the page.")
18 |     summary: str = Field(..., description="Summary of the page.")
19 |     brief_summary: str = Field(..., description="Brief summary of the page.")
20 |     keywords: list = Field(..., description="Keywords assigned to the page.")
21 | 
22 | 
23 | result = crawler.run(
24 |     url=url,
25 |     word_count_threshold=1,
26 |     extraction_strategy=LLMExtractionStrategy(
27 |         provider="openai/gpt-4o",
28 |         api_token=os.getenv("OPENAI_API_KEY"),
29 |         schema=PageSummary.model_json_schema(),
30 |         extraction_type="schema",
31 |         apply_chunking=False,
32 |         instruction="From the crawled content, extract the following details: "
33 |         "1. Title of the page "
34 |         "2. Summary of the page, which is a detailed summary "
35 |         "3. Brief summary of the page, which is a paragraph text "
36 |         "4. Keywords assigned to the page, which is a list of keywords. "
37 |         "The extracted JSON format should look like this: "
38 |         '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
39 |     ),
40 |     bypass_cache=True,
41 | )
42 | 
43 | page_summary = json.loads(result.extracted_content)
44 | 
45 | print(page_summary)
46 | 
47 | with open(".data/page_summary.json", "w", encoding="utf-8") as f:
48 |     f.write(result.extracted_content)
49 | 


--------------------------------------------------------------------------------
/docs/examples/undetectability/undetected_basic_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic Undetected Browser Test
 3 | Simple example to test if undetected mode works
 4 | """
 5 | 
 6 | import asyncio
 7 | from crawl4ai import AsyncWebCrawler, BrowserConfig
 8 | 
 9 | async def test_regular_mode():
10 |     """Test with regular browser"""
11 |     print("Testing Regular Browser Mode...")
12 |     browser_config = BrowserConfig(
13 |         headless=False,
14 |         verbose=True
15 |     )
16 |     
17 |     async with AsyncWebCrawler(config=browser_config) as crawler:
18 |         result = await crawler.arun(url="https://www.example.com")
19 |         print(f"Regular Mode - Success: {result.success}")
20 |         print(f"Regular Mode - Status: {result.status_code}")
21 |         print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}")
22 |         print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
23 |         return result.success
24 | 
25 | async def test_undetected_mode():
26 |     """Test with undetected browser"""
27 |     print("\nTesting Undetected Browser Mode...")
28 |     from crawl4ai import UndetectedAdapter
29 |     from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
30 |     
31 |     browser_config = BrowserConfig(
32 |         headless=False,
33 |         verbose=True
34 |     )
35 |     
36 |     # Create undetected adapter
37 |     undetected_adapter = UndetectedAdapter()
38 |     
39 |     # Create strategy with undetected adapter
40 |     crawler_strategy = AsyncPlaywrightCrawlerStrategy(
41 |         browser_config=browser_config,
42 |         browser_adapter=undetected_adapter
43 |     )
44 |     
45 |     async with AsyncWebCrawler(
46 |         crawler_strategy=crawler_strategy,
47 |         config=browser_config
48 |     ) as crawler:
49 |         result = await crawler.arun(url="https://www.example.com")
50 |         print(f"Undetected Mode - Success: {result.success}")
51 |         print(f"Undetected Mode - Status: {result.status_code}")
52 |         print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}")
53 |         print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
54 |         return result.success
55 | 
56 | async def main():
57 |     """Run both tests"""
58 |     print("🤖 Crawl4AI Basic Adapter Test\n")
59 |     
60 |     # Test regular mode
61 |     regular_success = await test_regular_mode()
62 |     
63 |     # Test undetected mode
64 |     undetected_success = await test_undetected_mode()
65 |     
66 |     # Summary
67 |     print("\n" + "="*50)
68 |     print("Summary:")
69 |     print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}")
70 |     print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}")
71 |     print("="*50)
72 | 
73 | if __name__ == "__main__":
74 |     asyncio.run(main())


--------------------------------------------------------------------------------
/docs/examples/use_geo_location.py:
--------------------------------------------------------------------------------
 1 | # use_geo_location.py
 2 | """
 3 | Example: override locale, timezone, and geolocation using Crawl4ai patterns.
 4 | 
 5 | This demo uses `AsyncWebCrawler.arun()` to fetch a page with
 6 | browser context primed for specific locale, timezone, and GPS,
 7 | and saves a screenshot for visual verification.
 8 | """
 9 | 
10 | import asyncio
11 | import base64
12 | from pathlib import Path
13 | from typing import List
14 | from crawl4ai import (
15 |     AsyncWebCrawler,
16 |     CrawlerRunConfig,
17 |     BrowserConfig,
18 |     GeolocationConfig,
19 |     CrawlResult,
20 | )
21 | 
22 | async def demo_geo_override():
23 |     """Demo: Crawl a geolocation-test page with overrides and screenshot."""
24 |     print("\n=== Geo-Override Crawl ===")
25 | 
26 |     # 1) Browser setup: use Playwright-managed contexts
27 |     browser_cfg = BrowserConfig(
28 |         headless=False,
29 |         viewport_width=1280,
30 |         viewport_height=720,
31 |         use_managed_browser=False,
32 |     )
33 | 
34 |     # 2) Run config: include locale, timezone_id, geolocation, and screenshot
35 |     run_cfg = CrawlerRunConfig(
36 |         url="https://browserleaks.com/geo",          # test page that shows your location
37 |         locale="en-US",                              # Accept-Language & UI locale
38 |         timezone_id="America/Los_Angeles",           # JS Date()/Intl timezone
39 |         geolocation=GeolocationConfig(                 # override GPS coords
40 |             latitude=34.0522,
41 |             longitude=-118.2437,
42 |             accuracy=10.0,
43 |         ),
44 |         screenshot=True,                               # capture screenshot after load
45 |         session_id="geo_test",                       # reuse context if rerunning
46 |         delay_before_return_html=5
47 |     )
48 | 
49 |     async with AsyncWebCrawler(config=browser_cfg) as crawler:
50 |         # 3) Run crawl (returns list even for single URL)
51 |         results: List[CrawlResult] = await crawler.arun(
52 |             url=run_cfg.url,
53 |             config=run_cfg,            
54 |         )
55 |         result = results[0]
56 | 
57 |         # 4) Save screenshot and report path
58 |         if result.screenshot:
59 |             __current_dir = Path(__file__).parent
60 |             out_dir = __current_dir / "tmp"
61 |             out_dir.mkdir(exist_ok=True)
62 |             shot_path = out_dir / "geo_test.png"
63 |             with open(shot_path, "wb") as f:
64 |                 f.write(base64.b64decode(result.screenshot))
65 |             print(f"Saved screenshot to {shot_path}")
66 |         else:
67 |             print("No screenshot captured, check configuration.")
68 | 
69 | if __name__ == "__main__":
70 |     asyncio.run(demo_geo_override())
71 | 


--------------------------------------------------------------------------------
/docs/examples/website-to-api/app.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Startup script for the Web Scraper API with frontend interface.
 4 | """
 5 | 
 6 | import os
 7 | import sys
 8 | import uvicorn
 9 | from pathlib import Path
10 | 
11 | def main():
12 |     # Check if static directory exists
13 |     static_dir = Path("static")
14 |     if not static_dir.exists():
15 |         print("❌ Static directory not found!")
16 |         print("Please make sure the 'static' directory exists with the frontend files.")
17 |         sys.exit(1)
18 |     
19 |     # Check if required frontend files exist
20 |     required_files = ["index.html", "styles.css", "script.js"]
21 |     missing_files = []
22 |     
23 |     for file in required_files:
24 |         if not (static_dir / file).exists():
25 |             missing_files.append(file)
26 |     
27 |     if missing_files:
28 |         print(f"❌ Missing frontend files: {', '.join(missing_files)}")
29 |         print("Please make sure all frontend files are present in the static directory.")
30 |         sys.exit(1)
31 |     
32 |     print("🚀 Starting Web Scraper API with Frontend Interface")
33 |     print("=" * 50)
34 |     print("📁 Static files found and ready to serve")
35 |     print("🌐 Frontend will be available at: http://localhost:8000")
36 |     print("🔌 API endpoints available at: http://localhost:8000/docs")
37 |     print("=" * 50)
38 |     
39 |     # Start the server
40 |     uvicorn.run(
41 |         "api_server:app",
42 |         host="0.0.0.0",
43 |         port=8000,
44 |         reload=True,
45 |         log_level="info"
46 |     )
47 | 
48 | if __name__ == "__main__":
49 |     main() 


--------------------------------------------------------------------------------
/docs/examples/website-to-api/assets/crawl4ai_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/website-to-api/assets/crawl4ai_logo.jpg


--------------------------------------------------------------------------------
/docs/examples/website-to-api/requirements.txt:
--------------------------------------------------------------------------------
1 | crawl4ai
2 | fastapi
3 | uvicorn
4 | pydantic
5 | litellm


--------------------------------------------------------------------------------
/docs/examples/website-to-api/test_api.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from web_scraper_lib import scrape_website
 3 | import os
 4 | 
 5 | async def test_library():
 6 |     """Test the mini library directly."""
 7 |     print("=== Testing Mini Library ===")
 8 |     
 9 |     # Test 1: Scrape with a custom model
10 |     url = "https://marketplace.mainstreet.co.in/collections/adidas-yeezy/products/adidas-yeezy-boost-350-v2-yecheil-non-reflective"
11 |     query = "Extract the following data: Product name, Product price, Product description, Product size. DO NOT EXTRACT ANYTHING ELSE."
12 |     if os.path.exists("models"):
13 |         model_name = os.listdir("models")[0].split(".")[0]
14 |     else:
15 |         raise Exception("No models found in models directory")
16 | 
17 |     print(f"Scraping: {url}")
18 |     print(f"Query: {query}")
19 |     
20 |     try:
21 |         result = await scrape_website(url, query, model_name)
22 |         print("✅ Library test successful!")
23 |         print(f"Extracted data: {result['extracted_data']}")
24 |     except Exception as e:
25 |         print(f"❌ Library test failed: {e}")
26 | 
27 | if __name__ == "__main__":
28 |     asyncio.run(test_library())


--------------------------------------------------------------------------------
/docs/examples/website-to-api/test_models.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Test script for the new model management functionality.
 4 | This script demonstrates how to save and use custom model configurations.
 5 | """
 6 | 
 7 | import asyncio
 8 | import requests
 9 | import json
10 | 
11 | # API base URL
12 | BASE_URL = "http://localhost:8000"
13 | 
14 | def test_model_management():
15 |     """Test the model management endpoints."""
16 |     
17 |     print("=== Testing Model Management ===")
18 |     
19 |     # 1. List current models
20 |     print("\n1. Listing current models:")
21 |     response = requests.get(f"{BASE_URL}/models")
22 |     print(f"Status: {response.status_code}")
23 |     print(f"Response: {json.dumps(response.json(), indent=2)}")
24 |     
25 |     
26 |     # 2. Save a model configuration (OpenAI example)
27 |     print("\n2. Saving OpenAI model configuration:")
28 |     openai_config = {
29 |         "model_name": "my-openai",
30 |         "provider": "openai",
31 |         "api_token": "your-openai-api-key-here"
32 |     }
33 |     
34 |     response = requests.post(f"{BASE_URL}/models", json=openai_config)
35 |     print(f"Status: {response.status_code}")
36 |     print(f"Response: {json.dumps(response.json(), indent=2)}")
37 |     
38 |     # 3. List models again to see the new one
39 |     print("\n3. Listing models after adding the new one:")
40 |     response = requests.get(f"{BASE_URL}/models")
41 |     print(f"Status: {response.status_code}")
42 |     print(f"Response: {json.dumps(response.json(), indent=2)}")
43 | 
44 |     # 4. Delete a model configuration
45 |     print("\n4. Deleting a model configuration:")
46 |     response = requests.delete(f"{BASE_URL}/models/my-openai")
47 |     print(f"Status: {response.status_code}")
48 |     print(f"Response: {json.dumps(response.json(), indent=2)}")
49 |     
50 |     # 5. Final list of models
51 |     print("\n5. Final list of models:")
52 |     response = requests.get(f"{BASE_URL}/models")
53 |     print(f"Status: {response.status_code}")
54 |     print(f"Response: {json.dumps(response.json(), indent=2)}")
55 | 
56 | if __name__ == "__main__":
57 |     print("Model Management Test Script")
58 |     print("Make sure the API server is running on http://localhost:8000")
59 |     print("=" * 50)
60 |     
61 |     try:
62 |         test_model_management()
63 |     except requests.exceptions.ConnectionError:
64 |         print("Error: Could not connect to the API server.")
65 |         print("Make sure the server is running with: python api_server.py")
66 |     except Exception as e:
67 |         print(f"Error: {e}") 


--------------------------------------------------------------------------------
/docs/md_v2/advanced/crawl-dispatcher.md:
--------------------------------------------------------------------------------
 1 | # Crawl Dispatcher
 2 | 
 3 | We’re excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler’s status, memory usage, and overall progress.
 4 | 
 5 | Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X.
 6 | 
 7 | Below is a **sample** of how the dispatcher’s performance monitor might look in action:
 8 | 
 9 | ![Crawl Dispatcher Performance Monitor](../assets/images/dispatcher.png)
10 | 
11 | 
12 | We can’t wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates!
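13 | 
14 | As a taste of how driving the dispatcher might look, here is a hypothetical sketch. `arun_many()` exists today; the `dispatcher=` argument and the `MemoryAdaptiveDispatcher` name are illustrative assumptions until the feature ships:
15 | 
16 | ```python
17 | import asyncio
18 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
19 | 
20 | async def main():
21 |     urls = [f"https://example.com/page/{i}" for i in range(1000)]
22 |     async with AsyncWebCrawler() as crawler:
23 |         # Hypothetical: the dispatcher would throttle concurrency based on free
24 |         # memory/CPU and report per-crawler progress in real time, e.g.:
25 |         # results = await crawler.arun_many(urls, dispatcher=MemoryAdaptiveDispatcher())
26 |         results = await crawler.arun_many(urls, config=CrawlerRunConfig())
27 |         print(f"Crawled {sum(r.success for r in results)} of {len(urls)} pages")
28 | 
29 | asyncio.run(main())
30 | ```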


--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Bold.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Italic.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Regular.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Bold.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Italic.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Regular.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/blockly-demo.c4a:
--------------------------------------------------------------------------------
 1 | # Demo: Login Flow with Blockly
 2 | # This script can be created visually using Blockly blocks
 3 | 
 4 | GO https://example.com/login
 5 | WAIT `#login-form` 5
 6 | 
 7 | # Check if already logged in
 8 | IF (EXISTS `.user-avatar`) THEN GO https://example.com/dashboard
 9 | 
10 | # Fill login form
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | CLICK `#password`
14 | TYPE "password123"
15 | 
16 | # Submit form
17 | CLICK `button[type="submit"]`
18 | WAIT `.dashboard` 10
19 | 
20 | # Success message
21 | EVAL `console.log('Login successful!')`


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/requirements.txt:
--------------------------------------------------------------------------------
1 | flask>=2.3.0
2 | flask-cors>=4.0.0


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/01-basic-interaction.c4a:
--------------------------------------------------------------------------------
 1 | # Basic Page Interaction
 2 | # This script demonstrates basic C4A commands
 3 | 
 4 | # Navigate to the playground
 5 | GO http://127.0.0.1:8080/playground/
 6 | 
 7 | # Wait for page to load
 8 | WAIT `body` 2
 9 | 
10 | # Handle cookie banner if present
11 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
12 | 
13 | # Close newsletter popup if it appears
14 | WAIT 3
15 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
16 | 
17 | # Click the start tutorial button
18 | CLICK `#start-tutorial`


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/02-login-flow.c4a:
--------------------------------------------------------------------------------
 1 | # Complete Login Flow
 2 | # Demonstrates form interaction and authentication
 3 | 
 4 | # Click login button
 5 | CLICK `#login-btn`
 6 | 
 7 | # Wait for login modal
 8 | WAIT `.login-form` 3
 9 | 
10 | # Fill in credentials
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | 
14 | CLICK `#password`
15 | TYPE "demo123"
16 | 
17 | # Check remember me
18 | IF (EXISTS `#remember-me`) THEN CLICK `#remember-me`
19 | 
20 | # Submit form
21 | CLICK `button[type="submit"]`
22 | 
23 | # Wait for success
24 | WAIT `.welcome-message` 5
25 | 
26 | # Verify login succeeded
27 | IF (EXISTS `.user-info`) THEN EVAL `console.log('✅ Login successful!')`


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/03-infinite-scroll.c4a:
--------------------------------------------------------------------------------
 1 | # Infinite Scroll Product Loading
 2 | # Load all products using scroll automation
 3 | 
 4 | # Navigate to catalog
 5 | CLICK `#catalog-link`
 6 | WAIT `.product-grid` 3
 7 | 
 8 | # Switch to infinite scroll mode
 9 | CLICK `#infinite-scroll-btn`
10 | 
11 | # Define scroll procedure
12 | PROC load_more_products
13 |   # Get current product count
14 |   EVAL `window.initialCount = document.querySelectorAll('.product-card').length`
15 |   
16 |   # Scroll down
17 |   SCROLL DOWN 1000
18 |   WAIT 2
19 |   
20 |   # Check if more products loaded
21 |   EVAL `
22 |     const newCount = document.querySelectorAll('.product-card').length;
23 |     console.log('Products loaded: ' + newCount);
24 |     window.moreLoaded = newCount > window.initialCount;
25 |   `
26 | ENDPROC
27 | 
28 | # Load products until no more
29 | REPEAT (load_more_products, `window.moreLoaded !== false`)
30 | 
31 | # Final count
32 | EVAL `console.log('✅ Total products: ' + document.querySelectorAll('.product-card').length)`


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/04-multi-step-form.c4a:
--------------------------------------------------------------------------------
 1 | # Multi-step Form Wizard
 2 | # Complete a complex form with multiple steps
 3 | 
 4 | # Navigate to forms section
 5 | CLICK `a[href="#forms"]`
 6 | WAIT `#survey-form` 2
 7 | 
 8 | # Step 1: Basic Information
 9 | CLICK `#full-name`
10 | TYPE "John Doe"
11 | 
12 | CLICK `#survey-email`  
13 | TYPE "john.doe@example.com"
14 | 
15 | # Go to next step
16 | CLICK `.next-step`
17 | WAIT 1
18 | 
19 | # Step 2: Select Interests
20 | # Select multiple options
21 | CLICK `#interests`
22 | CLICK `option[value="tech"]`
23 | CLICK `option[value="music"]`
24 | CLICK `option[value="travel"]`
25 | 
26 | # Continue to final step
27 | CLICK `.next-step`
28 | WAIT 1
29 | 
30 | # Step 3: Review and Submit
31 | # Verify we're on the last step
32 | IF (EXISTS `#submit-survey`) THEN EVAL `console.log('📋 On final step')`
33 | 
34 | # Submit the form
35 | CLICK `#submit-survey`
36 | 
37 | # Wait for success message
38 | WAIT `.success-message` 5
39 | 
40 | # Verify submission
41 | IF (EXISTS `.success-message`) THEN EVAL `console.log('✅ Survey submitted successfully!')`


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/05-complex-workflow.c4a:
--------------------------------------------------------------------------------
 1 | # Complete E-commerce Workflow
 2 | # Login, browse products, and interact with various elements
 3 | 
 4 | # Define reusable procedures
 5 | PROC handle_popups
 6 |   IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
 7 |   IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
 8 | ENDPROC
 9 | 
10 | PROC login_user
11 |   CLICK `#login-btn`
12 |   WAIT `.login-form` 2
13 |   CLICK `#email`
14 |   TYPE "demo@example.com"
15 |   CLICK `#password`
16 |   TYPE "demo123"
17 |   CLICK `button[type="submit"]`
18 |   WAIT `.welcome-message` 5
19 | ENDPROC
20 | 
21 | PROC browse_products
22 |   # Go to catalog
23 |   CLICK `#catalog-link`
24 |   WAIT `.product-grid` 3
25 |   
26 |   # Apply filters
27 |   CLICK `.collapsible`
28 |   WAIT 0.5
29 |   CLICK `input[type="checkbox"]`
30 |   
31 |   # Load some products
32 |   SCROLL DOWN 500
33 |   WAIT 1
34 |   SCROLL DOWN 500
35 |   WAIT 1
36 | ENDPROC
37 | 
38 | # Main workflow
39 | GO http://127.0.0.1:8080/playground/
40 | WAIT `body` 2
41 | 
42 | # Handle initial popups
43 | handle_popups
44 | 
45 | # Login if not already
46 | IF (NOT EXISTS `.user-info`) THEN login_user
47 | 
48 | # Browse products
49 | browse_products
50 | 
51 | # Navigate to tabs demo
52 | CLICK `a[href="#tabs"]`
53 | WAIT `.tabs-container` 2
54 | 
55 | # Interact with tabs
56 | CLICK `button[data-tab="reviews"]`
57 | WAIT 1
58 | 
59 | # Load comments
60 | IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
61 | WAIT `.comments-section` 2
62 | 
63 | # Check specifications
64 | CLICK `button[data-tab="specs"]`
65 | WAIT 1
66 | 
67 | # Final navigation to data tables
68 | CLICK `a[href="#data"]`
69 | WAIT `.data-table` 2
70 | 
71 | # Search in table
72 | CLICK `.search-input`
73 | TYPE "User"
74 | 
75 | # Load more rows
76 | CLICK `.load-more-rows`
77 | WAIT 1
78 | 
79 | # Export data
80 | CLICK `#export-btn`
81 | 
82 | EVAL `console.log('✅ Workflow completed successfully!')`
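
Longer workflows like this are easier to maintain as standalone `.c4a` files. A sketch of loading one from disk and attaching it to a crawl, under the same `c4a_script` assumption as the earlier examples:

```python
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    script = Path("05-complex-workflow.c4a").read_text()
    # The GO command inside the script handles navigation; url is the start page
    config = CrawlerRunConfig(c4a_script=script)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="http://127.0.0.1:8080/playground/", config=config
        )
        print("Workflow succeeded:", result.success)

asyncio.run(main())
```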


--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/test_blockly.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Blockly Test</title>
 7 |     <style>
 8 |         body {
 9 |             margin: 0;
10 |             padding: 20px;
11 |             background: #0e0e10;
12 |             color: #e0e0e0;
13 |             font-family: monospace;
14 |         }
15 |         #blocklyDiv {
16 |             height: 600px;
17 |             width: 100%;
18 |             border: 1px solid #2a2a2c;
19 |         }
20 |         #output {
21 |             margin-top: 20px;
22 |             padding: 15px;
23 |             background: #1a1a1b;
24 |             border: 1px solid #2a2a2c;
25 |             white-space: pre-wrap;
26 |         }
27 |     </style>
28 | </head>
29 | <body>
30 |     <h1>C4A-Script Blockly Test</h1>
31 |     <div id="blocklyDiv"></div>
32 |     <div id="output">
33 |         <h3>Generated C4A-Script:</h3>
34 |         <pre id="code-output"></pre>
35 |     </div>
36 |     
37 |     <script src="https://unpkg.com/blockly/blockly.min.js"></script>
38 |     <script src="assets/c4a-blocks.js"></script>
39 |     <script>
40 |         // Simple test
41 |         const workspace = Blockly.inject('blocklyDiv', {
42 |             toolbox: `
43 |                 <xml>
44 |                     <category name="Test" colour="#1E88E5">
45 |                         <block type="c4a_go"></block>
46 |                         <block type="c4a_wait_time"></block>
47 |                         <block type="c4a_click"></block>
48 |                     </category>
49 |                 </xml>
50 |             `,
51 |             theme: Blockly.Theme.defineTheme('dark', {
52 |                 'base': Blockly.Themes.Classic,
53 |                 'componentStyles': {
54 |                     'workspaceBackgroundColour': '#0e0e10',
55 |                     'toolboxBackgroundColour': '#1a1a1b',
56 |                     'toolboxForegroundColour': '#e0e0e0',
57 |                     'flyoutBackgroundColour': '#1a1a1b',
58 |                     'flyoutForegroundColour': '#e0e0e0',
59 |                 }
60 |             })
61 |         });
62 |         
63 |         workspace.addChangeListener((event) => {
64 |             const code = Blockly.JavaScript.workspaceToCode(workspace);
65 |             document.getElementById('code-output').textContent = code;
66 |         });
67 |     </script>
68 | </body>
69 | </html>


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Bold.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Italic.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Regular.woff2


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/background/service-worker.js:
--------------------------------------------------------------------------------
 1 | // Service worker for Crawl4AI Assistant
 2 | 
 3 | // Handle messages from content script
 4 | chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
 5 |   if (message.action === 'downloadCode' || message.action === 'downloadScript') {
 6 |     try {
 7 |       // Create a data URL for the Python code
 8 |       const dataUrl = 'data:text/plain;charset=utf-8,' + encodeURIComponent(message.code);
 9 |       
10 |       // Download the file
11 |       chrome.downloads.download({
12 |         url: dataUrl,
13 |         filename: message.filename || 'crawl4ai_schema.py',
14 |         saveAs: true
15 |       }, (downloadId) => {
16 |         if (chrome.runtime.lastError) {
17 |           console.error('Download failed:', chrome.runtime.lastError);
18 |           sendResponse({ success: false, error: chrome.runtime.lastError.message });
19 |         } else {
20 |           console.log('Download started with ID:', downloadId);
21 |           sendResponse({ success: true, downloadId: downloadId });
22 |         }
23 |       });
24 |     } catch (error) {
25 |       console.error('Error creating download:', error);
26 |       sendResponse({ success: false, error: error.message });
27 |     }
28 |     
29 |     return true; // Keep the message channel open for async response
30 |   }
31 |   
32 |   return false;
33 | });
34 | 
35 | // Clean up on extension install/update
36 | chrome.runtime.onInstalled.addListener(() => {
37 |   // Clear any stored state
38 |   chrome.storage.local.clear();
39 | });


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/content/content.js:
--------------------------------------------------------------------------------
 1 | // Main content script for Crawl4AI Assistant
 2 | // Coordinates between Click2Crawl, ScriptBuilder, and MarkdownExtraction
 3 | 
 4 | let activeBuilder = null;
 5 | 
 6 | // Listen for messages from popup
 7 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
 8 |   if (request.action === 'startCapture') {
 9 |     if (activeBuilder) {
10 |       console.log('Stopping existing capture session');
11 |       activeBuilder.stop();
12 |       activeBuilder = null;
13 |     }
14 | 
15 |     if (request.mode === 'schema') {
16 |       console.log('Starting Click2Crawl');
17 |       activeBuilder = new Click2Crawl();
18 |       activeBuilder.start();
19 |     } else if (request.mode === 'script') {
20 |       console.log('Starting Script Builder');
21 |       activeBuilder = new ScriptBuilder();
22 |       activeBuilder.start();
23 |     }
24 |     
25 |     sendResponse({ success: true });
26 |   } else if (request.action === 'stopCapture') {
27 |     if (activeBuilder) {
28 |       activeBuilder.stop();
29 |       activeBuilder = null;
30 |     }
31 |     sendResponse({ success: true });
32 |   } else if (request.action === 'startSchemaCapture') {
33 |     if (activeBuilder) {
34 |       activeBuilder.deactivate?.();
35 |       activeBuilder = null;
36 |     }
37 |     console.log('Starting Click2Crawl');
38 |     activeBuilder = new Click2Crawl();
39 |     activeBuilder.start();
40 |     sendResponse({ success: true });
41 |   } else if (request.action === 'startScriptCapture') {
42 |     if (activeBuilder) {
43 |       activeBuilder.deactivate?.();
44 |       activeBuilder = null;
45 |     }
46 |     console.log('Starting Script Builder');
47 |     activeBuilder = new ScriptBuilder();
48 |     activeBuilder.start();
49 |     sendResponse({ success: true });
50 |   } else if (request.action === 'startClick2Crawl') {
51 |     if (activeBuilder) {
52 |       activeBuilder.deactivate?.();
53 |       activeBuilder = null;
54 |     }
55 |     console.log('Starting Markdown Extraction');
56 |     activeBuilder = new MarkdownExtraction();
57 |     sendResponse({ success: true });
58 |   } else if (request.action === 'generateCode') {
59 |     if (activeBuilder && activeBuilder.generateCode) {
60 |       activeBuilder.generateCode();
61 |     }
62 |     sendResponse({ success: true });
63 |   }
64 | });
65 | 
66 | // Cleanup on page unload
67 | window.addEventListener('beforeunload', () => {
68 |   if (activeBuilder) {
69 |     if (activeBuilder.deactivate) {
70 |       activeBuilder.deactivate();
71 |     } else if (activeBuilder.stop) {
72 |       activeBuilder.stop();
73 |     }
74 |     activeBuilder = null;
75 |   }
76 | });
77 | 
78 | console.log('Crawl4AI Assistant content script loaded');


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/favicon.ico


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-128.png


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-16.png


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-48.png


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "manifest_version": 3,
 3 |   "name": "Crawl4AI Assistant",
 4 |   "version": "1.3.0",
 5 |   "description": "Visual schema and script builder for Crawl4AI - Build extraction schemas and automation scripts by clicking and recording actions",
 6 |   "permissions": [
 7 |     "activeTab",
 8 |     "storage",
 9 |     "downloads"
10 |   ],
11 |   "host_permissions": [
12 |     "<all_urls>"
13 |   ],
14 |   "action": {
15 |     "default_popup": "popup/popup.html",
16 |     "default_icon": {
17 |       "16": "icons/icon-16.png",
18 |       "48": "icons/icon-48.png",
19 |       "128": "icons/icon-128.png"
20 |     }
21 |   },
22 |   "content_scripts": [
23 |     {
24 |       "matches": ["<all_urls>"],
25 |       "js": [
26 |         "libs/marked.min.js",
27 |         "content/shared/utils.js",
28 |         "content/markdownPreviewModal.js",
29 |         "content/click2crawl.js",
30 |         "content/scriptBuilder.js",
31 |         "content/contentAnalyzer.js",
32 |         "content/markdownConverter.js",
33 |         "content/markdownExtraction.js",
34 |         "content/content.js"
35 |       ],
36 |       "css": ["content/overlay.css"],
37 |       "run_at": "document_idle"
38 |     }
39 |   ],
40 |   "background": {
41 |     "service_worker": "background/service-worker.js"
42 |   },
43 |   "icons": {
44 |     "16": "icons/icon-16.png",
45 |     "48": "icons/icon-48.png",
46 |     "128": "icons/icon-128.png"
47 |   },
48 |   "web_accessible_resources": [
49 |     {
50 |       "resources": ["icons/*", "assets/*"],
51 |       "matches": ["<all_urls>"]
52 |     }
53 |   ]
54 | }


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/favicon.ico


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-128.png


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-16.png


--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-48.png


--------------------------------------------------------------------------------
/docs/md_v2/ask_ai/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Crawl4AI Assistant</title>
 7 |     <!-- Link main styles first for variable access -->
 8 |     <link rel="stylesheet" href="../assets/layout.css">
 9 |     <link rel="stylesheet" href="../assets/styles.css">
10 |     <!-- Link specific AI styles -->
11 |     <link rel="stylesheet" href="../assets/highlight.css">
12 |     <link rel="stylesheet" href="ask-ai.css">
13 | </head>
14 | <body>
15 |     <div class="ai-assistant-container">
16 | 
17 |         <!-- Left Sidebar: Conversation History -->
18 |         <aside id="history-panel" class="sidebar left-sidebar">
19 |             <header>
20 |                 <h3>History</h3>
21 |                 <button id="new-chat-button" class="btn btn-sm">New Chat</button>
22 |             </header>
23 |             <ul id="history-list">
24 |                 <!-- History items populated by JS -->
25 |             </ul>
26 |         </aside>
27 | 
28 |         <!-- Main Area: Chat Interface -->
29 |         <main id="chat-panel">
30 |             <div id="chat-messages">
31 |                 <!-- Chat messages populated by JS -->
32 |                  <div class="message ai-message welcome-message">
33 |                     Welcome to the Crawl4AI Assistant! How can I help you today?
34 |                  </div>
35 |             </div>
36 |             <div id="chat-input-area">
37 |                 <!-- Loading indicator for general waiting (optional) -->
38 |                 <!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
39 |                 <textarea id="chat-input" placeholder="We will roll out this feature very soon." rows="2" disabled></textarea> 
40 |                 <button id="send-button">Send</button>
41 |             </div>
42 |         </main>
43 | 
44 |         <!-- Right Sidebar: Citations / Context -->
45 |         <aside id="citations-panel" class="sidebar right-sidebar">
46 |             <header>
47 |                 <h3>Citations</h3>
48 |             </header>
49 |             <ul id="citations-list">
50 |                 <!-- Citations populated by JS -->
51 |                 <li class="no-citations">No citations for this response yet.</li>
52 |             </ul>
53 |         </aside>
54 | 
55 |     </div>
56 | 
57 |     <!-- Include Marked.js library -->
58 |     <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
59 |     <script src="../assets/highlight.min.js"></script> 
60 | 
61 |     <!-- Your AI Assistant Logic -->
62 |     <script src="ask-ai.js"></script>
63 | </body>
64 | </html>


--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Bold.woff2


--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Italic.woff2


--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Regular.woff2


--------------------------------------------------------------------------------
/docs/md_v2/assets/Monaco.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/Monaco.woff


--------------------------------------------------------------------------------
/docs/md_v2/assets/copy_code.js:
--------------------------------------------------------------------------------
 1 | // ==== File: docs/assets/copy_code.js ====
 2 | 
 3 | document.addEventListener('DOMContentLoaded', () => {
 4 |     // Target specifically code blocks within the main content area
 5 |     const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
 6 | 
 7 |     codeBlocks.forEach((codeElement) => {
 8 |         const preElement = codeElement.parentElement; // The <pre> tag
 9 | 
10 |         // Ensure the <pre> tag can contain a positioned button
11 |         if (window.getComputedStyle(preElement).position === 'static') {
12 |             preElement.style.position = 'relative';
13 |         }
14 | 
15 |         // Create the button
16 |         const copyButton = document.createElement('button');
17 |         copyButton.className = 'copy-code-button';
18 |         copyButton.type = 'button';
19 |         copyButton.setAttribute('aria-label', 'Copy code to clipboard');
20 |         copyButton.title = 'Copy code to clipboard';
21 |         copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
22 | 
23 |         // Append the button to the <pre> element
24 |         preElement.appendChild(copyButton);
25 | 
26 |         // Add click event listener
27 |         copyButton.addEventListener('click', () => {
28 |             copyCodeToClipboard(codeElement, copyButton);
29 |         });
30 |     });
31 | 
32 |     async function copyCodeToClipboard(codeElement, button) {
33 |         // Use innerText to get the rendered text content, preserving line breaks
34 |         const textToCopy = codeElement.innerText;
35 | 
36 |         try {
37 |             await navigator.clipboard.writeText(textToCopy);
38 | 
39 |             // Visual feedback
40 |             button.innerHTML = 'Copied!';
41 |             button.classList.add('copied');
42 |             button.disabled = true; // Temporarily disable
43 | 
44 |             // Revert button state after a short delay
45 |             setTimeout(() => {
46 |                 button.innerHTML = 'Copy';
47 |                 button.classList.remove('copied');
48 |                 button.disabled = false;
49 |             }, 2000); // Show "Copied!" for 2 seconds
50 | 
51 |         } catch (err) {
52 |             console.error('Failed to copy code: ', err);
53 |             // Optional: Provide error feedback on the button
54 |             button.innerHTML = 'Error';
55 |             setTimeout(() => {
56 |                 button.innerHTML = 'Copy';
57 |             }, 2000);
58 |         }
59 |     }
60 | 
61 |     console.log("Copy Code Button script loaded.");
62 | });


--------------------------------------------------------------------------------
/docs/md_v2/assets/crawl4ai-skill.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/crawl4ai-skill.zip


--------------------------------------------------------------------------------
/docs/md_v2/assets/docs.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/docs.zip


--------------------------------------------------------------------------------
/docs/md_v2/assets/feedback-overrides.css:
--------------------------------------------------------------------------------
 1 | /* docs/assets/feedback-overrides.css */
 2 | :root {
 3 |   /* brand */
 4 |   --feedback-primary-color: #09b5a5;
 5 |   --feedback-highlight-color: #fed500;          /* stars etc */
 6 | 
 7 |   /* modal shell / text */
 8 |   --feedback-modal-content-bg-color: var(--background-color);
 9 |   --feedback-modal-content-text-color: var(--font-color);
10 |   --feedback-modal-content-border-color: var(--primary-dimmed-color);
11 |   --feedback-modal-content-border-radius: 4px;
12 | 
13 |   /* overlay */
14 |   --feedback-overlay-bg-color: rgba(0,0,0,.75);
15 | 
16 |   /* rating buttons */
17 |   --feedback-modal-rating-button-color: var(--secondary-color);
18 |   --feedback-modal-rating-button-selected-color: var(--primary-color);
19 | 
20 |   /* inputs */
21 |   --feedback-modal-input-bg-color: var(--code-bg-color);
22 |   --feedback-modal-input-text-color: var(--font-color);
23 |   --feedback-modal-input-border-color: var(--primary-dimmed-color);
24 |   --feedback-modal-input-border-color-focused: var(--primary-color);
25 | 
26 |   /* submit / secondary buttons */
27 |   --feedback-modal-button-submit-bg-color: var(--primary-color);
28 |   --feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
29 |   --feedback-modal-button-submit-text-color: var(--invert-font-color);
30 | 
31 |   --feedback-modal-button-bg-color: transparent;       /* screenshot btn */
32 |   --feedback-modal-button-border-color: var(--primary-color);
33 |   --feedback-modal-button-icon-color: var(--primary-color);
34 | }
35 | 
36 | /* optional: keep the “Powered by” link subtle */
37 | .feedback-logo a{color:var(--secondary-color);}
38 | 


--------------------------------------------------------------------------------
/docs/md_v2/assets/floating_ask_ai_button.js:
--------------------------------------------------------------------------------
 1 | // ==== File: docs/assets/floating_ask_ai_button.js ====
 2 | 
 3 | document.addEventListener('DOMContentLoaded', () => {
 4 |     const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
 5 |     const currentPath = window.location.pathname;
 6 | 
 7 |     // Determine the base URL for constructing an absolute link.
 8 |     // Note: this assumes the docs are served from the site root;
 9 |     // adjust askAiPagePath above if deployed under a sub-directory.
10 |     const baseUrl = window.location.origin;
11 | 
12 | 
13 |     // Check if the current page IS the Ask AI page
14 |     // Use includes() for flexibility (handles trailing slash or .html)
15 |     if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
16 |         console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
17 |         return; // Don't add the button on the target page
18 |     }
19 | 
20 |     // --- Create the button ---
21 |     const fabLink = document.createElement('a');
22 |     fabLink.className = 'floating-ask-ai-button';
23 |     fabLink.href = baseUrl + askAiPagePath; // Construct the correct absolute URL
24 |     fabLink.title = 'Ask Crawl4AI Assistant';
25 |     fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
26 | 
27 |     // Add content (using SVG icon for better visuals)
28 |     fabLink.innerHTML = `
29 |         <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
30 |             <path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
31 |         </svg>
32 |         <span>Ask AI</span>
33 |     `;
34 | 
35 |     // Append to body
36 |     document.body.appendChild(fabLink);
37 | 
38 |     console.log("Floating Ask AI Button added.");
39 | });


--------------------------------------------------------------------------------
/docs/md_v2/assets/gtag.js:
--------------------------------------------------------------------------------
1 | window.dataLayer = window.dataLayer || [];
2 | function gtag(){dataLayer.push(arguments);}
3 | gtag('js', new Date());
4 | 
5 | gtag('config', 'G-58W0K2ZQ25');
6 | 


--------------------------------------------------------------------------------
/docs/md_v2/assets/highlight.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/highlight.css


--------------------------------------------------------------------------------
/docs/md_v2/assets/highlight_init.js:
--------------------------------------------------------------------------------
1 | document.addEventListener('DOMContentLoaded', (event) => {
2 |     document.querySelectorAll('pre code').forEach((block) => {
3 |         hljs.highlightBlock(block);
4 |     });
5 | });
6 | 


--------------------------------------------------------------------------------
/docs/md_v2/assets/images/dispatcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/images/dispatcher.png


--------------------------------------------------------------------------------
/docs/md_v2/assets/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/images/logo.png


--------------------------------------------------------------------------------
/docs/md_v2/blog/releases/0.7.1.md:
--------------------------------------------------------------------------------
 1 | # 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
 2 | 
 3 | *July 17, 2025 • 2 min read*
 4 | 
 5 | ---
 6 | 
 7 | A small maintenance release that removes unused code and improves documentation.
 8 | 
 9 | ## 🎯 What's Changed
10 | 
11 | - **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
12 | - **Updated documentation** with better examples and parameter explanations
13 | - **Fixed virtual scroll configuration** examples in docs
14 | 
15 | ## 🧹 Code Cleanup
16 | 
17 | Removed the unused `StealthConfig` import and its configuration; neither was referenced anywhere in the codebase. The project uses its own custom stealth implementation via JavaScript injection instead.
18 | 
19 | ```python
20 | # Removed unused code:
21 | from playwright_stealth import StealthConfig
22 | stealth_config = StealthConfig(...)  # This was never used
23 | ```
24 | 
25 | ## 📖 Documentation Updates
26 | 
27 | - Fixed adaptive crawling parameter examples
28 | - Updated session management documentation
29 | - Corrected virtual scroll configuration examples
30 | 
31 | ## 🚀 Installation
32 | 
33 | ```bash
34 | pip install crawl4ai==0.7.1
35 | ```
36 | 
37 | No breaking changes - upgrade directly from v0.7.0.
38 | 
39 | ---
40 | 
41 | Questions? Issues? 
42 | - GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
43 | - Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)


--------------------------------------------------------------------------------
/docs/md_v2/core/cache-modes.md:
--------------------------------------------------------------------------------
 1 | # Crawl4AI Cache System and Migration Guide
 2 | 
 3 | ## Overview
 4 | Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
 5 | 
 6 | ## Old vs New Approach
 7 | 
 8 | ### Old Way (Deprecated)
 9 | The old system used multiple boolean flags:
10 | - `bypass_cache`: Skip cache entirely
11 | - `disable_cache`: Disable all caching
12 | - `no_cache_read`: Don't read from cache
13 | - `no_cache_write`: Don't write to cache
14 | 
15 | ### New Way (Recommended)
16 | The new system uses a single `CacheMode` enum:
17 | - `CacheMode.ENABLED`: Normal caching (read/write)
18 | - `CacheMode.DISABLED`: No caching at all
19 | - `CacheMode.READ_ONLY`: Only read from cache
20 | - `CacheMode.WRITE_ONLY`: Only write to cache
21 | - `CacheMode.BYPASS`: Skip cache for this operation
22 | 
23 | ## Migration Example
24 | 
25 | ### Old Code (Deprecated)
26 | ```python
27 | import asyncio
28 | from crawl4ai import AsyncWebCrawler
29 | 
30 | async def use_proxy():
31 |     async with AsyncWebCrawler(verbose=True) as crawler:
32 |         result = await crawler.arun(
33 |             url="https://www.nbcnews.com/business",
34 |             bypass_cache=True  # Old way
35 |         )
36 |         print(len(result.markdown))
37 | 
38 | async def main():
39 |     await use_proxy()
40 | 
41 | if __name__ == "__main__":
42 |     asyncio.run(main())
43 | ```
44 | 
45 | ### New Code (Recommended)
46 | ```python
47 | import asyncio
48 | from crawl4ai import AsyncWebCrawler, CacheMode
49 | from crawl4ai.async_configs import CrawlerRunConfig
50 | 
51 | async def use_proxy():
52 |     # Use CacheMode in CrawlerRunConfig
53 |     config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)  
54 |     async with AsyncWebCrawler(verbose=True) as crawler:
55 |         result = await crawler.arun(
56 |             url="https://www.nbcnews.com/business",
57 |             config=config  # Pass the configuration object
58 |         )
59 |         print(len(result.markdown))
60 | 
61 | async def main():
62 |     await use_proxy()
63 | 
64 | if __name__ == "__main__":
65 |     asyncio.run(main())
66 | ```
67 | 
68 | ## Common Migration Patterns
69 | 
70 | | Old Flag              | New Mode                          |
71 | |-----------------------|-----------------------------------|
72 | | `bypass_cache=True`   | `cache_mode=CacheMode.BYPASS`     |
73 | | `disable_cache=True`  | `cache_mode=CacheMode.DISABLED`   |
74 | | `no_cache_read=True`  | `cache_mode=CacheMode.WRITE_ONLY` |
75 | | `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`  |
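
## Using Other Modes

A minimal sketch of `CacheMode.READ_ONLY`, which serves previously cached pages without writing new entries; useful when re-processing already-crawled content:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig

async def reprocess_from_cache():
    # READ_ONLY: serve from cache if present, never write new entries
    config = CrawlerRunConfig(cache_mode=CacheMode.READ_ONLY)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=config,
        )
        print(len(result.markdown))

asyncio.run(reprocess_from_cache())
```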


--------------------------------------------------------------------------------
/docs/md_v2/core/llmtxt.md:
--------------------------------------------------------------------------------
 1 | <div class="llmtxt-container">
 2 | <iframe id="llmtxt-frame" src="../../llmtxt/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI LLM Context Builder"></iframe>
 3 | </div>
 4 | 
 5 | <script>
 6 | // Iframe height adjustment
 7 | function resizeLLMtxtIframe() {
 8 |   const iframe = document.getElementById('llmtxt-frame');
 9 |   if (iframe) {
10 |     const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
11 |     const topOffset = headerHeight + 20;
12 |     const availableHeight = window.innerHeight - topOffset;
13 |     iframe.style.height = Math.max(800, availableHeight) + 'px';
14 |   }
15 | }
16 | 
17 | // Run immediately and on resize/load
18 | resizeLLMtxtIframe();
19 | let resizeTimer;
20 | window.addEventListener('load', resizeLLMtxtIframe);
21 | window.addEventListener('resize', () => {
22 |     clearTimeout(resizeTimer);
23 |     resizeTimer = setTimeout(resizeLLMtxtIframe, 150);
24 | });
25 | 
26 | // Remove Footer & HR from parent page
27 | document.addEventListener('DOMContentLoaded', () => {
28 |     setTimeout(() => {
29 |         const footer = window.parent.document.querySelector('footer');
30 |         if (footer) {
31 |             const hrBeforeFooter = footer.previousElementSibling;
32 |             if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
33 |                 hrBeforeFooter.remove();
34 |             }
35 |             footer.remove();
36 |             resizeLLMtxtIframe();
37 |         }
38 |     }, 100);
39 | });
40 | </script>
41 | 
42 | <style>
43 | #terminal-mkdocs-main-content {
44 |     padding: 0 !important;
45 |     margin: 0;
46 |     width: 100%;
47 |     height: 100%;
48 |     overflow: hidden;
49 | }
50 | 
51 | #terminal-mkdocs-main-content .llmtxt-container {
52 |     margin: 0;
53 |     padding: 0;
54 |     max-width: none;
55 |     overflow: hidden;
56 | }
57 | 
58 | #terminal-mkdocs-toc-panel {
59 |     display: none !important;
60 | }
61 | </style>


--------------------------------------------------------------------------------
/docs/md_v2/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/favicon.ico


--------------------------------------------------------------------------------
/docs/md_v2/img/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon-32x32.png


--------------------------------------------------------------------------------
/docs/md_v2/img/favicon-x-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon-x-32x32.png


--------------------------------------------------------------------------------
/docs/md_v2/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon.ico


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/README.md:
--------------------------------------------------------------------------------
 1 | # Crawl4AI Marketplace
 2 | 
 3 | A terminal-themed marketplace for tools, integrations, and resources related to Crawl4AI.
 4 | 
 5 | ## Setup
 6 | 
 7 | ### Backend
 8 | 
 9 | 1. Install dependencies:
10 | ```bash
11 | cd backend
12 | pip install -r requirements.txt
13 | ```
14 | 
15 | 2. Generate dummy data:
16 | ```bash
17 | python dummy_data.py
18 | ```
19 | 
20 | 3. Run the server:
21 | ```bash
22 | python server.py
23 | ```
24 | 
25 | The API will be available at http://localhost:8100
26 | 
27 | ### Frontend
28 | 
29 | 1. Open `frontend/index.html` in your browser
30 | 2. Or serve via MkDocs as part of the documentation site
31 | 
32 | ## Database Schema
33 | 
34 | The marketplace uses SQLite with automatic migration from `schema.yaml`. Tables include:
35 | - **apps**: Tools and integrations
36 | - **articles**: Reviews, tutorials, and news
37 | - **categories**: App categories
38 | - **sponsors**: Sponsored content
39 | 
40 | ## API Endpoints
41 | 
42 | - `GET /api/apps` - List apps with filters
43 | - `GET /api/articles` - List articles
44 | - `GET /api/categories` - Get all categories
45 | - `GET /api/sponsors` - Get active sponsors
46 | - `GET /api/search?q=query` - Search across content
47 | - `GET /api/stats` - Marketplace statistics
48 | 
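### Example Request

A quick smoke test of the search endpoint from Python, once the server from the setup steps above is running:

```python
import requests

# Search across apps and articles on the local marketplace API
resp = requests.get(
    "http://localhost:8100/api/search",
    params={"q": "playwright"},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())
```
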
49 | ## Features
50 | 
51 | - **Smart caching**: LocalStorage with TTL (1 hour)
52 | - **Terminal theme**: Consistent with Crawl4AI branding
53 | - **Responsive design**: Works on all devices
54 | - **Fast search**: Debounced with 300ms delay
55 | - **CORS protected**: Only crawl4ai.com and localhost
56 | 
57 | ## Admin Panel
58 | 
59 | Coming soon. For now, edit the database directly or modify `dummy_data.py`.
60 | 
61 | ## Deployment
62 | 
63 | For production deployment on EC2:
64 | 1. Update `API_BASE` in `marketplace.js` to production URL
65 | 2. Run FastAPI with proper production settings (use gunicorn/uvicorn)
66 | 3. Set up nginx proxy if needed


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/.env.example:
--------------------------------------------------------------------------------
 1 | # Marketplace Configuration
 2 | # Copy this to .env and update with your values
 3 | 
 4 | # Admin password (required)
 5 | MARKETPLACE_ADMIN_PASSWORD=change_this_password
 6 | 
 7 | # JWT secret key (required) - generate with: python3 -c "import secrets; print(secrets.token_urlsafe(32))"
 8 | MARKETPLACE_JWT_SECRET=change_this_to_a_secure_random_key
 9 | 
10 | # Database path (optional, defaults to ./marketplace.db)
11 | MARKETPLACE_DB_PATH=./marketplace.db
12 | 
13 | # Token expiry in hours (optional, defaults to 4)
14 | MARKETPLACE_TOKEN_EXPIRY=4


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Marketplace Configuration - Loads from .env file
 3 | """
 4 | import os
 5 | import sys
 6 | import hashlib
 7 | from pathlib import Path
 8 | from dotenv import load_dotenv
 9 | 
10 | # Load .env file
11 | env_path = Path(__file__).parent / '.env'
12 | if not env_path.exists():
13 |     print("\n❌ ERROR: No .env file found!")
14 |     print("Please copy .env.example to .env and update with your values:")
15 |     print(f"  cp {Path(__file__).parent}/.env.example {Path(__file__).parent}/.env")
16 |     print("\nThen edit .env with your secure values.")
17 |     sys.exit(1)
18 | 
19 | load_dotenv(env_path)
20 | 
21 | # Required environment variables
22 | required_vars = ['MARKETPLACE_ADMIN_PASSWORD', 'MARKETPLACE_JWT_SECRET']
23 | missing_vars = [var for var in required_vars if not os.getenv(var)]
24 | 
25 | if missing_vars:
26 |     print(f"\n❌ ERROR: Missing required environment variables: {', '.join(missing_vars)}")
27 |     print("Please check your .env file and ensure all required variables are set.")
28 |     sys.exit(1)
29 | 
30 | class Config:
31 |     """Configuration loaded from environment variables"""
32 | 
33 |     # Admin authentication - hashed from password in .env
34 |     ADMIN_PASSWORD_HASH = hashlib.sha256(
35 |         os.getenv('MARKETPLACE_ADMIN_PASSWORD').encode()
36 |     ).hexdigest()
37 | 
38 |     # JWT secret for token generation
39 |     JWT_SECRET_KEY = os.getenv('MARKETPLACE_JWT_SECRET')
40 | 
41 |     # Database path
42 |     DATABASE_PATH = os.getenv('MARKETPLACE_DB_PATH', './marketplace.db')
43 | 
44 |     # Token expiry in hours
45 |     TOKEN_EXPIRY_HOURS = int(os.getenv('MARKETPLACE_TOKEN_EXPIRY', '4'))
46 | 
47 |     # CORS origins - hardcoded as they don't contain secrets
48 |     ALLOWED_ORIGINS = [
49 |         "http://localhost:8000",
50 |         "http://localhost:8080",
51 |         "http://localhost:8100",
52 |         "http://127.0.0.1:8000",
53 |         "http://127.0.0.1:8080",
54 |         "http://127.0.0.1:8100",
55 |         "https://crawl4ai.com",
56 |         "https://www.crawl4ai.com",
57 |         "https://docs.crawl4ai.com",
58 |         "https://market.crawl4ai.com"
59 |     ]
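
The server is expected to consume `Config.ALLOWED_ORIGINS` roughly as below. This is only a sketch of the intent behind the hardcoded list, not the actual `server.py`:

```python
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from config import Config

app = FastAPI()

# Restrict browser access to the origins whitelisted in Config
app.add_middleware(
    CORSMiddleware,
    allow_origins=Config.ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
```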


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | uvicorn
3 | pyyaml
4 | python-multipart
5 | python-dotenv


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/schema.yaml:
--------------------------------------------------------------------------------
 1 | database:
 2 |   name: marketplace.db
 3 | 
 4 | tables:
 5 |   apps:
 6 |     columns:
 7 |       id: {type: INTEGER, primary: true, autoincrement: true}
 8 |       name: {type: TEXT, required: true}
 9 |       slug: {type: TEXT, unique: true}
10 |       description: {type: TEXT}
11 |       long_description: {type: TEXT}
12 |       logo_url: {type: TEXT}
13 |       image: {type: TEXT}
14 |       screenshots: {type: JSON, default: '[]'}
15 |       category: {type: TEXT}
16 |       type: {type: TEXT, default: 'Open Source'}
17 |       status: {type: TEXT, default: 'Active'}
18 |       website_url: {type: TEXT}
19 |       github_url: {type: TEXT}
20 |       demo_url: {type: TEXT}
21 |       video_url: {type: TEXT}
22 |       documentation_url: {type: TEXT}
23 |       support_url: {type: TEXT}
24 |       discord_url: {type: TEXT}
25 |       pricing: {type: TEXT}
26 |       rating: {type: REAL, default: 0.0}
27 |       downloads: {type: INTEGER, default: 0}
28 |       featured: {type: BOOLEAN, default: 0}
29 |       sponsored: {type: BOOLEAN, default: 0}
30 |       integration_guide: {type: TEXT}
31 |       documentation: {type: TEXT}
32 |       examples: {type: TEXT}
33 |       installation_command: {type: TEXT}
34 |       requirements: {type: TEXT}
35 |       changelog: {type: TEXT}
36 |       tags: {type: JSON, default: '[]'}
37 |       added_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
38 |       updated_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
39 |       contact_email: {type: TEXT}
40 |       views: {type: INTEGER, default: 0}
41 | 
42 |   articles:
43 |     columns:
44 |       id: {type: INTEGER, primary: true, autoincrement: true}
45 |       title: {type: TEXT, required: true}
46 |       slug: {type: TEXT, unique: true}
47 |       content: {type: TEXT}
48 |       author: {type: TEXT, default: 'Crawl4AI Team'}
49 |       category: {type: TEXT}
50 |       related_apps: {type: JSON, default: '[]'}
51 |       featured_image: {type: TEXT}
52 |       published_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
53 |       tags: {type: JSON, default: '[]'}
54 |       views: {type: INTEGER, default: 0}
55 | 
56 |   categories:
57 |     columns:
58 |       id: {type: INTEGER, primary: true, autoincrement: true}
59 |       name: {type: TEXT, unique: true}
60 |       slug: {type: TEXT, unique: true}
61 |       icon: {type: TEXT}
62 |       description: {type: TEXT}
63 |       order_index: {type: INTEGER, default: 0}
64 | 
65 |   sponsors:
66 |     columns:
67 |       id: {type: INTEGER, primary: true, autoincrement: true}
68 |       company_name: {type: TEXT, required: true}
69 |       logo_url: {type: TEXT}
70 |       tier: {type: TEXT, default: 'Bronze'}
71 |       banner_url: {type: TEXT}
72 |       landing_url: {type: TEXT}
73 |       active: {type: BOOLEAN, default: 1}
74 |       start_date: {type: DATETIME}
75 |       end_date: {type: DATETIME}


--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/uploads/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/docs/md_v2/overrides/main.html:
--------------------------------------------------------------------------------
 1 | {% set extra_html_attrs = 'data-theme="dark"' %}
 2 | {% extends "base.html" %}
 3 | 
 4 | {% block extrahead %}
 5 | {{ super() }}
 6 | <script>
 7 |     document.documentElement.setAttribute("data-theme", "dark");
 8 | </script>
 9 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
10 | 
11 | <style>
12 |     :root {
13 |         /* brand */
14 |         --feedback-primary-color: #09b5a5;
15 |         --feedback-highlight-color: #fed500;
16 | 
17 | 
18 |         /* align with the value you really use in :root */
19 |         --header-height: 65px;
20 | 
21 |         /* Push modal content down */
22 |         --feedback-modal-content-position-top: var(--header-height);
23 | 
24 |         --feedback-modal-modal-wrapper-z-index: 1100;
25 |         /*  >  header’s 1000 */
26 |         --feedback-modal-content-z-index: 1101;
27 |     }
28 | 
29 |     feedback-modal::part(overlay) {
30 |         top: var(--header-height);
31 |         /* start below header */
32 |         height: calc(100vh - var(--header-height));
33 |         /* fill the rest */
34 | 
35 | 
36 |     }
37 | </style>
38 | <script type="module"
39 |     src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
40 | {% endblock %}
41 | 
42 | {% block footer %}
43 | <feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right"
44 |     modal-position="sidebar-right">
45 |     Feedback
46 | </feedback-button>
47 | {% endblock %}


--------------------------------------------------------------------------------
/docs/tutorials/coming_soon.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/tutorials/coming_soon.md


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Note: These requirements are also specified in pyproject.toml
 2 | # This file is kept for development environment setup and compatibility
 3 | aiofiles>=24.1.0
 4 | aiohttp>=3.11.11
 5 | aiosqlite~=0.20
 6 | anyio>=4.0.0
 7 | lxml~=5.3
 8 | litellm>=1.53.1
 9 | numpy>=1.26.0,<3
10 | pillow>=10.4
11 | playwright>=1.49.0
12 | patchright>=1.49.0
13 | python-dotenv~=1.0
14 | requests~=2.26
15 | beautifulsoup4~=4.12
16 | tf-playwright-stealth>=1.1.0
17 | xxhash~=3.4
18 | rank-bm25~=0.2
19 | colorama~=0.4
20 | snowballstemmer~=2.2
21 | pydantic>=2.10
22 | pyOpenSSL>=24.3.0
23 | psutil>=6.1.1
24 | PyYAML>=6.0
25 | nltk>=3.9.1
26 | rich>=13.9.4
27 | cssselect>=1.2.0
28 | chardet>=5.2.0
29 | brotli>=1.1.0
30 | httpx[http2]>=0.27.2
31 | alphashape>=1.3.1
32 | shapely>=2.0.0
33 | 
34 | fake-useragent>=2.2.0
35 | pdf2image>=1.17.0
36 | PyPDF2>=3.0.1


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [options]
2 | include_package_data = True


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | import os
 3 | from pathlib import Path
 4 | import shutil
 5 | 
 6 | # Note: Most configuration is now in pyproject.toml
 7 | # This setup.py is kept for backwards compatibility
 8 | 
 9 | # Create the .crawl4ai folder in the user's home directory if it doesn't exist
10 | # If the folder already exists, remove the cache folder
11 | base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
12 | crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
13 | crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
14 | cache_folder = crawl4ai_folder / "cache"
15 | content_folders = [
16 |     "html_content",
17 |     "cleaned_html",
18 |     "markdown_content",
19 |     "extracted_content",
20 |     "screenshots",
21 | ]
22 | 
23 | # Clean up old cache if exists
24 | if cache_folder.exists():
25 |     shutil.rmtree(cache_folder)
26 | 
27 | # Create new folder structure
28 | crawl4ai_folder.mkdir(exist_ok=True)
29 | cache_folder.mkdir(exist_ok=True)
30 | for folder in content_folders:
31 |     (crawl4ai_folder / folder).mkdir(exist_ok=True)
32 | 
33 | version = "0.0.0"  # This will be overridden by pyproject.toml's dynamic version
34 | try:
35 |     with open("crawl4ai/__version__.py") as f:
36 |         for line in f:
37 |             if line.startswith("__version__"):
38 |                 version = line.split("=")[1].strip().strip('"')
39 |                 break
40 | except Exception:
41 |     pass  # Let pyproject.toml handle version
42 | 
43 | setup(
44 |     name="Crawl4AI",
45 |     version=version,
46 |     description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
47 |     long_description=open("README.md", encoding="utf-8").read(),
48 |     long_description_content_type="text/markdown",
49 |     url="https://github.com/unclecode/crawl4ai",
50 |     author="Unclecode",
51 |     author_email="unclecode@kidocode.com",
52 |     license="Apache-2.0",
53 |     packages=find_packages(),
54 |     package_data={"crawl4ai": ["js_snippet/*.js"]},
55 |     classifiers=[
56 |         "Development Status :: 3 - Alpha",
57 |         "Intended Audience :: Developers",
58 |         "Programming Language :: Python :: 3",
59 |         "Programming Language :: Python :: 3.10",
60 |         "Programming Language :: Python :: 3.11",
61 |         "Programming Language :: Python :: 3.12",
62 |         "Programming Language :: Python :: 3.13",
63 |     ],
64 |     python_requires=">=3.10",
65 | )
66 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/tests/__init__.py


--------------------------------------------------------------------------------
/tests/async/test_crawler_strategy.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import pytest
 4 | 
 5 | # Add the parent directory to the Python path
 6 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 7 | sys.path.append(parent_dir)
 8 | 
 9 | from crawl4ai.async_webcrawler import AsyncWebCrawler
10 | 
11 | 
12 | @pytest.mark.asyncio
13 | async def test_custom_user_agent():
14 |     async with AsyncWebCrawler(verbose=True) as crawler:
15 |         custom_user_agent = "MyCustomUserAgent/1.0"
16 |         crawler.crawler_strategy.update_user_agent(custom_user_agent)
17 |         url = "https://httpbin.org/user-agent"
18 |         result = await crawler.arun(url=url, bypass_cache=True)
19 |         assert result.success
20 |         assert custom_user_agent in result.html
21 | 
22 | 
23 | @pytest.mark.asyncio
24 | async def test_custom_headers():
25 |     async with AsyncWebCrawler(verbose=True) as crawler:
26 |         custom_headers = {"X-Test-Header": "TestValue"}
27 |         crawler.crawler_strategy.set_custom_headers(custom_headers)
28 |         url = "https://httpbin.org/headers"
29 |         result = await crawler.arun(url=url, bypass_cache=True)
30 |         assert result.success
31 |         assert "X-Test-Header" in result.html
32 |         assert "TestValue" in result.html
33 | 
34 | 
35 | @pytest.mark.asyncio
36 | async def test_javascript_execution():
37 |     async with AsyncWebCrawler(verbose=True) as crawler:
38 |         js_code = "document.body.innerHTML = '<h1>Modified by JS</h1>';"
39 |         url = "https://www.example.com"
40 |         result = await crawler.arun(url=url, bypass_cache=True, js_code=js_code)
41 |         assert result.success
42 |         assert "<h1>Modified by JS</h1>" in result.html
43 | 
44 | 
45 | @pytest.mark.asyncio
46 | async def test_hook_execution():
47 |     async with AsyncWebCrawler(verbose=True) as crawler:
48 | 
49 |         async def test_hook(page):
50 |             await page.evaluate("document.body.style.backgroundColor = 'red';")
51 |             return page
52 | 
53 |         crawler.crawler_strategy.set_hook("after_goto", test_hook)
54 |         url = "https://www.example.com"
55 |         result = await crawler.arun(url=url, bypass_cache=True)
56 |         assert result.success
57 |         assert "background-color: red" in result.html
58 | 
59 | 
60 | @pytest.mark.asyncio
61 | async def test_screenshot():
62 |     async with AsyncWebCrawler(verbose=True) as crawler:
63 |         url = "https://www.example.com"
64 |         result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
65 |         assert result.success
66 |         assert result.screenshot
67 |         assert isinstance(result.screenshot, str)
68 |         assert len(result.screenshot) > 0
69 | 
70 | 
71 | # Entry point for debugging
72 | if __name__ == "__main__":
73 |     pytest.main([__file__, "-v"])
74 | 
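
These tests use the deprecated `bypass_cache=True` flag. Per the cache migration guide in `docs/md_v2/core/cache-modes.md`, the modern equivalent passes a config object; a sketch of the first test rewritten in that style:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig

async def test_custom_user_agent_new_style():
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.update_user_agent("MyCustomUserAgent/1.0")
        # CacheMode.BYPASS replaces the deprecated bypass_cache=True flag
        config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(
            url="https://httpbin.org/user-agent", config=config
        )
        assert result.success

if __name__ == "__main__":
    asyncio.run(test_custom_user_agent_new_style())
```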


--------------------------------------------------------------------------------
/tests/async/test_performance.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import pytest
 4 | import time
 5 | 
 6 | # Add the parent directory to the Python path
 7 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 8 | sys.path.append(parent_dir)
 9 | 
10 | from crawl4ai.async_webcrawler import AsyncWebCrawler
11 | 
12 | 
13 | @pytest.mark.asyncio
14 | async def test_crawl_speed():
15 |     async with AsyncWebCrawler(verbose=True) as crawler:
16 |         url = "https://www.nbcnews.com/business"
17 |         start_time = time.time()
18 |         result = await crawler.arun(url=url, bypass_cache=True)
19 |         end_time = time.time()
20 | 
21 |         assert result.success
22 |         crawl_time = end_time - start_time
23 |         print(f"Crawl time: {crawl_time:.2f} seconds")
24 | 
25 |         assert crawl_time < 10, f"Crawl took too long: {crawl_time:.2f} seconds"
26 | 
27 | 
28 | @pytest.mark.asyncio
29 | async def test_concurrent_crawling_performance():
30 |     async with AsyncWebCrawler(verbose=True) as crawler:
31 |         urls = [
32 |             "https://www.nbcnews.com/business",
33 |             "https://www.example.com",
34 |             "https://www.python.org",
35 |             "https://www.github.com",
36 |             "https://www.stackoverflow.com",
37 |         ]
38 | 
39 |         start_time = time.time()
40 |         results = await crawler.arun_many(urls=urls, bypass_cache=True)
41 |         end_time = time.time()
42 | 
43 |         total_time = end_time - start_time
44 |         print(f"Total time for concurrent crawling: {total_time:.2f} seconds")
45 | 
46 |         assert all(result.success for result in results)
47 |         assert len(results) == len(urls)
48 | 
49 |         assert (
50 |             total_time < len(urls) * 5
51 |         ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
52 | 
53 | 
54 | @pytest.mark.asyncio
55 | async def test_crawl_speed_with_caching():
56 |     async with AsyncWebCrawler(verbose=True) as crawler:
57 |         url = "https://www.nbcnews.com/business"
58 | 
59 |         start_time = time.time()
60 |         result1 = await crawler.arun(url=url, bypass_cache=True)
61 |         end_time = time.time()
62 |         first_crawl_time = end_time - start_time
63 | 
64 |         start_time = time.time()
65 |         result2 = await crawler.arun(url=url, bypass_cache=False)
66 |         end_time = time.time()
67 |         second_crawl_time = end_time - start_time
68 | 
69 |         assert result1.success and result2.success
70 |         print(f"First crawl time: {first_crawl_time:.2f} seconds")
71 |         print(f"Second crawl time (cached): {second_crawl_time:.2f} seconds")
72 | 
73 |         assert (
74 |             second_crawl_time < first_crawl_time / 2
75 |         ), "Cached crawl not significantly faster"
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     pytest.main([__file__, "-v"])
80 | 
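
Hard-coded wall-clock limits like `crawl_time < 10` are sensitive to network conditions. One way to keep the assertion while letting CI loosen it is to read the budget from an environment variable — a sketch, where `CRAWL_TIME_BUDGET_S` is a hypothetical variable, not something the repo defines:

    import os
    import time

    # Hypothetical knob: override the default 10-second budget via the environment.
    budget = float(os.environ.get("CRAWL_TIME_BUDGET_S", "10"))

    start = time.perf_counter()
    time.sleep(0.1)  # stand-in for the crawl under test
    elapsed = time.perf_counter() - start
    assert elapsed < budget, f"Crawl took {elapsed:.2f}s (budget {budget:.2f}s)"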


--------------------------------------------------------------------------------
/tests/browser/docker/__init__.py:
--------------------------------------------------------------------------------
1 | """Docker browser strategy tests.
2 | 
3 | This package contains tests for the Docker browser strategy implementation.
4 | """


--------------------------------------------------------------------------------
/tests/browser/test_combined.py:
--------------------------------------------------------------------------------
 1 | """Combined test runner for all browser module tests.
 2 | 
 3 | This script runs all the browser module tests in sequence and
 4 | provides a comprehensive summary.
 5 | """
 6 | 
 7 | import asyncio
 8 | import os
 9 | import sys
10 | import time
11 | 
12 | # Add the project root to Python path if running directly
13 | if __name__ == "__main__":
14 |     sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
15 | 
16 | from crawl4ai.async_logger import AsyncLogger
17 | 
18 | # Create a logger for clear terminal output
19 | logger = AsyncLogger(verbose=True, log_file=None)
20 | 
21 | async def run_test_module(module_name, header):
22 |     """Run all tests in a module and return results."""
23 |     logger.info(f"\n{'-'*30}", tag="TEST")
24 |     logger.info(f"RUNNING: {header}", tag="TEST")
25 |     logger.info(f"{'-'*30}", tag="TEST")
26 |     
27 |     # Import the module dynamically
28 |     module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
29 |     
30 |     # Track time for performance measurement
31 |     start_time = time.time()
32 |     
33 |     # Run the tests
34 |     await module.run_tests()
35 |     
36 |     # Calculate time taken
37 |     time_taken = time.time() - start_time
38 |     logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
39 |     
40 |     return time_taken
41 | 
42 | async def main():
43 |     """Run all test modules."""
44 |     logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
45 |     
46 |     # List of test modules to run
47 |     test_modules = [
48 |         ("test_browser_manager", "Browser Manager Tests"),
49 |         ("test_playwright_strategy", "Playwright Strategy Tests"),
50 |         ("test_cdp_strategy", "CDP Strategy Tests"),
51 |         ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
52 |         ("test_profiles", "Profile Management Tests")
53 |     ]
54 |     
55 |     # Run each test module
56 |     timings = {}
57 |     for module_name, header in test_modules:
58 |         try:
59 |             time_taken = await run_test_module(module_name, header)
60 |             timings[module_name] = time_taken
61 |         except Exception as e:
62 |             logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
63 |     
64 |     # Print summary
65 |     logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
66 |     logger.info(f"{'-'*50}", tag="SUMMARY")
67 |     for module_name, header in test_modules:
68 |         if module_name in timings:
69 |             logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
70 |         else:
71 |             logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
72 |     logger.info(f"{'-'*50}", tag="SUMMARY")
73 |     total_time = sum(timings.values())
74 |     logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
75 | 
76 | if __name__ == "__main__":
77 |     asyncio.run(main())
78 | 
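
The runner above uses `__import__` with a `fromlist` workaround; `importlib.import_module` handles dotted paths directly and is the documented alternative. A minimal sketch of the same lookup:

    import importlib

    async def run_module(module_name: str) -> None:
        # No fromlist trick needed: import_module returns the leaf module.
        module = importlib.import_module(f"tests.browser.{module_name}")
        await module.run_tests()  # each test module exposes run_tests()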


--------------------------------------------------------------------------------
/tests/browser/test_launch_standalone.py:
--------------------------------------------------------------------------------
 1 | from crawl4ai.browser_profiler import BrowserProfiler
 2 | import asyncio
 3 | 
 4 | 
 5 | if __name__ == "__main__":
 6 |     # Test launching a standalone browser
 7 |     async def test_standalone_browser():
 8 |         profiler = BrowserProfiler()
 9 |         cdp_url = await profiler.launch_standalone_browser(
10 |             browser_type="chromium",
11 |             user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
12 |             debugging_port=9222,
13 |             headless=False
14 |         )
15 |         print(f"CDP URL: {cdp_url}")
16 | 
17 |     asyncio.run(test_standalone_browser())


--------------------------------------------------------------------------------
/tests/docker/test_dockerclient.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai.docker_client import Crawl4aiDockerClient
 3 | from crawl4ai import (
 4 |     BrowserConfig,
 5 |     CrawlerRunConfig
 6 | )
 7 | 
 8 | async def main():
 9 |     async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
10 |         await client.authenticate("test@example.com")
11 |         
12 |         # Non-streaming crawl
13 |         results = await client.crawl(
14 |             ["https://example.com", "https://python.org"],
15 |             browser_config=BrowserConfig(headless=True),
16 |             crawler_config=CrawlerRunConfig()
17 |         )
18 |         print(f"Non-streaming results: {results}")
19 |         
20 |         # Streaming crawl
21 |         crawler_config = CrawlerRunConfig(stream=True)
22 |         async for result in await client.crawl(
23 |             ["https://example.com", "https://python.org"],
24 |             browser_config=BrowserConfig(headless=True),
25 |             crawler_config=crawler_config
26 |         ):
27 |             print(f"Streamed result: {result}")
28 |         
29 |         # Get schema
30 |         schema = await client.get_schema()
31 |         print(f"Schema: {schema}")
32 | 
33 | if __name__ == "__main__":
34 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import (
 3 |     AsyncWebCrawler,
 4 |     CrawlerRunConfig,
 5 |     HTTPCrawlerConfig,
 6 |     CacheMode,
 7 |     DefaultMarkdownGenerator,
 8 |     PruningContentFilter
 9 | )
10 | from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
11 | from crawl4ai.async_logger import AsyncLogger
12 | 
13 | async def main():
14 |     # Initialize HTTP crawler strategy
15 |     http_strategy = AsyncHTTPCrawlerStrategy(
16 |         browser_config=HTTPCrawlerConfig(
17 |             method="GET",
18 |             verify_ssl=True,
19 |             follow_redirects=True
20 |         ),
21 |         logger=AsyncLogger(verbose=True)
22 |     )
23 | 
24 |     # Initialize web crawler with HTTP strategy
25 |     async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
26 |         crawler_config = CrawlerRunConfig(
27 |             cache_mode=CacheMode.BYPASS,
28 |             markdown_generator=DefaultMarkdownGenerator(
29 |                 content_filter=PruningContentFilter(
30 |                     threshold=0.48, 
31 |                     threshold_type="fixed", 
32 |                     min_word_threshold=0
33 |                 )
34 |             )
35 |         )
36 |         
37 |         # Test different URLs
38 |         urls = [
39 |             "https://example.com",
40 |             "https://httpbin.org/get",
41 |             "raw://<html><body>Test content</body></html>"
42 |         ]
43 |         
44 |         for url in urls:
45 |             print(f"\n=== Testing {url} ===")
46 |             try:
47 |                 result = await crawler.arun(url=url, config=crawler_config)
48 |                 print(f"Status: {result.status_code}")
49 |                 print(f"Raw HTML length: {len(result.html)}")
50 |                 if hasattr(result, 'markdown'):
51 |                     print(f"Markdown length: {len(result.markdown.raw_markdown)}")
52 |             except Exception as e:
53 |                 print(f"Error: {e}")
54 | 
55 | if __name__ == "__main__":
56 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/general/test_advanced_deep_crawl.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import time
 3 | 
 4 | 
 5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 6 | from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
 8 | from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
 9 | from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
10 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
11 | 
12 | 
13 | async def main():
14 |     """Example deep crawl of documentation site."""
15 |     filter_chain = FilterChain([
16 |         URLPatternFilter(patterns=["*2025*"]),
17 |         DomainFilter(allowed_domains=["techcrunch.com"]),
18 |         ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
19 |         ContentTypeFilter(allowed_types=["text/html","application/javascript"])
20 |     ])
21 |     config = CrawlerRunConfig(
22 |         deep_crawl_strategy = BestFirstCrawlingStrategy(
23 |             max_depth=2,
24 |             include_external=False,
25 |             filter_chain=filter_chain,
26 |             url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
27 |         ),
28 |         stream=False,
29 |         verbose=True,
30 |         cache_mode=CacheMode.BYPASS,
31 |         scraping_strategy=LXMLWebScrapingStrategy()
32 |     )
33 | 
34 |     async with AsyncWebCrawler() as crawler:
35 |         print("Starting deep crawl in streaming mode:")
36 |         config.stream = True
37 |         start_time = time.perf_counter()
38 |         async for result in await crawler.arun(
39 |             url="https://techcrunch.com",
40 |             config=config
41 |         ):
42 |             print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
43 |         print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
44 | 
45 | if __name__ == "__main__":
46 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/general/test_crawlers.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # example_usage.py
 3 | import asyncio
 4 | from crawl4ai.crawlers import get_crawler
 5 | 
 6 | async def main():
 7 |     # Get the registered crawler
 8 |     example_crawler = get_crawler("example_site.content")
 9 |     
10 |     # Crawl example.com
11 |     result = await example_crawler(url="https://example.com")
12 |         
13 |     print(result)
14 |             
15 | 
16 | if __name__ == "__main__":
17 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/general/test_deep_crawl.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import time
 3 | 
 4 | 
 5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 6 | from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 8 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
 9 | 
10 | 
11 | async def main():
12 |     """Example deep crawl of documentation site."""
13 |     config = CrawlerRunConfig(
14 |         deep_crawl_strategy = BFSDeepCrawlStrategy(
15 |             max_depth=2,
16 |             include_external=False
17 |         ),
18 |         stream=False,
19 |         verbose=True,
20 |         cache_mode=CacheMode.BYPASS,
21 |         scraping_strategy=LXMLWebScrapingStrategy()
22 |     )
23 | 
24 |     async with AsyncWebCrawler() as crawler:
25 |         start_time = time.perf_counter()
26 |         print("\nStarting deep crawl in batch mode:")
27 |         results = await crawler.arun(
28 |             url="https://docs.crawl4ai.com",
29 |             config=config
30 |         )
31 |         print(f"Crawled {len(results)} pages")
32 |         print(f"Example page: {results[0].url}")
33 |         print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
34 | 
35 |         print("Starting deep crawl in streaming mode:")
36 |         config.stream = True
37 |         start_time = time.perf_counter()
38 |         async for result in await crawler.arun(
39 |             url="https://docs.crawl4ai.com",
40 |             config=config
41 |         ):
42 |             print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
43 |         print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
44 | 
45 | if __name__ == "__main__":
46 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/general/test_download_file.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, BrowserConfig
 3 | from pathlib import Path
 4 | import os
 5 | 
 6 | async def test_basic_download():
 7 |     
 8 |     # Custom folder (otherwise defaults to ~/.crawl4ai/downloads)
 9 |     downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
10 |     os.makedirs(downloads_path, exist_ok=True)
11 |     browser_config = BrowserConfig(
12 |         accept_downloads=True,
13 |         downloads_path=downloads_path
14 |     )
15 |     async with AsyncWebCrawler(config=browser_config) as crawler:
16 |         run_config = CrawlerRunConfig(
17 |             js_code="""
18 |                 const link = document.querySelector('a[href$=".exe"]');
19 |                 if (link) { link.click(); }
20 |             """,
21 |             delay_before_return_html=5  
22 |         )
23 |         result = await crawler.arun("https://www.python.org/downloads/", config=run_config)
24 | 
25 |         if result.downloaded_files:
26 |             print("Downloaded files:")
27 |             for file_path in result.downloaded_files:
28 |                 print("•", file_path)
29 |         else:
30 |             print("No files downloaded.")
31 | 
32 | if __name__ == "__main__":
33 |     asyncio.run(test_basic_download())
34 |  
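
`delay_before_return_html=5` only grants a fixed window, so slow downloads can be missed. A stdlib-only sketch (not a crawl4ai API) that polls the downloads folder for a file that was not there before the crawl:

    import os
    import time

    def wait_for_new_file(folder, before, timeout=30.0):
        """Poll `folder` until a file not in `before` appears, or time out."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            new_files = set(os.listdir(folder)) - before
            if new_files:
                return sorted(new_files)
            time.sleep(0.5)
        return []

    # Usage: snapshot = set(os.listdir(downloads_path)) before crawler.arun(),
    # then wait_for_new_file(downloads_path, snapshot) afterwards.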


--------------------------------------------------------------------------------
/tests/general/test_persistent_context.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | from crawl4ai.async_webcrawler import AsyncWebCrawler
 4 | from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
 5 | 
 6 | # Simple concurrency test for persistent context page creation
 7 | # Usage: python tests/general/test_persistent_context.py
 8 | 
 9 | URLS = [
10 |     # "https://example.com",
11 |     "https://httpbin.org/html",
12 |     "https://www.python.org/",
13 |     "https://www.rust-lang.org/",
14 | ]
15 | 
16 | async def main():
17 |     profile_dir = os.path.join(os.path.expanduser("~"), ".crawl4ai", "profiles", "test-persistent-profile")
18 |     os.makedirs(profile_dir, exist_ok=True)
19 | 
20 |     browser_config = BrowserConfig(
21 |         browser_type="chromium",
22 |         headless=True,
23 |         use_persistent_context=True,
24 |         user_data_dir=profile_dir,
25 |         use_managed_browser=True,
26 |         verbose=True,
27 |     )
28 | 
29 |     run_cfg = CrawlerRunConfig(
30 |         cache_mode=CacheMode.BYPASS,
31 |         stream=False,
32 |         verbose=True,
33 |     )
34 | 
35 |     async with AsyncWebCrawler(config=browser_config) as crawler:
36 |         results = await crawler.arun_many(URLS, config=run_cfg)
37 |         for r in results:
38 |             print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
39 |         # r = await crawler.arun(url=URLS[0], config=run_cfg)
40 |         # print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
41 | 
42 | if __name__ == "__main__":
43 |     asyncio.run(main())
44 | 


--------------------------------------------------------------------------------
/tests/general/test_stream.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | # append 2 parent directories to sys.path to import crawl4ai
 3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 4 | sys.path.append(parent_dir)
 5 | parent_parent_dir = os.path.dirname(parent_dir)
 6 | sys.path.append(parent_parent_dir)
 7 | 
 8 | import asyncio
 9 | from crawl4ai import *
10 | 
11 | async def test_crawler():
12 |     # Setup configurations
13 |     browser_config = BrowserConfig(headless=True, verbose=False)
14 |     crawler_config = CrawlerRunConfig(
15 |         cache_mode=CacheMode.BYPASS,
16 |         markdown_generator=DefaultMarkdownGenerator(
17 |             content_filter=PruningContentFilter(
18 |                 threshold=0.48, 
19 |                 threshold_type="fixed", 
20 |                 min_word_threshold=0
21 |             )
22 |         ),
23 |     )
24 | 
25 |     # Test URLs - mix of different sites
26 |     urls = [
27 |         "http://example.com",
28 |         "http://example.org",
29 |         "http://example.net",
30 |     ] * 10  # 30 total URLs
31 | 
32 |     async with AsyncWebCrawler(config=browser_config) as crawler:
33 |         print("\n=== Testing Streaming Mode ===")
34 |         async for result in await crawler.arun_many(
35 |             urls=urls,
36 |             config=crawler_config.clone(stream=True),
37 |         ):
38 |             print(f"Received result for: {result.url} - Success: {result.success}")
39 |             
40 |         print("\n=== Testing Batch Mode ===")
41 |         results = await crawler.arun_many(
42 |             urls=urls,
43 |             config=crawler_config,
44 |         )
45 |         print(f"Received all {len(results)} results at once")
46 |         for result in results:
47 |             print(f"Batch result for: {result.url} - Success: {result.success}")
48 | 
49 | if __name__ == "__main__":
50 |     asyncio.run(test_crawler())


--------------------------------------------------------------------------------
/tests/general/test_stream_dispatch.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | # append 2 parent directories to sys.path to import crawl4ai
 3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 4 | sys.path.append(parent_dir)
 5 | parent_parent_dir = os.path.dirname(parent_dir)
 6 | sys.path.append(parent_parent_dir)
 7 | 
 8 | 
 9 | import asyncio
10 | from typing import List
11 | from crawl4ai import *
12 | from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
13 | 
14 | async def test_streaming():
15 |     browser_config = BrowserConfig(headless=True, verbose=True)
16 |     crawler_config = CrawlerRunConfig(
17 |         cache_mode=CacheMode.BYPASS,
18 |         markdown_generator=DefaultMarkdownGenerator(
19 |             # content_filter=PruningContentFilter(
20 |             #     threshold=0.48, 
21 |             #     threshold_type="fixed", 
22 |             #     min_word_threshold=0
23 |             # )
24 |         ),
25 |     )
26 | 
27 |     urls = ["http://example.com"] * 10
28 |     
29 |     async with AsyncWebCrawler(config=browser_config) as crawler:
30 |         dispatcher = MemoryAdaptiveDispatcher(
31 |             max_session_permit=5,
32 |             check_interval=0.5
33 |         )
34 |         
35 |         async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):
36 |             print(f"Got result for {result.url} - Success: {result.result.success}")
37 | 
38 | if __name__ == "__main__":
39 |     asyncio.run(test_streaming())


--------------------------------------------------------------------------------
/tests/general/tets_robot.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai import *
 3 | 
 4 | async def test_real_websites():
 5 |     print("\n=== Testing Real Website Robots.txt Compliance ===\n")
 6 |     
 7 |     browser_config = BrowserConfig(headless=True, verbose=True)
 8 |     async with AsyncWebCrawler(config=browser_config) as crawler:
 9 |         
10 |         # Test cases with URLs
11 |         test_cases = [
12 |             # Public sites that should be allowed
13 |             ("https://example.com", True),  # Simple public site
14 |             ("https://httpbin.org/get", True),  # API endpoint
15 |             
16 |             # Sites with known strict robots.txt
17 |             ("https://www.facebook.com/robots.txt", False),  # Social media
18 |             ("https://www.google.com/search", False),  # Search pages
19 |             
20 |             # Edge cases
21 |             ("https://api.github.com", True),  # API service
22 |             ("https://raw.githubusercontent.com", True),  # Content delivery
23 |             
24 |             # Non-existent/error cases
25 |             ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
26 |             ("https://localhost:12345", True),  # Invalid port
27 |         ]
28 | 
29 |         for url, expected in test_cases:
30 |             print(f"\nTesting: {url}")
31 |             try:
32 |                 config = CrawlerRunConfig(
33 |                     cache_mode=CacheMode.BYPASS,
34 |                     check_robots_txt=True,  # Enable robots.txt checking
35 |                     verbose=True
36 |                 )
37 |                 
38 |                 result = await crawler.arun(url=url, config=config)
39 |                 allowed = result.success and not result.error_message
40 |                 
41 |                 print(f"Expected: {'allowed' if expected else 'denied'}")
42 |                 print(f"Actual: {'allowed' if allowed else 'denied'}")
43 |                 print(f"Status Code: {result.status_code}")
44 |                 if result.error_message:
45 |                     print(f"Error: {result.error_message}")
46 |                 
47 |                 # Optional: Print robots.txt content if available
48 |                 if result.metadata and 'robots_txt' in result.metadata:
49 |                     print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")
50 |                 
51 |             except Exception as e:
52 |                 print(f"Test failed with error: {str(e)}")
53 | 
54 | async def main():
55 |     try:
56 |         await test_real_websites()
57 |     except Exception as e:
58 |         print(f"Test suite failed: {str(e)}")
59 |         raise
60 | 
61 | if __name__ == "__main__":
62 |     asyncio.run(main())


--------------------------------------------------------------------------------
/tests/hub/test_simple.py:
--------------------------------------------------------------------------------
 1 | # test.py
 2 | from crawl4ai import CrawlerHub
 3 | import json
 4 | 
 5 | async def amazon_example():
 6 |     if (crawler_cls := CrawlerHub.get("amazon_product")):
 7 |         crawler = crawler_cls()
 8 |         print(f"Crawler version: {crawler_cls.meta['version']}")
 9 |         print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
10 |         print(await crawler.run("https://amazon.com/test"))
11 |     else:
12 |         print("Crawler not found!")
13 | 
14 | async def google_example():
15 |     # Get crawler dynamically
16 |     crawler_cls = CrawlerHub.get("google_search")
17 |     crawler = crawler_cls()
18 | 
19 |     # Text search
20 |     text_results = await crawler.run(
21 |         query="apple inc", 
22 |         search_type="text",  
23 |         schema_cache_path="/Users/unclecode/.crawl4ai"
24 |     )
25 |     print(json.dumps(json.loads(text_results), indent=4))
26 | 
27 |     # Image search
28 |     # image_results = await crawler.run(query="apple inc", search_type="image")
29 |     # print(image_results)
30 | 
31 | if __name__ == "__main__":
32 |     import asyncio
33 |     # asyncio.run(amazon_example())
34 |     asyncio.run(google_example())


--------------------------------------------------------------------------------
/tests/mcp/test_mcp_sse.py:
--------------------------------------------------------------------------------
 1 | from mcp.client.sse import sse_client
 2 | from mcp.client.session import ClientSession
 3 | 
 4 | async def main():
 5 |     async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
 6 |         async with ClientSession(r, w) as sess:
 7 |             print(await sess.list_tools())      # now works
 8 |             
 9 | if __name__ == "__main__":
10 |     import asyncio
11 |     asyncio.run(main())
12 | 


--------------------------------------------------------------------------------
/tests/memory/cap_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
 4 | """
 5 | 
 6 | import asyncio, httpx, json, uuid, argparse
 7 | 
 8 | API = "http://localhost:8020/crawl"
 9 | URLS_PER_CALL = 1          # keep it minimal so each arun() == 1 page
10 | CONCURRENT_CALLS = 20      # way above your cap
11 | 
12 | payload_template = {
13 |     "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
14 |     "crawler_config": {
15 |         "type": "CrawlerRunConfig",
16 |         "params": {"cache_mode": "BYPASS", "verbose": False},
17 |     }
18 | }
19 | 
20 | async def one_call(client):
21 |     payload = payload_template.copy()
22 |     payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
23 |     r = await client.post(API, json=payload)
24 |     r.raise_for_status()
25 |     return r.json()["server_peak_memory_mb"]
26 | 
27 | async def main():
28 |     async with httpx.AsyncClient(timeout=60) as client:
29 |         tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
30 |         mem_usages = await asyncio.gather(*tasks)
31 |         print("Calls finished OK, server peaks reported:", mem_usages)
32 | 
33 | if __name__ == "__main__":
34 |     asyncio.run(main())
35 | 
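
GLOBAL_SEM in the docstring refers to the server's cap on concurrent crawls. An illustrative sketch of the pattern this script hammers (the names and the cap of 5 are made up, not the server's actual code):

    import asyncio

    GLOBAL_SEM = asyncio.Semaphore(5)  # illustrative process-wide cap

    async def crawl_one(i):
        async with GLOBAL_SEM:        # at most 5 tasks inside at once
            await asyncio.sleep(0.1)  # stand-in for crawling one page
            return i

    async def main():
        results = await asyncio.gather(*(crawl_one(i) for i in range(20)))
        print(f"{len(results)} calls finished under the cap")

    asyncio.run(main())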


--------------------------------------------------------------------------------
/tests/memory/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas>=1.5.0
2 | matplotlib>=3.5.0
3 | seaborn>=0.12.0
4 | rich>=12.0.0


--------------------------------------------------------------------------------
/tests/memory/test_docker_config_gen.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Quick sanity‑check for /config/dump endpoint.
 4 | 
 5 | Usage:
 6 |     python test_docker_config_gen.py  [http://localhost:11235]
 7 | 
 8 | If the server isn’t running, start it first:
 9 |     uvicorn deploy.docker.server:app --port 8020
10 | """
11 | 
12 | import sys, json, textwrap, requests
13 | 
14 | # BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
15 | BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
16 | URL  = f"{BASE.rstrip('/')}/config/dump"
17 | 
18 | CASES = [
19 |     # --- CrawlRunConfig variants ---
20 |     "CrawlerRunConfig()",
21 |     "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
22 |     "CrawlerRunConfig(js_only=True, wait_until='networkidle')",
23 | 
24 |     # --- BrowserConfig variants ---
25 |     "BrowserConfig()",
26 |     "BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
27 |     "BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
28 | ]
29 | 
30 | for code in CASES:
31 |     print("\n===  POST:", code)
32 |     resp = requests.post(URL, json={"code": code}, timeout=15)
33 |     if resp.ok:
34 |         print(json.dumps(resp.json(), indent=2)[:400] + "...")
35 |     else:
36 |         print("ERROR", resp.status_code, resp.text[:200])
37 | 


--------------------------------------------------------------------------------
/tests/profiler/test_create_profile.py:
--------------------------------------------------------------------------------
 1 | from crawl4ai import BrowserProfiler
 2 | import asyncio
 3 | 
 4 | 
 5 | if __name__ == "__main__":
 6 |     # Example usage
 7 |     profiler = BrowserProfiler()
 8 |     
 9 |     # Create a new profile
10 |     import os
11 |     from pathlib import Path
12 |     home_dir = Path.home()
13 |     profile_path = asyncio.run(profiler.create_profile(str(home_dir / ".crawl4ai/profiles/test-profile")))
14 |     
15 |     print(f"Profile created at: {profile_path}")
16 | 
17 |         
18 |             
19 |     # # Launch a standalone browser
20 |     # asyncio.run(profiler.launch_standalone_browser())
21 |     
22 |     # # List profiles
23 |     # profiles = profiler.list_profiles()
24 |     # for profile in profiles:
25 |     #     print(f"Profile: {profile['name']}, Path: {profile['path']}")
26 |     
27 |     # # Delete a profile
28 |     # success = profiler.delete_profile("my-profile")
29 |     # if success:
30 |     #     print("Profile deleted successfully")
31 |     # else:
32 |     #     print("Failed to delete profile")


--------------------------------------------------------------------------------
/tests/profiler/test_keyboard_handle.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import pytest
 3 | import asyncio
 4 | from unittest.mock import patch, MagicMock
 5 | from crawl4ai.browser_profiler import BrowserProfiler
 6 | 
 7 | @pytest.mark.asyncio
 8 | @pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
 9 | async def test_keyboard_input_handling():
10 |     # Mock sequence of keystrokes: arrow key followed by 'q'
11 |     mock_keys = [b'\x00K', b'q']
12 |     mock_kbhit = MagicMock(side_effect=[True, True, False])
13 |     mock_getch = MagicMock(side_effect=mock_keys)
14 | 
15 |     with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
16 |         # profiler = BrowserProfiler()
17 |         user_done_event = asyncio.Event()
18 |         
19 |         # Create a local async function to simulate the keyboard input handling
20 |         async def test_listen_for_quit_command():
21 |             if sys.platform == "win32":
22 |                 while True:
23 |                     try:
24 |                         if mock_kbhit():
25 |                             raw = mock_getch()
26 |                             try:
27 |                                 key = raw.decode("utf-8")
28 |                             except UnicodeDecodeError:
29 |                                 continue
30 | 
31 |                             if len(key) != 1 or not key.isprintable():
32 |                                 continue
33 | 
34 |                             if key.lower() == "q":
35 |                                 user_done_event.set()
36 |                                 return
37 | 
38 |                         await asyncio.sleep(0.1)
39 |                     except Exception:
40 |                         continue
41 | 
42 |         # Run the listener
43 |         listener_task = asyncio.create_task(test_listen_for_quit_command())
44 |         
45 |         # Wait for the event to be set
46 |         try:
47 |             await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
48 |             assert user_done_event.is_set()
49 |         finally:
50 |             if not listener_task.done():
51 |                 listener_task.cancel()
52 |                 try:
53 |                     await listener_task
54 |                 except asyncio.CancelledError:
55 |                     pass
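
The mocked listener above is Windows-only (`msvcrt`). On POSIX, a comparable "press q to quit" listener can push the blocking `sys.stdin.readline` into a thread — a sketch, not the profiler's actual implementation:

    import asyncio
    import sys

    async def listen_for_quit(done: asyncio.Event) -> None:
        loop = asyncio.get_running_loop()
        while not done.is_set():
            # readline blocks, so run it in the default thread pool executor.
            line = await loop.run_in_executor(None, sys.stdin.readline)
            if line.strip().lower() == "q":
                done.set()
                return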


--------------------------------------------------------------------------------
/tests/proxy/test_proxy_deprecation.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | 
 3 | import pytest
 4 | 
 5 | from crawl4ai.async_configs import BrowserConfig, ProxyConfig
 6 | 
 7 | 
 8 | def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
 9 |     warnings.simplefilter("always", DeprecationWarning)
10 | 
11 |     proxy_str = "23.95.150.145:6114:username:password"
12 |     with warnings.catch_warnings(record=True) as caught:
13 |         cfg = BrowserConfig(proxy=proxy_str, headless=True)
14 | 
15 |     dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
16 |     assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"
17 | 
18 |     assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
19 |     assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
20 |     assert cfg.proxy_config.username == "username"
21 |     assert cfg.proxy_config.password == "password"
22 |     assert cfg.proxy_config.server.startswith("http://")
23 |     assert cfg.proxy_config.server.endswith(":6114")
24 | 
25 | 
26 | def test_browser_config_with_proxy_config_emits_no_deprecation():
27 |     warnings.simplefilter("always", DeprecationWarning)
28 | 
29 |     with warnings.catch_warnings(record=True) as caught:
30 |         cfg = BrowserConfig(
31 |             headless=True,
32 |             proxy_config={
33 |                 "server": "http://127.0.0.1:8080",
34 |                 "username": "u",
35 |                 "password": "p",
36 |             },
37 |         )
38 | 
39 |     dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
40 |     assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"
41 |     assert cfg.proxy is None
42 |     assert isinstance(cfg.proxy_config, ProxyConfig)
43 | 
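
The deprecated string format exercised here is `host:port:username:password`. A sketch of the conversion the assertions imply (the parsing below is illustrative, not the library's code):

    def parse_proxy_string(proxy):
        """Split 'host:port:user:pass' into the fields the test asserts on."""
        host, port, username, password = proxy.split(":")
        return {
            "server": f"http://{host}:{port}",
            "username": username,
            "password": password,
        }

    assert parse_proxy_string("23.95.150.145:6114:username:password") == {
        "server": "http://23.95.150.145:6114",
        "username": "username",
        "password": "password",
    }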


--------------------------------------------------------------------------------
/tests/test_arun_many.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test example for multiple crawler configs feature
 3 | """
 4 | import asyncio
 5 | import sys
 6 | from pathlib import Path
 7 | 
 8 | # Add parent directory to path for imports
 9 | sys.path.insert(0, str(Path(__file__).parent.parent))
10 | 
11 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
12 | from crawl4ai.processors.pdf import PDFContentScrapingStrategy
13 | 
14 | 
15 | async def test_run_many():
16 |     default_config = CrawlerRunConfig(
17 |         cache_mode=CacheMode.BYPASS,
18 |         # scraping_strategy=PDFContentScrapingStrategy()
19 |     )
20 |     
21 |     test_urls = [
22 |         # "https://blog.python.org/",  # Blog URL  
23 |         "https://www.python.org/",  # Generic HTTPS page
24 |         "https://www.kidocode.com/",  # Generic HTTPS page
25 |         "https://www.example.com/",  # Generic HTTPS page
26 |         # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
27 |     ]
28 |     
29 |     async with AsyncWebCrawler() as crawler:
30 |         # Single config - traditional usage still works
31 |         print("Test 1: Single config (backwards compatible)")
32 |         result = await crawler.arun_many(
33 |             urls=test_urls[:2],
34 |             config=default_config
35 |         )
36 |         print(f"Crawled {len(result)} URLs with single config\n")
37 |         for item in result:
38 |             print(f"  {item.url} -> {item.status_code}")
39 |         
40 | 
41 | if __name__ == "__main__":
42 |     asyncio.run(test_run_many())
43 | 


--------------------------------------------------------------------------------
/tests/test_cli_docs.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from crawl4ai.docs_manager import DocsManager
 3 | from click.testing import CliRunner
 4 | from crawl4ai.cli import cli
 5 | 
 6 | 
 7 | def test_cli():
 8 |     """Test all CLI commands"""
 9 |     runner = CliRunner()
10 | 
11 |     print("\n1. Testing docs update...")
12 |     # Use sync version for testing
13 |     docs_manager = DocsManager()
14 |     loop = asyncio.get_event_loop()
15 |     loop.run_until_complete(docs_manager.fetch_docs())
16 | 
17 |     # print("\n2. Testing listing...")
18 |     # result = runner.invoke(cli, ['docs', 'list'])
19 |     # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
20 |     # print(result.output)
21 | 
22 |     # print("\n2. Testing index building...")
23 |     # result = runner.invoke(cli, ['docs', 'index'])
24 |     # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
25 |     # print(f"Output: {result.output}")
26 | 
27 |     # print("\n3. Testing search...")
28 |     # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index'])
29 |     # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
30 |     # print(f"First 200 chars: {result.output[:200]}...")
31 | 
32 |     # print("\n4. Testing combine with sections...")
33 |     # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended'])
34 |     # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
35 |     # print(f"First 200 chars: {result.output[:200]}...")
36 | 
37 |     print("\n5. Testing combine all sections...")
38 |     result = runner.invoke(cli, ["docs", "combine", "--mode", "condensed"])
39 |     print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
40 |     print(f"First 200 chars: {result.output[:200]}...")
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     test_cli()
45 | 


--------------------------------------------------------------------------------
/tests/test_llmtxt.py:
--------------------------------------------------------------------------------
 1 | from crawl4ai.llmtxt import AsyncLLMTextManager  # Changed to AsyncLLMTextManager
 2 | from crawl4ai.async_logger import AsyncLogger
 3 | from pathlib import Path
 4 | import asyncio
 5 | 
 6 | 
 7 | async def main():
 8 |     current_file = Path(__file__).resolve()
 9 |     # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs"
10 |     base_dir = current_file.parent.parent / "local/_docs/llm.txt"
11 |     docs_dir = base_dir
12 | 
13 |     # Create directory if it doesn't exist
14 |     docs_dir.mkdir(parents=True, exist_ok=True)
15 | 
16 |     # Initialize logger
17 |     logger = AsyncLogger()
18 |     # Updated initialization with default batching params
19 |     # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2)
20 |     manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2)
21 | 
22 |     # Let's first check what files we have
23 |     print("\nAvailable files:")
24 |     for f in docs_dir.glob("*.md"):
25 |         print(f"- {f.name}")
26 | 
27 |     # Generate index files
28 |     print("\nGenerating index files...")
29 |     await manager.generate_index_files(
30 |         force_generate_facts=False, clear_bm25_cache=False
31 |     )
32 | 
33 |     # Test some relevant queries about Crawl4AI
34 |     test_queries = [
35 |         "How is using the `arun_many` method?",
36 |     ]
37 | 
38 |     print("\nTesting search functionality:")
39 |     for query in test_queries:
40 |         print(f"\nQuery: {query}")
41 |         results = manager.search(query, top_k=2)
42 |         print(f"Results length: {len(results)} characters")
43 |         if results:
44 |             print(
45 |                 "First 200 chars of results:", results[:200].replace("\n", " "), "..."
46 |             )
47 |         else:
48 |             print("No results found")
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     asyncio.run(main())
53 | 


--------------------------------------------------------------------------------
/tests/test_memory_macos.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Test script to verify macOS memory calculation accuracy."""
 3 | 
 4 | import psutil
 5 | import platform
 6 | import time
 7 | from crawl4ai.utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
 8 | 
 9 | 
10 | def test_memory_calculation():
11 |     """Test and compare memory calculations."""
12 |     print(f"Platform: {platform.system()}")
13 |     print(f"Python version: {platform.python_version()}")
14 |     print("-" * 60)
15 |     
16 |     # Get psutil's view
17 |     vm = psutil.virtual_memory()
18 |     psutil_percent = vm.percent
19 |     psutil_available_gb = vm.available / (1024**3)
20 |     total_gb = vm.total / (1024**3)
21 |     
22 |     # Get our corrected view
23 |     true_percent = get_true_memory_usage_percent()
24 |     true_available_gb = get_true_available_memory_gb()
25 |     true_percent_calc, available_calc, total_calc = get_memory_stats()
26 |     
27 |     print("Memory Statistics Comparison:")
28 |     print(f"Total Memory: {total_gb:.2f} GB")
29 |     print()
30 |     
31 |     print("PSUtil (Standard) Calculation:")
32 |     print(f"  - Memory Used: {psutil_percent:.1f}%")
33 |     print(f"  - Available: {psutil_available_gb:.2f} GB")
34 |     print()
35 |     
36 |     print("Platform-Aware Calculation:")
37 |     print(f"  - Memory Used: {true_percent:.1f}%")
38 |     print(f"  - Available: {true_available_gb:.2f} GB")
39 |     print(f"  - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
40 |     print()
41 |     
42 |     # Show the impact on dispatcher behavior
43 |     print("Impact on MemoryAdaptiveDispatcher:")
44 |     thresholds = {
45 |         "Normal": 90.0,
46 |         "Critical": 95.0,
47 |         "Recovery": 85.0
48 |     }
49 |     
50 |     for name, threshold in thresholds.items():
51 |         psutil_triggered = psutil_percent >= threshold
52 |         true_triggered = true_percent >= threshold
53 |         print(f"  - {name} Threshold ({threshold}%):")
54 |         print(f"    PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
55 |         print(f"    Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
56 |         if psutil_triggered != true_triggered:
57 |             print(f"    → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
58 |     print()
59 |     
60 |     # Monitor for a few seconds
61 |     print("Monitoring memory for 10 seconds...")
62 |     for i in range(10):
63 |         vm = psutil.virtual_memory()
64 |         true_pct = get_true_memory_usage_percent()
65 |         print(f"  {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
66 |         time.sleep(1)
67 |     print("\n")
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     test_memory_calculation()


--------------------------------------------------------------------------------
/tests/test_scraping_strategy.py:
--------------------------------------------------------------------------------
 1 | import nest_asyncio
 2 | 
 3 | nest_asyncio.apply()
 4 | 
 5 | import asyncio
 6 | from crawl4ai import (
 7 |     AsyncWebCrawler,
 8 |     CrawlerRunConfig,
 9 |     LXMLWebScrapingStrategy,
10 |     CacheMode,
11 | )
12 | 
13 | 
14 | async def main():
15 |     config = CrawlerRunConfig(
16 |         cache_mode=CacheMode.BYPASS,
17 |         scraping_strategy=LXMLWebScrapingStrategy(),  # Faster alternative to default BeautifulSoup
18 |     )
19 |     async with AsyncWebCrawler() as crawler:
20 |         result = await crawler.arun(url="https://example.com", config=config)
21 |         print(f"Success: {result.success}")
22 |         print(f"Markdown length: {len(result.markdown.raw_markdown)}")
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     asyncio.run(main())
27 | 


--------------------------------------------------------------------------------