├── .claude
└── settings.local.json
├── .env.txt
├── .gitattributes
├── .github
├── DISCUSSION_TEMPLATE
│ └── feature-requests.yml
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.yml
│ └── config.yml
├── pull_request_template.md
└── workflows
│ ├── docker-release.yml
│ ├── docs
│ ├── ARCHITECTURE.md
│ ├── README.md
│ └── WORKFLOW_REFERENCE.md
│ ├── main.yml
│ ├── release.yml
│ ├── release.yml.backup
│ └── test-release.yml.disabled
├── .gitignore
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTORS.md
├── Dockerfile
├── JOURNAL.md
├── LICENSE
├── MANIFEST.in
├── MISSION.md
├── PROGRESSIVE_CRAWLING.md
├── README-first.md
├── README.md
├── ROADMAP.md
├── SPONSORS.md
├── cliff.toml
├── crawl4ai
├── __init__.py
├── __version__.py
├── adaptive_crawler copy.py
├── adaptive_crawler.py
├── async_configs.py
├── async_crawler_strategy.back.py
├── async_crawler_strategy.py
├── async_database.py
├── async_dispatcher.py
├── async_logger.py
├── async_url_seeder.py
├── async_webcrawler.py
├── browser_adapter.py
├── browser_manager.py
├── browser_profiler.py
├── cache_context.py
├── chunking_strategy.py
├── cli.py
├── components
│ └── crawler_monitor.py
├── config.py
├── content_filter_strategy.py
├── content_scraping_strategy.py
├── crawlers
│ ├── __init__.py
│ ├── amazon_product
│ │ ├── __init__.py
│ │ └── crawler.py
│ └── google_search
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ └── script.js
├── deep_crawling
│ ├── __init__.py
│ ├── base_strategy.py
│ ├── bff_strategy.py
│ ├── bfs_strategy.py
│ ├── crazy.py
│ ├── dfs_strategy.py
│ ├── filters.py
│ └── scorers.py
├── docker_client.py
├── extraction_strategy.py
├── html2text
│ ├── __init__.py
│ ├── __main__.py
│ ├── _typing.py
│ ├── cli.py
│ ├── config.py
│ ├── elements.py
│ └── utils.py
├── hub.py
├── install.py
├── js_snippet
│ ├── __init__.py
│ ├── navigator_overrider.js
│ ├── remove_overlay_elements.js
│ └── update_image_dimensions.js
├── legacy
│ ├── __init__.py
│ ├── cli.py
│ ├── crawler_strategy.py
│ ├── database.py
│ ├── docs_manager.py
│ ├── llmtxt.py
│ ├── version_manager.py
│ └── web_crawler.py
├── link_preview.py
├── markdown_generation_strategy.py
├── migrations.py
├── model_loader.py
├── models.py
├── processors
│ └── pdf
│ │ ├── __init__.py
│ │ ├── processor.py
│ │ └── utils.py
├── prompts.py
├── proxy_strategy.py
├── script
│ ├── __init__.py
│ ├── c4a_compile.py
│ ├── c4a_result.py
│ └── c4ai_script.py
├── ssl_certificate.py
├── table_extraction.py
├── types.py
├── user_agent_generator.py
└── utils.py
├── deploy
└── docker
│ ├── .dockerignore
│ ├── .llm.env.example
│ ├── README.md
│ ├── WEBHOOK_EXAMPLES.md
│ ├── api.py
│ ├── auth.py
│ ├── c4ai-code-context.md
│ ├── c4ai-doc-context.md
│ ├── config.yml
│ ├── crawler_pool.py
│ ├── hook_manager.py
│ ├── job.py
│ ├── mcp_bridge.py
│ ├── requirements.txt
│ ├── schemas.py
│ ├── server.py
│ ├── static
│ └── playground
│ │ └── index.html
│ ├── supervisord.conf
│ ├── utils.py
│ └── webhook.py
├── docker-compose.yml
├── docs
├── apps
│ ├── iseeyou
│ │ └── llms-full.txt
│ └── linkdin
│ │ ├── Crawl4ai_Linkedin_Data_Discovery_Part_1.ipynb
│ │ ├── Crawl4ai_Linkedin_Data_Discovery_Part_2.ipynb
│ │ ├── README.md
│ │ ├── c4ai_discover.py
│ │ ├── c4ai_insights.py
│ │ ├── samples
│ │ ├── companies.jsonl
│ │ └── people.jsonl
│ │ ├── schemas
│ │ ├── company_card.json
│ │ └── people_card.json
│ │ ├── snippets
│ │ ├── company.html
│ │ └── people.html
│ │ └── templates
│ │ ├── ai.js
│ │ └── graph_view_template.html
├── assets
│ ├── pitch-dark.png
│ ├── pitch-dark.svg
│ ├── powered-by-dark.svg
│ ├── powered-by-disco.svg
│ ├── powered-by-light.svg
│ └── powered-by-night.svg
├── blog
│ ├── release-v0.7.0.md
│ ├── release-v0.7.1.md
│ ├── release-v0.7.3.md
│ ├── release-v0.7.4.md
│ ├── release-v0.7.5.md
│ └── release-v0.7.6.md
├── codebase
│ ├── browser.md
│ └── cli.md
├── deprecated
│ └── docker-deployment.md
├── examples
│ ├── README_BUILTIN_BROWSER.md
│ ├── adaptive_crawling
│ │ ├── README.md
│ │ ├── advanced_configuration.py
│ │ ├── basic_usage.py
│ │ ├── custom_strategies.py
│ │ ├── embedding_configuration.py
│ │ ├── embedding_strategy.py
│ │ ├── embedding_vs_statistical.py
│ │ ├── export_import_kb.py
│ │ └── llm_config_example.py
│ ├── amazon_product_extraction_direct_url.py
│ ├── amazon_product_extraction_using_hooks.py
│ ├── amazon_product_extraction_using_use_javascript.py
│ ├── arun_vs_arun_many.py
│ ├── assets
│ │ ├── audio.mp3
│ │ ├── basic.png
│ │ ├── cosine_extraction.png
│ │ ├── css_js.png
│ │ ├── css_selector.png
│ │ ├── exec_script.png
│ │ ├── instagram_grid_result.png
│ │ ├── llm_extraction.png
│ │ ├── semantic_extraction_cosine.png
│ │ ├── semantic_extraction_llm.png
│ │ ├── virtual_scroll_append_only.html
│ │ ├── virtual_scroll_instagram_grid.html
│ │ ├── virtual_scroll_news_feed.html
│ │ └── virtual_scroll_twitter_like.html
│ ├── async_webcrawler_multiple_urls_example.py
│ ├── browser_optimization_example.py
│ ├── builtin_browser_example.py
│ ├── c4a_script
│ │ ├── amazon_example
│ │ │ ├── README.md
│ │ │ ├── amazon_r2d2_search.py
│ │ │ ├── extracted_products.json
│ │ │ ├── generated_product_schema.json
│ │ │ ├── generated_search_script.js
│ │ │ ├── header.html
│ │ │ └── product.html
│ │ ├── api_usage_examples.py
│ │ ├── c4a_script_hello_world.py
│ │ ├── c4a_script_hello_world_error.py
│ │ ├── demo_c4a_crawl4ai.py
│ │ ├── generate_script_hello_world.py
│ │ ├── github_search
│ │ │ ├── extracted_repositories.json
│ │ │ ├── generated_result_schema.json
│ │ │ ├── generated_search_script.js
│ │ │ ├── github_search_crawler.py
│ │ │ ├── result.html
│ │ │ └── search_form.html
│ │ ├── script_samples
│ │ │ ├── add_to_cart.c4a
│ │ │ ├── advanced_control_flow.c4a
│ │ │ ├── conditional_login.c4a
│ │ │ ├── data_extraction.c4a
│ │ │ ├── fill_contact.c4a
│ │ │ ├── load_more_content.c4a
│ │ │ ├── login_flow.c4a
│ │ │ ├── multi_step_workflow.c4a
│ │ │ ├── navigate_tabs.c4a
│ │ │ ├── quick_login.c4a
│ │ │ ├── responsive_actions.c4a
│ │ │ ├── scroll_and_click.c4a
│ │ │ ├── search_product.c4a
│ │ │ ├── simple_form.c4a
│ │ │ └── smart_form_fill.c4a
│ │ └── tutorial
│ │ │ ├── README.md
│ │ │ ├── assets
│ │ │ ├── DankMono-Bold.woff2
│ │ │ ├── DankMono-Italic.woff2
│ │ │ ├── DankMono-Regular.woff2
│ │ │ ├── app.css
│ │ │ ├── app.js
│ │ │ ├── blockly-manager.js
│ │ │ ├── blockly-theme.css
│ │ │ ├── c4a-blocks.js
│ │ │ ├── c4a-generator.js
│ │ │ └── styles.css
│ │ │ ├── blockly-demo.c4a
│ │ │ ├── index.html
│ │ │ ├── playground
│ │ │ ├── app.js
│ │ │ ├── index.html
│ │ │ └── styles.css
│ │ │ ├── requirements.txt
│ │ │ ├── scripts
│ │ │ ├── 01-basic-interaction.c4a
│ │ │ ├── 02-login-flow.c4a
│ │ │ ├── 03-infinite-scroll.c4a
│ │ │ ├── 04-multi-step-form.c4a
│ │ │ └── 05-complex-workflow.c4a
│ │ │ ├── server.py
│ │ │ └── test_blockly.html
│ ├── chainlit.md
│ ├── cli
│ │ ├── browser.yml
│ │ ├── crawler.yml
│ │ ├── css_schema.json
│ │ ├── extract.yml
│ │ ├── extract_css.yml
│ │ └── llm_schema.json
│ ├── crawlai_vs_firecrawl.py
│ ├── crawler_monitor_example.py
│ ├── crypto_analysis_example.py
│ ├── deepcrawl_example.py
│ ├── demo_multi_config_clean.py
│ ├── dispatcher_example.py
│ ├── docker
│ │ ├── demo_docker_api.py
│ │ └── demo_docker_polling.py
│ ├── docker_client_hooks_example.py
│ ├── docker_config_obj.py
│ ├── docker_example.py
│ ├── docker_hooks_examples.py
│ ├── docker_python_rest_api.py
│ ├── docker_python_sdk.py
│ ├── docker_webhook_example.py
│ ├── extraction_strategies_examples.py
│ ├── full_page_screenshot_and_pdf_export.md
│ ├── hello_world.py
│ ├── hello_world_undetected.py
│ ├── hooks_example.py
│ ├── identity_based_browsing.py
│ ├── language_support_example.py
│ ├── link_head_extraction_example.py
│ ├── llm_extraction_openai_pricing.py
│ ├── llm_markdown_generator.py
│ ├── llm_table_extraction_example.py
│ ├── markdown
│ │ ├── content_source_example.py
│ │ └── content_source_short_example.py
│ ├── network_console_capture_example.py
│ ├── proxy_rotation_demo.py
│ ├── quickstart.ipynb
│ ├── quickstart.py
│ ├── quickstart_examples_set_1.py
│ ├── quickstart_examples_set_2.py
│ ├── regex_extraction_quickstart.py
│ ├── research_assistant.py
│ ├── rest_call.py
│ ├── sample_ecommerce.html
│ ├── scraping_strategies_performance.py
│ ├── serp_api_project_11_feb.py
│ ├── session_id_example.py
│ ├── simple_anti_bot_examples.py
│ ├── ssl_example.py
│ ├── stealth_mode_example.py
│ ├── stealth_mode_quick_start.py
│ ├── stealth_test_simple.py
│ ├── storage_state_tutorial.md
│ ├── summarize_page.py
│ ├── table_extraction_example.py
│ ├── tutorial_dynamic_clicks.md
│ ├── tutorial_v0.5.py
│ ├── undetectability
│ │ ├── undetected_basic_test.py
│ │ ├── undetected_bot_test.py
│ │ ├── undetected_cloudflare_test.py
│ │ └── undetected_vs_regular_comparison.py
│ ├── undetected_simple_demo.py
│ ├── url_seeder
│ │ ├── Crawl4AI_URL_Seeder_Tutorial.ipynb
│ │ ├── bbc_sport_research_assistant.py
│ │ ├── convert_tutorial_to_colab.py
│ │ ├── tutorial_url_seeder.md
│ │ ├── url_seeder_demo.py
│ │ └── url_seeder_quick_demo.py
│ ├── use_geo_location.py
│ ├── virtual_scroll_example.py
│ └── website-to-api
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── api_server.py
│ │ ├── app.py
│ │ ├── assets
│ │ └── crawl4ai_logo.jpg
│ │ ├── requirements.txt
│ │ ├── static
│ │ ├── index.html
│ │ ├── script.js
│ │ └── styles.css
│ │ ├── test_api.py
│ │ ├── test_models.py
│ │ └── web_scraper_lib.py
├── md_v2
│ ├── advanced
│ │ ├── adaptive-strategies.md
│ │ ├── advanced-features.md
│ │ ├── crawl-dispatcher.md
│ │ ├── file-downloading.md
│ │ ├── hooks-auth.md
│ │ ├── identity-based-crawling.md
│ │ ├── lazy-loading.md
│ │ ├── multi-url-crawling.md
│ │ ├── network-console-capture.md
│ │ ├── pdf-parsing.md
│ │ ├── proxy-security.md
│ │ ├── session-management.md
│ │ ├── ssl-certificate.md
│ │ ├── undetected-browser.md
│ │ └── virtual-scroll.md
│ ├── api
│ │ ├── adaptive-crawler.md
│ │ ├── arun.md
│ │ ├── arun_many.md
│ │ ├── async-webcrawler.md
│ │ ├── c4a-script-reference.md
│ │ ├── crawl-result.md
│ │ ├── digest.md
│ │ ├── parameters.md
│ │ └── strategies.md
│ ├── apps
│ │ ├── assets
│ │ │ ├── DankMono-Bold.woff2
│ │ │ ├── DankMono-Italic.woff2
│ │ │ └── DankMono-Regular.woff2
│ │ ├── c4a-script
│ │ │ ├── README.md
│ │ │ ├── assets
│ │ │ │ ├── DankMono-Bold.woff2
│ │ │ │ ├── DankMono-Italic.woff2
│ │ │ │ ├── DankMono-Regular.woff2
│ │ │ │ ├── app.css
│ │ │ │ ├── app.js
│ │ │ │ ├── blockly-manager.js
│ │ │ │ ├── blockly-theme.css
│ │ │ │ ├── c4a-blocks.js
│ │ │ │ ├── c4a-generator.js
│ │ │ │ └── styles.css
│ │ │ ├── blockly-demo.c4a
│ │ │ ├── index.html
│ │ │ ├── playground
│ │ │ │ ├── app.js
│ │ │ │ ├── index.html
│ │ │ │ └── styles.css
│ │ │ ├── requirements.txt
│ │ │ ├── scripts
│ │ │ │ ├── 01-basic-interaction.c4a
│ │ │ │ ├── 02-login-flow.c4a
│ │ │ │ ├── 03-infinite-scroll.c4a
│ │ │ │ ├── 04-multi-step-form.c4a
│ │ │ │ └── 05-complex-workflow.c4a
│ │ │ ├── server.py
│ │ │ └── test_blockly.html
│ │ ├── crawl4ai-assistant
│ │ │ ├── README.md
│ │ │ ├── assets
│ │ │ │ ├── DankMono-Bold.woff2
│ │ │ │ ├── DankMono-Italic.woff2
│ │ │ │ └── DankMono-Regular.woff2
│ │ │ ├── assistant.css
│ │ │ ├── background
│ │ │ │ └── service-worker.js
│ │ │ ├── content
│ │ │ │ ├── click2crawl.js
│ │ │ │ ├── content.js
│ │ │ │ ├── contentAnalyzer.js
│ │ │ │ ├── markdownConverter.js
│ │ │ │ ├── markdownExtraction.js
│ │ │ │ ├── markdownPreviewModal.js
│ │ │ │ ├── overlay.css
│ │ │ │ ├── scriptBuilder.js
│ │ │ │ └── shared
│ │ │ │ │ └── utils.js
│ │ │ ├── crawl4ai-assistant-v1.2.1.zip
│ │ │ ├── crawl4ai-assistant-v1.3.0.zip
│ │ │ ├── icons
│ │ │ │ ├── favicon.ico
│ │ │ │ ├── icon-128.png
│ │ │ │ ├── icon-16.png
│ │ │ │ └── icon-48.png
│ │ │ ├── index.html
│ │ │ ├── libs
│ │ │ │ └── marked.min.js
│ │ │ ├── manifest.json
│ │ │ └── popup
│ │ │ │ ├── icons
│ │ │ │ ├── favicon.ico
│ │ │ │ ├── icon-128.png
│ │ │ │ ├── icon-16.png
│ │ │ │ └── icon-48.png
│ │ │ │ ├── popup.css
│ │ │ │ ├── popup.html
│ │ │ │ └── popup.js
│ │ ├── index.md
│ │ └── llmtxt
│ │ │ ├── build.md
│ │ │ ├── index.html
│ │ │ ├── llmtxt.css
│ │ │ ├── llmtxt.js
│ │ │ └── why.md
│ ├── ask_ai
│ │ ├── ask-ai.css
│ │ ├── ask-ai.js
│ │ └── index.html
│ ├── assets
│ │ ├── DankMono-Bold.woff2
│ │ ├── DankMono-Italic.woff2
│ │ ├── DankMono-Regular.woff2
│ │ ├── Monaco.woff
│ │ ├── copy_code.js
│ │ ├── crawl4ai-skill.zip
│ │ ├── dmvendor.css
│ │ ├── docs.zip
│ │ ├── feedback-overrides.css
│ │ ├── floating_ask_ai_button.js
│ │ ├── github_stats.js
│ │ ├── gtag.js
│ │ ├── highlight.css
│ │ ├── highlight.min.js
│ │ ├── highlight_init.js
│ │ ├── images
│ │ │ ├── dispatcher.png
│ │ │ └── logo.png
│ │ ├── layout.css
│ │ ├── llm.txt
│ │ │ ├── diagrams
│ │ │ │ ├── cli.txt
│ │ │ │ ├── config_objects.txt
│ │ │ │ ├── deep_crawl_advanced_filters_scorers.txt
│ │ │ │ ├── deep_crawling.txt
│ │ │ │ ├── docker.txt
│ │ │ │ ├── extraction-llm.txt
│ │ │ │ ├── extraction-no-llm.txt
│ │ │ │ ├── http_based_crawler_strategy.txt
│ │ │ │ ├── installation.txt
│ │ │ │ ├── llms-diagram.txt
│ │ │ │ ├── multi_urls_crawling.txt
│ │ │ │ ├── simple_crawling.txt
│ │ │ │ └── url_seeder.txt
│ │ │ └── txt
│ │ │ │ ├── cli.txt
│ │ │ │ ├── config_objects.txt
│ │ │ │ ├── deep_crawl_advanced_filters_scorers.txt
│ │ │ │ ├── deep_crawling.txt
│ │ │ │ ├── docker.txt
│ │ │ │ ├── extraction-llm.txt
│ │ │ │ ├── extraction-no-llm.txt
│ │ │ │ ├── http_based_crawler_strategy.txt
│ │ │ │ ├── installation.txt
│ │ │ │ ├── llms-full-v0.1.1.txt
│ │ │ │ ├── llms-full.txt
│ │ │ │ ├── multi_urls_crawling.txt
│ │ │ │ ├── simple_crawling.txt
│ │ │ │ └── url_seeder.txt
│ │ ├── mobile_menu.js
│ │ ├── page_actions.css
│ │ ├── page_actions.js
│ │ ├── selection_ask_ai.js
│ │ ├── styles.css
│ │ ├── test
│ │ │ └── toc.js
│ │ └── toc.js
│ ├── basic
│ │ └── installation.md
│ ├── blog
│ │ ├── articles
│ │ │ ├── adaptive-crawling-revolution.md
│ │ │ ├── dockerize_hooks.md
│ │ │ ├── llm-context-revolution.md
│ │ │ └── virtual-scroll-revolution.md
│ │ ├── index.md
│ │ ├── index.md.bak
│ │ └── releases
│ │ │ ├── 0.4.0.md
│ │ │ ├── 0.4.1.md
│ │ │ ├── 0.4.2.md
│ │ │ ├── 0.5.0.md
│ │ │ ├── 0.6.0.md
│ │ │ ├── 0.7.0.md
│ │ │ ├── 0.7.1.md
│ │ │ ├── 0.7.2.md
│ │ │ ├── 0.7.3.md
│ │ │ ├── 0.7.6.md
│ │ │ ├── v0.4.3b1.md
│ │ │ └── v0.7.5.md
│ ├── branding
│ │ └── index.md
│ ├── complete-sdk-reference.md
│ ├── core
│ │ ├── adaptive-crawling.md
│ │ ├── ask-ai.md
│ │ ├── browser-crawler-config.md
│ │ ├── c4a-script.md
│ │ ├── cache-modes.md
│ │ ├── cli.md
│ │ ├── content-selection.md
│ │ ├── crawler-result.md
│ │ ├── deep-crawling.md
│ │ ├── docker-deployment.md
│ │ ├── examples.md
│ │ ├── fit-markdown.md
│ │ ├── installation.md
│ │ ├── link-media.md
│ │ ├── llmtxt.md
│ │ ├── local-files.md
│ │ ├── markdown-generation.md
│ │ ├── page-interaction.md
│ │ ├── quickstart.md
│ │ ├── simple-crawling.md
│ │ ├── table_extraction.md
│ │ └── url-seeding.md
│ ├── extraction
│ │ ├── chunking.md
│ │ ├── clustring-strategies.md
│ │ ├── llm-strategies.md
│ │ └── no-llm-strategies.md
│ ├── favicon.ico
│ ├── img
│ │ ├── favicon-32x32.png
│ │ ├── favicon-x-32x32.png
│ │ └── favicon.ico
│ ├── index.md
│ ├── marketplace
│ │ ├── README.md
│ │ ├── admin
│ │ │ ├── admin.css
│ │ │ ├── admin.js
│ │ │ └── index.html
│ │ ├── app-detail.css
│ │ ├── app-detail.html
│ │ ├── app-detail.js
│ │ ├── backend
│ │ │ ├── .env.example
│ │ │ ├── config.py
│ │ │ ├── database.py
│ │ │ ├── dummy_data.py
│ │ │ ├── requirements.txt
│ │ │ ├── schema.yaml
│ │ │ ├── server.py
│ │ │ └── uploads
│ │ │ │ └── .gitignore
│ │ ├── frontend
│ │ │ ├── app-detail.css
│ │ │ ├── app-detail.html
│ │ │ ├── app-detail.js
│ │ │ ├── index.html
│ │ │ ├── marketplace.css
│ │ │ └── marketplace.js
│ │ ├── index.html
│ │ ├── marketplace.css
│ │ └── marketplace.js
│ ├── migration
│ │ ├── table_extraction_v073.md
│ │ └── webscraping-strategy-migration.md
│ └── overrides
│ │ └── main.html
├── releases_review
│ ├── Crawl4AI_v0.3.72_Release_Announcement.ipynb
│ ├── crawl4ai_v0_7_0_showcase.py
│ ├── demo_v0.7.0.py
│ ├── demo_v0.7.5.py
│ ├── demo_v0.7.6.py
│ ├── v0.3.74.overview.py
│ ├── v0.7.5_docker_hooks_demo.py
│ ├── v0.7.5_video_walkthrough.ipynb
│ ├── v0_4_24_walkthrough.py
│ ├── v0_4_3b2_features_demo.py
│ └── v0_7_0_features_demo.py
├── snippets
│ └── deep_crawl
│ │ ├── 1.intro.py
│ │ └── 2.filters.py
└── tutorials
│ └── coming_soon.md
├── mkdocs.yml
├── prompts
└── prompt_net_requests.md
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
├── test_llm_webhook_feature.py
├── test_webhook_implementation.py
├── tests
├── WEBHOOK_TEST_README.md
├── __init__.py
├── adaptive
│ ├── compare_performance.py
│ ├── test_adaptive_crawler.py
│ ├── test_confidence_debug.py
│ ├── test_embedding_performance.py
│ ├── test_embedding_strategy.py
│ └── test_llm_embedding.py
├── async
│ ├── sample_wikipedia.html
│ ├── test_0.4.2_browser_manager.py
│ ├── test_0.4.2_config_params.py
│ ├── test_async_doanloader.py
│ ├── test_basic_crawling.py
│ ├── test_caching.py
│ ├── test_chunking_and_extraction_strategies.py
│ ├── test_content_extraction.py
│ ├── test_content_filter_bm25.py
│ ├── test_content_filter_prune.py
│ ├── test_content_scraper_strategy.py
│ ├── test_crawler_strategy.py
│ ├── test_database_operations.py
│ ├── test_dispatchers.py
│ ├── test_edge_cases.py
│ ├── test_error_handling.py
│ ├── test_evaluation_scraping_methods_performance.configs.py
│ ├── test_markdown_genertor.py
│ ├── test_parameters_and_options.py
│ ├── test_performance.py
│ └── test_screenshot.py
├── async_assistant
│ ├── test_extract_pipeline.py
│ └── test_extract_pipeline_v2.py
├── browser
│ ├── docker
│ │ ├── __init__.py
│ │ └── test_docker_browser.py
│ ├── manager
│ │ └── demo_browser_manager.py
│ ├── test_browser_manager.py
│ ├── test_builtin_browser.py
│ ├── test_builtin_strategy.py
│ ├── test_cdp_strategy.py
│ ├── test_combined.py
│ ├── test_launch_standalone.py
│ ├── test_parallel_crawling.py
│ ├── test_playwright_strategy.py
│ └── test_profiles.py
├── check_dependencies.py
├── cli
│ └── test_cli.py
├── deep_crwaling
│ └── test_filter.py
├── docker
│ ├── simple_api_test.py
│ ├── test_config_object.py
│ ├── test_docker.py
│ ├── test_dockerclient.py
│ ├── test_filter_deep_crawl.py
│ ├── test_hooks_client.py
│ ├── test_hooks_comprehensive.py
│ ├── test_hooks_utility.py
│ ├── test_llm_params.py
│ ├── test_rest_api_deep_crawl.py
│ ├── test_serialization.py
│ ├── test_server.py
│ ├── test_server_requests.py
│ └── test_server_token.py
├── docker_example.py
├── general
│ ├── generate_dummy_site.py
│ ├── test_acyn_crawl_wuth_http_crawler_strategy.py
│ ├── test_advanced_deep_crawl.py
│ ├── test_async_crawler_strategy.py
│ ├── test_async_markdown_generator.py
│ ├── test_async_url_seeder_bm25.py
│ ├── test_async_webcrawler.py
│ ├── test_bff_scoring.py
│ ├── test_cache_context.py
│ ├── test_content_source_parameter.py
│ ├── test_crawlers.py
│ ├── test_deep_crawl.py
│ ├── test_deep_crawl_filters.py
│ ├── test_deep_crawl_scorers.py
│ ├── test_download_file.py
│ ├── test_http_crawler_strategy.py
│ ├── test_llm_filter.py
│ ├── test_max_scroll.py
│ ├── test_mhtml.py
│ ├── test_network_console_capture.py
│ ├── test_persistent_context.py
│ ├── test_robot_parser.py
│ ├── test_schema_builder.py
│ ├── test_stream.py
│ ├── test_stream_dispatch.py
│ ├── test_url_pattern.py
│ └── tets_robot.py
├── hub
│ └── test_simple.py
├── loggers
│ └── test_logger.py
├── mcp
│ ├── test_mcp_socket.py
│ └── test_mcp_sse.py
├── memory
│ ├── README.md
│ ├── benchmark_report.py
│ ├── cap_test.py
│ ├── requirements.txt
│ ├── run_benchmark.py
│ ├── test_crawler_monitor.py
│ ├── test_dispatcher_stress.py
│ ├── test_docker_config_gen.py
│ ├── test_stress_api.py
│ ├── test_stress_api_xs.py
│ ├── test_stress_docker_api.py
│ └── test_stress_sdk.py
├── profiler
│ ├── test_create_profile.py
│ └── test_keyboard_handle.py
├── proxy
│ ├── test_proxy_config.py
│ └── test_proxy_deprecation.py
├── releases
│ ├── test_release_0.6.4.py
│ └── test_release_0.7.0.py
├── test_arun_many.py
├── test_cli_docs.py
├── test_config_matching_only.py
├── test_config_selection.py
├── test_docker.py
├── test_docker_api_with_llm_provider.py
├── test_link_extractor.py
├── test_llm_simple_url.py
├── test_llmtxt.py
├── test_main.py
├── test_memory_macos.py
├── test_multi_config.py
├── test_normalize_url.py
├── test_preserve_https_for_internal_links.py
├── test_scraping_strategy.py
├── test_virtual_scroll.py
├── test_web_crawler.py
└── test_webhook_feature.sh
└── uv.lock
/.claude/settings.local.json:
--------------------------------------------------------------------------------
1 | {
2 | "permissions": {
3 | "allow": [
4 | "Bash(cd:*)",
5 | "Bash(python3:*)",
6 | "Bash(python:*)",
7 | "Bash(grep:*)",
8 | "Bash(mkdir:*)",
9 | "Bash(cp:*)",
10 | "Bash(rm:*)",
11 | "Bash(true)",
12 | "Bash(./package-extension.sh:*)",
13 | "Bash(find:*)",
14 | "Bash(chmod:*)",
15 | "Bash(rg:*)",
16 | "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 5 -B 5 \"Script Builder\" docs/md_v2/apps/crawl4ai-assistant/)",
17 | "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 30 \"generateCode\\(events, format\\)\" docs/md_v2/apps/crawl4ai-assistant/content/content.js)",
18 | "Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg \"<style>\" docs/md_v2/apps/crawl4ai-assistant/index.html -A 5)",
19 | "Bash(git checkout:*)",
20 | "Bash(docker logs:*)",
21 | "Bash(curl:*)",
22 | "Bash(docker compose:*)",
23 | "Bash(./test-final-integration.sh:*)",
24 | "Bash(mv:*)"
25 | ]
26 | },
27 | "enableAllProjectMcpServers": false
28 | }
--------------------------------------------------------------------------------
/.env.txt:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY = "YOUR_GROQ_API"
2 | OPENAI_API_KEY = "YOUR_OPENAI_API"
3 | ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API"
4 | # You can add more API keys here
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Documentation
2 | *.html linguist-documentation
3 | docs/* linguist-documentation
4 | docs/examples/* linguist-documentation
5 | docs/md_v2/* linguist-documentation
6 |
7 | # Explicitly mark Python as the main language
8 | *.py linguist-detectable=true
9 | *.py linguist-language=Python
10 |
11 | # Exclude HTML from language statistics
12 | *.html linguist-detectable=false
13 |
--------------------------------------------------------------------------------
/.github/DISCUSSION_TEMPLATE/feature-requests.yml:
--------------------------------------------------------------------------------
1 | title: "[Feature Request]: "
2 | labels: ["⚙️ New"]
3 | body:
4 | - type: markdown
5 | attributes:
6 | value: |
7 | Thank you for your interest in suggesting a new feature! Before you submit, please take a moment to check whether it already exists in
8 | this discussions category to avoid duplicates. 😊
9 |
10 | - type: textarea
11 | id: needs_to_be_done
12 | attributes:
13 | label: What needs to be done?
14 | description: Please describe the feature or functionality you'd like to see.
15 | placeholder: "e.g., Return alt text along with images scraped from a webpages in Result"
16 | validations:
17 | required: true
18 |
19 | - type: textarea
20 | id: problem_to_solve
21 | attributes:
22 | label: What problem does this solve?
23 | description: Explain the pain point or issue this feature will help address.
24 | placeholder: "e.g., Bypass Captchas added by cloudflare"
25 | validations:
26 | required: true
27 |
28 | - type: textarea
29 | id: target_users
30 | attributes:
31 | label: Target users/beneficiaries
32 | description: Who would benefit from this feature? (e.g., specific teams, developers, users, etc.)
33 | placeholder: "e.g., Marketing teams, developers"
34 | validations:
35 | required: false
36 |
37 | - type: textarea
38 | id: current_workarounds
39 | attributes:
40 | label: Current alternatives/workarounds
41 | description: Are there any existing solutions or workarounds? How does this feature improve upon them?
42 | placeholder: "e.g., Users manually select the css classes mapped to data fields to extract them"
43 | validations:
44 | required: false
45 |
46 | - type: markdown
47 | attributes:
48 | value: |
49 | ### 💡 Implementation Ideas
50 |
51 | - type: textarea
52 | id: proposed_approach
53 | attributes:
54 | label: Proposed approach
55 | description: Share any ideas you have for how this feature could be implemented. Point out any challenges you foresee
56 | and the success metrics for this feature.
57 | placeholder: "e.g., Implement a breadth first traversal algorithm for scraper"
58 | validations:
59 | required: false
60 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | # GitHub Sponsors
4 | github: unclecode
5 |
6 | # Custom links for enterprise inquiries (uncomment when ready)
7 | # custom: ["https://crawl4ai.com/enterprise"]
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Feature Requests
4 | url: https://github.com/unclecode/crawl4ai/discussions/categories/feature-requests
5 | about: "Suggest new features or enhancements for Crawl4AI"
6 | - name: Forums - Q&A
7 | url: https://github.com/unclecode/crawl4ai/discussions/categories/forums-q-a
8 | about: "Ask questions or engage in general discussions about Crawl4AI"
9 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | Please include a summary of the change and/or which issues are fixed.
3 |
4 | e.g. `Fixes #123` (tag GitHub issue numbers in this format so the issues are automatically linked to your PR)
5 |
6 | ## List of files changed and why
7 | e.g. quickstart.py - updated the example to reflect the new changes
8 |
9 | ## How Has This Been Tested?
10 | Please describe the tests that you ran to verify your changes.
11 |
12 | ## Checklist:
13 |
14 | - [ ] My code follows the style guidelines of this project
15 | - [ ] I have performed a self-review of my own code
16 | - [ ] I have commented my code, particularly in hard-to-understand areas
17 | - [ ] I have made corresponding changes to the documentation
18 | - [ ] I have added/updated unit tests that prove my fix is effective or that my feature works
19 | - [ ] New and existing unit tests pass locally with my changes
20 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Discord GitHub Notifications
2 |
3 | on:
4 | issues:
5 | types: [opened]
6 | issue_comment:
7 | types: [created]
8 | pull_request:
9 | types: [opened]
10 | discussion:
11 | types: [created]
12 | watch:
13 | types: [started]
14 |
15 | jobs:
16 | notify-discord:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - name: Send to Google Apps Script (Stars only)
20 | if: github.event_name == 'watch'
21 | run: |
22 | curl -fSs -X POST "${{ secrets.GOOGLE_SCRIPT_ENDPOINT }}" \
23 | -H 'Content-Type: application/json' \
24 | -d '{"url":"${{ github.event.sender.html_url }}"}'
25 | - name: Set webhook based on event type
26 | id: set-webhook
27 | run: |
28 | if [ "${{ github.event_name }}" == "discussion" ]; then
29 | echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
30 | elif [ "${{ github.event_name }}" == "watch" ]; then
31 | echo "webhook=${{ secrets.DISCORD_STAR_GAZERS }}" >> $GITHUB_OUTPUT
32 | else
33 | echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
34 | fi
35 |
36 | - name: Discord Notification
37 | uses: Ilshidur/action-discord@master
38 | env:
39 | DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }}
40 | with:
41 | args: |
42 | ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) ||
43 | github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
44 | github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
45 | github.event_name == 'watch' && format('⭐ {0} starred Crawl4AI 🥳! Check out their profile: {1}', github.event.sender.login, github.event.sender.html_url) ||
46 | format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
47 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | recursive-include crawl4ai/js_snippet *.js
--------------------------------------------------------------------------------
/SPONSORS.md:
--------------------------------------------------------------------------------
1 | # 💖 Sponsors & Supporters
2 |
3 | Thank you to everyone supporting Crawl4AI! Your sponsorship helps keep this project open-source and actively maintained.
4 |
5 | ## 👑 Founding Sponsors
6 | *The first 50 sponsors who believed in our vision - permanently recognized*
7 |
8 | <!-- Founding sponsors will be listed here with special recognition -->
9 | 🎉 **Become a Founding Sponsor!** Only [X/50] spots remaining! [Join now →](https://github.com/sponsors/unclecode)
10 |
11 | ---
12 |
13 | ## 🏢 Data Infrastructure Partners ($2000/month)
14 | *These organizations are building their data sovereignty with Crawl4AI at the core*
15 |
16 | <!-- Data Infrastructure Partners will be listed here -->
17 | *Be the first Data Infrastructure Partner! [Join us →](https://github.com/sponsors/unclecode)*
18 |
19 | ---
20 |
21 | ## 💼 Growing Teams ($500/month)
22 | *Teams scaling their data extraction with Crawl4AI*
23 |
24 | <!-- Growing Teams will be listed here -->
25 | *Your team could be here! [Become a sponsor →](https://github.com/sponsors/unclecode)*
26 |
27 | ---
28 |
29 | ## 🚀 Builders ($50/month)
30 | *Developers and entrepreneurs building with Crawl4AI*
31 |
32 | <!-- Builders will be listed here -->
33 | *Join the builders! [Start sponsoring →](https://github.com/sponsors/unclecode)*
34 |
35 | ---
36 |
37 | ## 🌱 Believers ($5/month)
38 | *The community supporting data democratization*
39 |
40 | <!-- Believers will be listed here -->
41 | *Thank you to all our community believers!*
42 |
43 | ---
44 |
45 | ## 🤝 Want to Sponsor?
46 |
47 | Crawl4AI is the #1 trending open-source web crawler. We're building the future of data extraction - where organizations own their data pipelines instead of relying on rate-limited APIs.
48 |
49 | ### Available Sponsorship Tiers:
50 | - **🌱 Believer** ($5/mo) - Support the movement
51 | - **🚀 Builder** ($50/mo) - Priority support & early access
52 | - **💼 Growing Team** ($500/mo) - Bi-weekly syncs & optimization
53 | - **🏢 Data Infrastructure Partner** ($2000/mo) - Full partnership & dedicated support
54 |
55 | [View all tiers and benefits →](https://github.com/sponsors/unclecode)
56 |
57 | ### Enterprise & Custom Partnerships
58 |
59 | Building data extraction at scale? Need dedicated support or infrastructure? Let's talk about a custom partnership.
60 |
61 | 📧 Contact: [hello@crawl4ai.com](mailto:hello@crawl4ai.com) | 📅 [Schedule a call](https://calendar.app.google/rEpvi2UBgUQjWHfJ9)
62 |
63 | ---
64 |
65 | *This list is updated regularly. Sponsors at $50+ tiers can submit their logos via [hello@crawl4ai.com](mailto:hello@crawl4ai.com)*
--------------------------------------------------------------------------------
/cliff.toml:
--------------------------------------------------------------------------------
1 | [changelog]
2 | # Template format
3 | header = """
4 | # Changelog\n
5 | All notable changes to this project will be documented in this file.\n
6 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
7 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n
8 | """
9 |
10 | # Organize commits by type
11 | [git]
12 | conventional_commits = true
13 | filter_unconventional = true
14 | commit_parsers = [
15 | { message = "^feat", group = "Added"},
16 | { message = "^fix", group = "Fixed"},
17 | { message = "^doc", group = "Documentation"},
18 | { message = "^perf", group = "Performance"},
19 | { message = "^refactor", group = "Changed"},
20 | { message = "^style", group = "Changed"},
21 | { message = "^test", group = "Testing"},
22 | { message = "^chore\\(release\\): prepare for", skip = true},
23 | { message = "^chore", group = "Miscellaneous Tasks"},
24 | ]
--------------------------------------------------------------------------------
/crawl4ai/__version__.py:
--------------------------------------------------------------------------------
1 | # crawl4ai/__version__.py
2 |
3 | # This is the version that will be used for stable releases
4 | __version__ = "0.7.6"
5 |
6 | # For nightly builds, this gets set during build process
7 | __nightly_version__ = None
8 |
9 |
--------------------------------------------------------------------------------
/crawl4ai/crawlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/__init__.py
--------------------------------------------------------------------------------
/crawl4ai/crawlers/amazon_product/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/amazon_product/__init__.py
--------------------------------------------------------------------------------
/crawl4ai/crawlers/amazon_product/crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from crawl4ai.hub import BaseCrawler
4 |
5 | __meta__ = {
6 |     "version": "1.2.0",
7 |     "tested_on": ["amazon.com"],
8 |     "rate_limit": "50 RPM",
9 |     "schema": {"product": ["name", "price"]}
10 | }
11 |
12 | class AmazonProductCrawler(BaseCrawler):
13 |     async def run(self, url: str, **kwargs) -> str:
14 |         try:
15 |             self.logger.info(f"Crawling {url}")
16 |             return '{"product": {"name": "Test Amazon Product"}}'
17 |         except Exception as e:
18 |             self.logger.error(f"Crawl failed: {str(e)}")
19 |             return json.dumps({
20 |                 "error": str(e),
21 |                 "metadata": self.meta  # Include meta in error response
22 |             })
--------------------------------------------------------------------------------
/crawl4ai/crawlers/google_search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/crawlers/google_search/__init__.py
--------------------------------------------------------------------------------
/crawl4ai/deep_crawling/__init__.py:
--------------------------------------------------------------------------------
1 | # deep_crawling/__init__.py
2 | from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
3 | from .bfs_strategy import BFSDeepCrawlStrategy
4 | from .bff_strategy import BestFirstCrawlingStrategy
5 | from .dfs_strategy import DFSDeepCrawlStrategy
6 | from .filters import (
7 | FilterChain,
8 | ContentTypeFilter,
9 | DomainFilter,
10 | URLFilter,
11 | URLPatternFilter,
12 | FilterStats,
13 | ContentRelevanceFilter,
14 | SEOFilter
15 | )
16 | from .scorers import (
17 | KeywordRelevanceScorer,
18 | URLScorer,
19 | CompositeScorer,
20 | DomainAuthorityScorer,
21 | FreshnessScorer,
22 | PathDepthScorer,
23 | ContentTypeScorer
24 | )
25 |
26 | __all__ = [
27 | "DeepCrawlDecorator",
28 | "DeepCrawlStrategy",
29 | "BFSDeepCrawlStrategy",
30 | "BestFirstCrawlingStrategy",
31 | "DFSDeepCrawlStrategy",
32 | "FilterChain",
33 | "ContentTypeFilter",
34 | "DomainFilter",
35 | "URLFilter",
36 | "URLPatternFilter",
37 | "FilterStats",
38 | "ContentRelevanceFilter",
39 | "SEOFilter",
40 | "KeywordRelevanceScorer",
41 | "URLScorer",
42 | "CompositeScorer",
43 | "DomainAuthorityScorer",
44 | "FreshnessScorer",
45 | "PathDepthScorer",
46 | "ContentTypeScorer",
47 | ]
48 |
--------------------------------------------------------------------------------
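The exports above compose into a deep-crawl run roughly as follows; this is a sketch, and the constructor keywords used here (max_depth, include_external, filter_chain, allowed_domains, deep_crawl_strategy) are assumptions based on the public docs rather than on this dump:

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, DomainFilter

    async def main():
        # Breadth-first crawl, two levels deep, restricted to a single domain.
        strategy = BFSDeepCrawlStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=FilterChain([DomainFilter(allowed_domains=["example.com"])]),
        )
        config = CrawlerRunConfig(deep_crawl_strategy=strategy)
        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun("https://example.com", config=config)
            for r in results:
                print(r.url)

    asyncio.run(main())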
/crawl4ai/html2text/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import main
2 |
3 | main()
4 |
--------------------------------------------------------------------------------
/crawl4ai/html2text/_typing.py:
--------------------------------------------------------------------------------
1 | class OutCallback:
2 | def __call__(self, s: str) -> None:
3 | ...
4 |
--------------------------------------------------------------------------------
/crawl4ai/html2text/elements.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional
2 |
3 |
4 | class AnchorElement:
5 | __slots__ = ["attrs", "count", "outcount"]
6 |
7 | def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
8 | self.attrs = attrs
9 | self.count = count
10 | self.outcount = outcount
11 |
12 |
13 | class ListElement:
14 | __slots__ = ["name", "num"]
15 |
16 | def __init__(self, name: str, num: int):
17 | self.name = name
18 | self.num = num
19 |
--------------------------------------------------------------------------------
/crawl4ai/hub.py:
--------------------------------------------------------------------------------
1 | # crawl4ai/hub.py
2 | from abc import ABC, abstractmethod
3 | from typing import Dict, Type, Union
4 | import logging
5 | import importlib
6 | from pathlib import Path
7 | import inspect
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | class BaseCrawler(ABC):
13 | def __init__(self):
14 | self.logger = logging.getLogger(self.__class__.__name__)
15 |
16 | @abstractmethod
17 | async def run(self, url: str = "", **kwargs) -> str:
18 | """
19 | Implement this method to return JSON string.
20 | Must accept URL + arbitrary kwargs for flexibility.
21 | """
22 | pass
23 |
24 | def __init_subclass__(cls, **kwargs):
25 | """Enforce interface validation on subclassing"""
26 | super().__init_subclass__(**kwargs)
27 |
28 | # Verify run method signature
29 | run_method = cls.run
30 |         if run_method.__code__.co_argcount < 2:  # needs at least self + url
31 | raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'")
32 |
33 | # Verify async nature
34 | if not inspect.iscoroutinefunction(run_method):
35 | raise TypeError(f"{cls.__name__}.run must be async")
36 |
37 | class CrawlerHub:
38 | _crawlers: Dict[str, Type[BaseCrawler]] = {}
39 |
40 | @classmethod
41 | def _discover_crawlers(cls):
42 | """Dynamically load crawlers from /crawlers in 3 lines"""
43 | base_path = Path(__file__).parent / "crawlers"
44 | for crawler_dir in base_path.iterdir():
45 | if crawler_dir.is_dir():
46 | try:
47 | module = importlib.import_module(
48 | f"crawl4ai.crawlers.{crawler_dir.name}.crawler"
49 | )
50 | for attr in dir(module):
51 | cls._maybe_register_crawler(
52 | getattr(module, attr), crawler_dir.name
53 | )
54 | except Exception as e:
55 | logger.warning(f"Failed {crawler_dir.name}: {str(e)}")
56 |
57 | @classmethod
58 | def _maybe_register_crawler(cls, obj, name: str):
59 | """Brilliant one-liner registration"""
60 | if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler:
61 | module = importlib.import_module(obj.__module__)
62 | obj.meta = getattr(module, "__meta__", {})
63 | cls._crawlers[name] = obj
64 |
65 | @classmethod
66 | def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
67 | if not cls._crawlers:
68 | cls._discover_crawlers()
69 | return cls._crawlers.get(name)
--------------------------------------------------------------------------------
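A minimal usage sketch of the hub above, grounded in the files in this dump (the URL is illustrative): CrawlerHub.get() lazily discovers the packaged crawlers and returns the registered class.

    import asyncio, json
    from crawl4ai.hub import CrawlerHub

    async def main():
        # The first call triggers _discover_crawlers(); unknown names return None.
        crawler_cls = CrawlerHub.get("amazon_product")
        if crawler_cls is None:
            raise SystemExit("crawler not found")
        print(crawler_cls.meta.get("version"))           # populated from the module's __meta__
        payload = await crawler_cls().run(url="https://example.com/product")
        print(json.loads(payload))

    asyncio.run(main())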
/crawl4ai/js_snippet/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | # Given the name of a JS script, load it from the folder containing this module and return its contents as a string, raising a clear error if the file is missing.
5 | def load_js_script(script_name):
6 | # Get the path of the current script
7 | current_script_path = os.path.dirname(os.path.realpath(__file__))
8 | # Get the path of the script to load
9 | script_path = os.path.join(current_script_path, script_name + ".js")
10 | # Check if the script exists
11 | if not os.path.exists(script_path):
12 | raise ValueError(
13 | f"Script {script_name} not found in the folder {current_script_path}"
14 | )
15 | # Load the content of the script
16 | with open(script_path, "r") as f:
17 | script_content = f.read()
18 | return script_content
19 |
--------------------------------------------------------------------------------
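The loader above is used like this (the script name maps to one of the .js files in this folder):

    from crawl4ai.js_snippet import load_js_script

    # Returns the raw JavaScript source as a string; raises ValueError if the file is missing.
    js = load_js_script("navigator_overrider")
    print(js.splitlines()[0])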
/crawl4ai/js_snippet/navigator_overrider.js:
--------------------------------------------------------------------------------
1 | // Pass the Permissions Test.
2 | const originalQuery = window.navigator.permissions.query;
3 | window.navigator.permissions.query = (parameters) =>
4 | parameters.name === "notifications"
5 | ? Promise.resolve({ state: Notification.permission })
6 | : originalQuery(parameters);
7 | Object.defineProperty(navigator, "webdriver", {
8 | get: () => undefined,
9 | });
10 | window.navigator.chrome = {
11 | runtime: {},
12 | // Add other properties if necessary
13 | };
14 | Object.defineProperty(navigator, "plugins", {
15 | get: () => [1, 2, 3, 4, 5],
16 | });
17 | Object.defineProperty(navigator, "languages", {
18 | get: () => ["en-US", "en"],
19 | });
20 | Object.defineProperty(document, "hidden", {
21 | get: () => false,
22 | });
23 | Object.defineProperty(document, "visibilityState", {
24 | get: () => "visible",
25 | });
26 |
--------------------------------------------------------------------------------
/crawl4ai/js_snippet/update_image_dimensions.js:
--------------------------------------------------------------------------------
1 | () => {
2 | return new Promise((resolve) => {
3 | const filterImage = (img) => {
4 | // Filter out images that are too small
5 | if (img.width < 100 && img.height < 100) return false;
6 |
7 | // Filter out images that are not visible
8 | const rect = img.getBoundingClientRect();
9 | if (rect.width === 0 || rect.height === 0) return false;
10 |
11 | // Filter out images with certain class names (e.g., icons, thumbnails)
12 | if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false;
13 |
14 | // Filter out images with certain patterns in their src (e.g., placeholder images)
15 | if (img.src.includes("placeholder") || img.src.includes("icon")) return false;
16 |
17 | return true;
18 | };
19 |
20 | const images = Array.from(document.querySelectorAll("img")).filter(filterImage);
21 | let imagesLeft = images.length;
22 |
23 | if (imagesLeft === 0) {
24 | resolve();
25 | return;
26 | }
27 |
28 | const checkImage = (img) => {
29 | if (img.complete && img.naturalWidth !== 0) {
30 | img.setAttribute("width", img.naturalWidth);
31 | img.setAttribute("height", img.naturalHeight);
32 | imagesLeft--;
33 | if (imagesLeft === 0) resolve();
34 | }
35 | };
36 |
37 | images.forEach((img) => {
38 | checkImage(img);
39 | if (!img.complete) {
40 | img.onload = () => {
41 | checkImage(img);
42 | };
43 | img.onerror = () => {
44 | imagesLeft--;
45 | if (imagesLeft === 0) resolve();
46 | };
47 | }
48 | });
49 |
50 | // Fallback timeout of 5 seconds
51 | // setTimeout(() => resolve(), 5000);
52 | resolve();
53 | });
54 | };
55 |
--------------------------------------------------------------------------------
/crawl4ai/legacy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/crawl4ai/legacy/__init__.py
--------------------------------------------------------------------------------
/crawl4ai/legacy/version_manager.py:
--------------------------------------------------------------------------------
1 | # version_manager.py
2 | from pathlib import Path
3 | from packaging import version
4 | from . import __version__
5 |
6 |
7 | class VersionManager:
8 | def __init__(self):
9 | self.home_dir = Path.home() / ".crawl4ai"
10 | self.version_file = self.home_dir / "version.txt"
11 |
12 | def get_installed_version(self):
13 | """Get the version recorded in home directory"""
14 | if not self.version_file.exists():
15 | return None
16 | try:
17 | return version.parse(self.version_file.read_text().strip())
18 | except:
19 | return None
20 |
21 | def update_version(self):
22 | """Update the version file to current library version"""
23 | self.version_file.write_text(__version__.__version__)
24 |
25 | def needs_update(self):
26 | """Check if database needs update based on version"""
27 | installed = self.get_installed_version()
28 | current = version.parse(__version__.__version__)
29 | return installed is None or installed < current
30 |
--------------------------------------------------------------------------------
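A short sketch of how this helper is meant to be driven, judging from its methods (it assumes ~/.crawl4ai exists and that the legacy package exposes __version__ as the relative import suggests):

    from crawl4ai.legacy.version_manager import VersionManager

    vm = VersionManager()
    if vm.needs_update():      # True on first run or after upgrading the library
        # ... run any migrations here ...
        vm.update_version()    # record the current version in ~/.crawl4ai/version.txt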
/crawl4ai/script/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | C4A-Script: A domain-specific language for web automation in Crawl4AI
3 | """
4 |
5 | from .c4a_compile import C4ACompiler, compile, validate, compile_file
6 | from .c4a_result import (
7 | CompilationResult,
8 | ValidationResult,
9 | ErrorDetail,
10 | WarningDetail,
11 | ErrorType,
12 | Severity,
13 | Suggestion
14 | )
15 |
16 | __all__ = [
17 | # Main compiler
18 | "C4ACompiler",
19 |
20 | # Convenience functions
21 | "compile",
22 | "validate",
23 | "compile_file",
24 |
25 | # Result types
26 | "CompilationResult",
27 | "ValidationResult",
28 | "ErrorDetail",
29 | "WarningDetail",
30 |
31 | # Enums
32 | "ErrorType",
33 | "Severity",
34 | "Suggestion"
35 | ]
--------------------------------------------------------------------------------
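A rough sketch of the compile entry point exported above; the C4A command syntax and the result attributes used here (success, js_code, errors, message) are assumptions based on the exported CompilationResult/ErrorDetail types, not verified against this dump:

    from crawl4ai.script import compile

    script = """
    GO https://example.com
    WAIT 2
    CLICK `#load-more`
    """

    result = compile(script)          # -> CompilationResult
    if result.success:
        print(result.js_code)         # generated JavaScript for the crawler
    else:
        for err in result.errors:     # -> ErrorDetail entries
            print(err.message)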
/deploy/docker/.dockerignore:
--------------------------------------------------------------------------------
1 | # .dockerignore
2 | *
3 |
4 | # Allow specific files and directories when using local installation
5 | !crawl4ai/
6 | !docs/
7 | !deploy/docker/
8 | !setup.py
9 | !pyproject.toml
10 | !README.md
11 | !LICENSE
12 | !MANIFEST.in
13 | !setup.cfg
14 | !mkdocs.yml
15 |
16 | .git/
17 | __pycache__/
18 | *.pyc
19 | *.pyo
20 | *.pyd
21 | .DS_Store
22 | .env
23 | .venv
24 | venv/
25 | tests/
26 | coverage.xml
27 | *.log
28 | *.swp
29 | *.egg-info/
30 | dist/
31 | build/
--------------------------------------------------------------------------------
/deploy/docker/.llm.env.example:
--------------------------------------------------------------------------------
1 | # LLM Provider Keys
2 | OPENAI_API_KEY=your_openai_key_here
3 | DEEPSEEK_API_KEY=your_deepseek_key_here
4 | ANTHROPIC_API_KEY=your_anthropic_key_here
5 | GROQ_API_KEY=your_groq_key_here
6 | TOGETHER_API_KEY=your_together_key_here
7 | MISTRAL_API_KEY=your_mistral_key_here
8 | GEMINI_API_TOKEN=your_gemini_key_here
9 |
10 | # Optional: Override the default LLM provider
11 | # Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
12 | # If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
13 | # LLM_PROVIDER=anthropic/claude-3-opus
14 |
15 | # Optional: Global LLM temperature setting (0.0-2.0)
16 | # Controls randomness in responses. Lower = more focused, Higher = more creative
17 | # LLM_TEMPERATURE=0.7
18 |
19 | # Optional: Global custom API base URL
20 | # Use this to point to custom endpoints or proxy servers
21 | # LLM_BASE_URL=https://api.custom.com/v1
22 |
23 | # Optional: Provider-specific temperature overrides
24 | # These take precedence over the global LLM_TEMPERATURE
25 | # OPENAI_TEMPERATURE=0.5
26 | # ANTHROPIC_TEMPERATURE=0.3
27 | # GROQ_TEMPERATURE=0.8
28 |
29 | # Optional: Provider-specific base URL overrides
30 | # Use for provider-specific proxy endpoints
31 | # OPENAI_BASE_URL=https://custom-openai.company.com/v1
32 | # GROQ_BASE_URL=https://custom-groq.company.com/v1
--------------------------------------------------------------------------------
/deploy/docker/crawler_pool.py:
--------------------------------------------------------------------------------
1 | # crawler_pool.py (new file)
2 | import asyncio, json, hashlib, time, psutil
3 | from contextlib import suppress
4 | from typing import Dict
5 | from crawl4ai import AsyncWebCrawler, BrowserConfig
6 | from typing import Dict
7 | from utils import load_config
8 |
9 | CONFIG = load_config()
10 |
11 | POOL: Dict[str, AsyncWebCrawler] = {}
12 | LAST_USED: Dict[str, float] = {}
13 | LOCK = asyncio.Lock()
14 |
15 | MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
16 | IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min
17 |
18 | def _sig(cfg: BrowserConfig) -> str:
19 | payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
20 | return hashlib.sha1(payload.encode()).hexdigest()
21 |
22 | async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
23 |     sig = _sig(cfg)
24 |     try:
25 | async with LOCK:
26 | if sig in POOL:
27 | LAST_USED[sig] = time.time();
28 | return POOL[sig]
29 | if psutil.virtual_memory().percent >= MEM_LIMIT:
30 | raise MemoryError("RAM pressure – new browser denied")
31 | crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
32 | await crawler.start()
33 | POOL[sig] = crawler; LAST_USED[sig] = time.time()
34 | return crawler
35 | except MemoryError as e:
36 | raise MemoryError(f"RAM pressure – new browser denied: {e}")
37 | except Exception as e:
38 | raise RuntimeError(f"Failed to start browser: {e}")
39 | finally:
40 | if sig in POOL:
41 | LAST_USED[sig] = time.time()
42 | else:
43 | # If we failed to start the browser, we should remove it from the pool
44 | POOL.pop(sig, None)
45 | LAST_USED.pop(sig, None)
46 | # If we failed to start the browser, we should remove it from the pool
47 | async def close_all():
48 | async with LOCK:
49 | await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
50 | POOL.clear(); LAST_USED.clear()
51 |
52 | async def janitor():
53 | while True:
54 | await asyncio.sleep(60)
55 | now = time.time()
56 | async with LOCK:
57 | for sig, crawler in list(POOL.items()):
58 | if now - LAST_USED[sig] > IDLE_TTL:
59 | with suppress(Exception): await crawler.close()
60 | POOL.pop(sig, None); LAST_USED.pop(sig, None)
61 |
--------------------------------------------------------------------------------
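A sketch of how the pool above is driven (run from inside deploy/docker so that its utils/config imports resolve; the crawl itself uses the standard AsyncWebCrawler API):

    import asyncio
    from crawl4ai import BrowserConfig, CrawlerRunConfig
    from crawler_pool import get_crawler, janitor, close_all

    async def main():
        cleaner = asyncio.create_task(janitor())      # evicts browsers idle longer than IDLE_TTL
        try:
            cfg = BrowserConfig(headless=True)
            crawler = await get_crawler(cfg)          # reuses a pooled browser with the same config signature
            result = await crawler.arun("https://example.com", config=CrawlerRunConfig())
            print(result.success)
        finally:
            cleaner.cancel()
            await close_all()

    asyncio.run(main())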
/deploy/docker/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.115.12
2 | uvicorn>=0.34.2
3 | gunicorn>=23.0.0
4 | slowapi==0.1.9
5 | prometheus-fastapi-instrumentator>=7.1.0
6 | redis>=5.2.1
7 | jwt>=1.3.1
8 | dnspython>=2.7.0
9 | email-validator==2.2.0
10 | sse-starlette==2.2.1
11 | pydantic>=2.11
12 | rank-bm25==0.2.2
13 | anyio==4.9.0
14 | PyJWT==2.10.1
15 | mcp>=1.18.0
16 | websockets>=15.0.1
17 | httpx[http2]>=0.27.2
18 |
--------------------------------------------------------------------------------
/deploy/docker/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | nodaemon=true ; Run supervisord in the foreground
3 | logfile=/dev/null ; Log supervisord output to stdout/stderr
4 | logfile_maxbytes=0
5 |
6 | [program:redis]
7 | command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
8 | user=appuser ; Run redis as our non-root user
9 | autorestart=true
10 | priority=10
11 | stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
12 | stdout_logfile_maxbytes=0
13 | stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
14 | stderr_logfile_maxbytes=0
15 |
16 | [program:gunicorn]
17 | command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
18 | directory=/app ; Working directory for the app
19 | user=appuser ; Run gunicorn as our non-root user
20 | autorestart=true
21 | priority=20
22 | environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
23 | stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
24 | stdout_logfile_maxbytes=0
25 | stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
26 | stderr_logfile_maxbytes=0
27 |
28 | # Optional: Add filebeat or other logging agents here if needed
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | # Shared configuration for all environments
4 | x-base-config: &base-config
5 | ports:
6 | - "11235:11235" # Gunicorn port
7 | env_file:
8 | - .llm.env # API keys (create from .llm.env.example)
9 | environment:
10 | - OPENAI_API_KEY=${OPENAI_API_KEY:-}
11 | - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
12 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
13 | - GROQ_API_KEY=${GROQ_API_KEY:-}
14 | - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
15 | - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
16 | - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
17 | - LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
18 | volumes:
19 | - /dev/shm:/dev/shm # Chromium performance
20 | deploy:
21 | resources:
22 | limits:
23 | memory: 4G
24 | reservations:
25 | memory: 1G
26 | restart: unless-stopped
27 | healthcheck:
28 | test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
29 | interval: 30s
30 | timeout: 10s
31 | retries: 3
32 | start_period: 40s
33 | user: "appuser"
34 |
35 | services:
36 | crawl4ai:
37 | # 1. Default: Pull multi-platform test image from Docker Hub
38 | # 2. Override with local image via: IMAGE=local-test docker compose up
39 | image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}}
40 |
41 | # Local build config (used with --build)
42 | build:
43 | context: .
44 | dockerfile: Dockerfile
45 | args:
46 | INSTALL_TYPE: ${INSTALL_TYPE:-default}
47 | ENABLE_GPU: ${ENABLE_GPU:-false}
48 |
49 | # Inherit shared config
50 | <<: *base-config
--------------------------------------------------------------------------------
/docs/apps/linkdin/schemas/company_card.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "LinkedIn Company Search Result Card",
3 | "baseSelector": "div[data-chameleon-result-urn][data-view-name=\"search-entity-result-universal-template\"]",
4 | "baseFields": [
5 | {
6 | "name": "chameleon_result_urn",
7 | "type": "attribute",
8 | "attribute": "data-chameleon-result-urn"
9 | },
10 | {
11 | "name": "view_name",
12 | "type": "attribute",
13 | "attribute": "data-view-name"
14 | }
15 | ],
16 | "fields": [
17 | {
18 | "name": "handle",
19 | "selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
20 | "type": "attribute",
21 | "attribute": "href"
22 | },
23 | {
24 | "name": "profile_image",
25 | "selector": "div.ivm-image-view-model img",
26 | "type": "attribute",
27 | "attribute": "src"
28 | },
29 | {
30 | "name": "name",
31 | "selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
32 | "type": "text"
33 | },
34 | {
35 | "name": "descriptor",
36 | "selector": "div.mb1 > div[class*=\"t-14 t-black\"]",
37 | "type": "text"
38 | },
39 | {
40 | "name": "about",
41 | "selector": "p.entity-result__summary--2-lines",
42 | "type": "text"
43 | },
44 | {
45 | "name": "followers",
46 | "selector": "div.mb1 > div:nth-of-type(3)",
47 | "type": "regex",
48 | "pattern": "(\\d+[KM]?) followers"
49 | }
50 | ]
51 | }
--------------------------------------------------------------------------------
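Schemas like the one above feed Crawl4AI's JSON-CSS extraction; a hedged sketch of the round trip (the strategy and keyword names follow the public API and are assumptions here, and a real run would need an authenticated LinkedIn session):

    import asyncio, json
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    async def main():
        with open("docs/apps/linkdin/schemas/company_card.json") as f:
            schema = json.load(f)
        config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                "https://www.linkedin.com/search/results/companies/?keywords=ai",
                config=config,
            )
            print(result.extracted_content)   # JSON array of company cards

    asyncio.run(main())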
/docs/apps/linkdin/schemas/people_card.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "LinkedIn People Profile Card",
3 | "baseSelector": "li.org-people-profile-card__profile-card-spacing",
4 | "baseFields": [],
5 | "fields": [
6 | {
7 | "name": "profile_url",
8 | "selector": "div.artdeco-entity-lockup__title a[data-test-app-aware-link]",
9 | "type": "attribute",
10 | "attribute": "href"
11 | },
12 | {
13 | "name": "avatar_url",
14 | "selector": "div.artdeco-entity-lockup__image img",
15 | "type": "attribute",
16 | "attribute": "src"
17 | },
18 | {
19 | "name": "name",
20 | "selector": "div.artdeco-entity-lockup__title a div.lt-line-clamp--single-line",
21 | "type": "text"
22 | },
23 | {
24 | "name": "headline",
25 | "selector": "div.artdeco-entity-lockup__subtitle div.lt-line-clamp--multi-line",
26 | "type": "text"
27 | },
28 | {
29 | "name": "followers",
30 | "selector": "span.text-align-center span.lt-line-clamp--multi-line",
31 | "type": "regex",
32 | "pattern": "(\\d+)"
33 | },
34 | {
35 | "name": "connection_degree",
36 | "selector": "span.artdeco-entity-lockup__degree",
37 | "type": "regex",
38 | "pattern": "(\\d+\\w+)"
39 | }
40 | ]
41 | }
--------------------------------------------------------------------------------
/docs/apps/linkdin/templates/ai.js:
--------------------------------------------------------------------------------
1 | // ==== File: ai.js ====
2 |
3 | class ApiHandler {
4 | constructor(apiKey = null) {
5 | this.apiKey = apiKey || localStorage.getItem("openai_api_key") || "";
6 | console.log("ApiHandler ready");
7 | }
8 |
9 | setApiKey(k) {
10 | this.apiKey = k.trim();
11 | if (this.apiKey) localStorage.setItem("openai_api_key", this.apiKey);
12 | }
13 |
14 | async *chatStream(messages, {model = "gpt-4o", temperature = 0.7} = {}) {
15 | if (!this.apiKey) throw new Error("OpenAI API key missing");
16 | const payload = {model, messages, stream: true, max_tokens: 1024};
17 | const controller = new AbortController();
18 |
19 | const res = await fetch("https://api.openai.com/v1/chat/completions", {
20 | method: "POST",
21 | headers: {
22 | "Content-Type": "application/json",
23 | Authorization: `Bearer ${this.apiKey}`,
24 | },
25 | body: JSON.stringify(payload),
26 | signal: controller.signal,
27 | });
28 | if (!res.ok) throw new Error(`OpenAI: ${res.statusText}`);
29 | const reader = res.body.getReader();
30 | const dec = new TextDecoder();
31 |
32 | let buf = "";
33 | while (true) {
34 | const {done, value} = await reader.read();
35 | if (done) break;
36 | buf += dec.decode(value, {stream: true});
37 |       const lines = buf.split("\n");
38 |       buf = lines.pop(); // keep the trailing partial line for the next chunk
39 |       for (const line of lines) {
40 |         if (!line.startsWith("data: ")) continue;
41 |         if (line.includes("[DONE]")) return;
42 |         const delta = JSON.parse(line.slice(6)).choices?.[0]?.delta?.content;
43 |         if (delta) yield delta;
44 |       }
45 |     }
46 | }
47 | }
48 |
49 | window.API = new ApiHandler();
50 |
--------------------------------------------------------------------------------
/docs/assets/pitch-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/assets/pitch-dark.png
--------------------------------------------------------------------------------
/docs/assets/powered-by-dark.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
2 | <!-- Dark Theme -->
3 | <g>
4 | <defs>
5 | <pattern id="halftoneDark" width="4" height="4" patternUnits="userSpaceOnUse">
6 | <circle cx="2" cy="2" r="1" fill="#eee" opacity="0.1"/>
7 | </pattern>
8 | <pattern id="halftoneTextDark" width="3" height="3" patternUnits="userSpaceOnUse">
9 | <circle cx="1.5" cy="1.5" r="2" fill="#aaa" opacity="0.2"/>
10 | </pattern>
11 | </defs>
12 |     <!-- Outer border rectangle -->
13 | <rect width="120" height="35" rx="5" fill="#111"/>
14 | <!-- Dark background slightly smaller to show thicker border -->
15 | <rect x="2" y="2" width="116" height="31" rx="4" fill="#1a1a1a"/>
16 | <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneDark)"/>
17 |
18 | <!-- Logo with halftone -->
19 | <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#eee" stroke-width="2"/>
20 | <path d="M18 17.5 L27 17.5" stroke="#eee" stroke-width="2"/>
21 | <circle cx="22.5" cy="17.5" r="2" fill="#eee"/>
22 |
23 | <text x="40" y="23" fill="#eee" font-family="Arial, sans-serif" font-weight="500" font-size="14">Crawl4AI</text>
24 | </g>
25 | </svg>
--------------------------------------------------------------------------------
/docs/assets/powered-by-disco.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
2 | <g>
3 | <defs>
4 | <pattern id="cyberdots" width="4" height="4" patternUnits="userSpaceOnUse">
5 | <circle cx="2" cy="2" r="1">
6 | <animate attributeName="fill"
7 | values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
8 | dur="6s"
9 | repeatCount="indefinite"/>
10 | <animate attributeName="opacity"
11 | values="0.2;0.4;0.2"
12 | dur="4s"
13 | repeatCount="indefinite"/>
14 | </circle>
15 | </pattern>
16 | <filter id="neonGlow" x="-20%" y="-20%" width="140%" height="140%">
17 | <feGaussianBlur stdDeviation="1" result="blur"/>
18 | <feFlood flood-color="#FF2EC4" flood-opacity="0.2">
19 | <animate attributeName="flood-color"
20 | values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
21 | dur="8s"
22 | repeatCount="indefinite"/>
23 | </feFlood>
24 | <feComposite in2="blur" operator="in"/>
25 | <feMerge>
26 | <feMergeNode/>
27 | <feMergeNode in="SourceGraphic"/>
28 | </feMerge>
29 | </filter>
30 | </defs>
31 |
32 | <rect width="120" height="35" rx="5" fill="#0A0A0F"/>
33 | <rect x="2" y="2" width="116" height="31" rx="4" fill="#16161E"/>
34 | <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#cyberdots)"/>
35 |
36 | <!-- Logo with animated neon -->
37 | <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)">
38 | <animate attributeName="stroke"
39 | values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
40 | dur="8s"
41 | repeatCount="indefinite"/>
42 | </path>
43 | <path d="M18 17.5 L27 17.5" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)">
44 | <animate attributeName="stroke"
45 | values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
46 | dur="8s"
47 | repeatCount="indefinite"/>
48 | </path>
49 | <circle cx="22.5" cy="17.5" r="2" fill="#0BC5EA">
50 | <animate attributeName="fill"
51 | values="#0BC5EA;#FF2EC4;#8B5CF6;#0BC5EA"
52 | dur="8s"
53 | repeatCount="indefinite"/>
54 | </circle>
55 |
56 | <text x="40" y="23" font-family="Arial, sans-serif" font-weight="500" font-size="14" filter="url(#neonGlow)">
57 | <animate attributeName="fill"
58 | values="#FF2EC4;#8B5CF6;#0BC5EA;#FF2EC4"
59 | dur="8s"
60 | repeatCount="indefinite"/>
61 | Crawl4AI
62 | </text>
63 | </g>
64 | </svg>
--------------------------------------------------------------------------------
/docs/assets/powered-by-light.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
2 | <g>
3 | <defs>
4 | <pattern id="halftoneLight" width="4" height="4" patternUnits="userSpaceOnUse">
5 | <circle cx="2" cy="2" r="1" fill="#111" opacity="0.1"/>
6 | </pattern>
7 | </defs>
8 | <!-- Dark border -->
9 | <rect width="120" height="35" rx="5" fill="#DDD"/>
10 | <!-- Light background -->
11 | <rect x="2" y="2" width="116" height="31" rx="4" fill="#fff"/>
12 | <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneLight)"/>
13 |
14 | <!-- Logo -->
15 | <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#111" stroke-width="2"/>
16 | <path d="M18 17.5 L27 17.5" stroke="#111" stroke-width="2"/>
17 | <circle cx="22.5" cy="17.5" r="2" fill="#111"/>
18 |
19 | <text x="40" y="23" fill="#111" font-family="Arial, sans-serif" font-weight="500" font-size="14">Crawl4AI</text>
20 | </g>
21 | </svg>
--------------------------------------------------------------------------------
/docs/assets/powered-by-night.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" width="120" height="35" viewBox="0 0 120 35">
2 | <g>
3 | <defs>
4 | <pattern id="halftoneDark" width="4" height="4" patternUnits="userSpaceOnUse">
5 | <circle cx="2" cy="2" r="1" fill="#8B5CF6" opacity="0.1"/>
6 | </pattern>
7 | <filter id="neonGlow" x="-20%" y="-20%" width="140%" height="140%">
8 | <feGaussianBlur stdDeviation="1" result="blur"/>
9 | <feFlood flood-color="#8B5CF6" flood-opacity="0.2"/>
10 | <feComposite in2="blur" operator="in"/>
11 | <feMerge>
12 | <feMergeNode/>
13 | <feMergeNode in="SourceGraphic"/>
14 | </feMerge>
15 | </filter>
16 | </defs>
17 | <rect width="120" height="35" rx="5" fill="#0A0A0F"/>
18 | <rect x="2" y="2" width="116" height="31" rx="4" fill="#16161E"/>
19 | <rect x="2" y="2" width="116" height="31" rx="4" fill="url(#halftoneDark)"/>
20 |
21 | <!-- Logo with neon glow -->
22 | <path d="M30 17.5 a7.5 7.5 0 1 1 -15 0 a7.5 7.5 0 1 1 15 0" fill="none" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)"/>
23 | <path d="M18 17.5 L27 17.5" stroke="#8B5CF6" stroke-width="2" filter="url(#neonGlow)"/>
24 | <circle cx="22.5" cy="17.5" r="2" fill="#8B5CF6"/>
25 |
26 | <text x="40" y="23" fill="#fff" font-family="Arial, sans-serif" font-weight="500" font-size="14" filter="url(#neonGlow)">Crawl4AI</text>
27 | </g>
28 | </svg>
--------------------------------------------------------------------------------
/docs/blog/release-v0.7.1.md:
--------------------------------------------------------------------------------
1 | # 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
2 |
3 | *July 17, 2025 • 2 min read*
4 |
5 | ---
6 |
7 | A small maintenance release that removes unused code and improves documentation.
8 |
9 | ## 🎯 What's Changed
10 |
11 | - **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
12 | - **Updated documentation** with better examples and parameter explanations
13 | - **Fixed virtual scroll configuration** examples in docs
14 |
15 | ## 🧹 Code Cleanup
16 |
17 | Removed the unused `StealthConfig` import and configuration, which were never referenced anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
18 |
19 | ```python
20 | # Removed unused code:
21 | from playwright_stealth import StealthConfig
22 | stealth_config = StealthConfig(...) # This was never used
23 | ```
24 |
25 | ## 📖 Documentation Updates
26 |
27 | - Fixed adaptive crawling parameter examples
28 | - Updated session management documentation
29 | - Corrected virtual scroll configuration examples
30 |
31 | ## 🚀 Installation
32 |
33 | ```bash
34 | pip install crawl4ai==0.7.1
35 | ```
36 |
37 | No breaking changes - upgrade directly from v0.7.0.
38 |
39 | ---
40 |
41 | Questions? Issues?
42 | - GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
43 | - Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
--------------------------------------------------------------------------------
/docs/examples/assets/audio.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/audio.mp3
--------------------------------------------------------------------------------
/docs/examples/assets/basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/basic.png
--------------------------------------------------------------------------------
/docs/examples/assets/cosine_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/cosine_extraction.png
--------------------------------------------------------------------------------
/docs/examples/assets/css_js.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/css_js.png
--------------------------------------------------------------------------------
/docs/examples/assets/css_selector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/css_selector.png
--------------------------------------------------------------------------------
/docs/examples/assets/exec_script.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/exec_script.png
--------------------------------------------------------------------------------
/docs/examples/assets/instagram_grid_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/instagram_grid_result.png
--------------------------------------------------------------------------------
/docs/examples/assets/llm_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/llm_extraction.png
--------------------------------------------------------------------------------
/docs/examples/assets/semantic_extraction_cosine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/semantic_extraction_cosine.png
--------------------------------------------------------------------------------
/docs/examples/assets/semantic_extraction_llm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/assets/semantic_extraction_llm.png
--------------------------------------------------------------------------------
/docs/examples/async_webcrawler_multiple_urls_example.py:
--------------------------------------------------------------------------------
1 | # File: async_webcrawler_multiple_urls_example.py
2 | import os, sys
3 |
4 | # Add the repository root (two directory levels up from this file's folder) to sys.path so crawl4ai can be imported
5 | parent_dir = os.path.dirname(
6 | os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7 | )
8 | sys.path.append(parent_dir)
9 |
10 | import asyncio
11 | from crawl4ai import AsyncWebCrawler
12 |
13 |
14 | async def main():
15 | # Initialize the AsyncWebCrawler
16 | async with AsyncWebCrawler(verbose=True) as crawler:
17 | # List of URLs to crawl
18 | urls = [
19 | "https://example.com",
20 | "https://python.org",
21 | "https://github.com",
22 | "https://stackoverflow.com",
23 | "https://news.ycombinator.com",
24 | ]
25 |
26 | # Set up crawling parameters
27 | word_count_threshold = 100
28 |
29 | # Run the crawling process for multiple URLs
30 | results = await crawler.arun_many(
31 | urls=urls,
32 | word_count_threshold=word_count_threshold,
33 | bypass_cache=True,
34 | verbose=True,
35 | )
36 |
37 | # Process the results
38 | for result in results:
39 | if result.success:
40 | print(f"Successfully crawled: {result.url}")
41 | print(f"Title: {result.metadata.get('title', 'N/A')}")
42 | print(f"Word count: {len(result.markdown.split())}")
43 | print(
44 | f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
45 | )
46 | print(f"Number of images: {len(result.media.get('images', []))}")
47 | print("---")
48 | else:
49 | print(f"Failed to crawl: {result.url}")
50 | print(f"Error: {result.error_message}")
51 | print("---")
52 |
53 |
54 | if __name__ == "__main__":
55 | asyncio.run(main())
56 |
--------------------------------------------------------------------------------
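The example above passes `word_count_threshold` and `bypass_cache` directly to `arun_many()`, which is the older calling convention. A hedged sketch of the same crawl using the config-object style used elsewhere in this repo (URL list shortened for brevity):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig


async def main():
    config = CrawlerRunConfig(
        word_count_threshold=100,     # same threshold as the example above
        cache_mode=CacheMode.BYPASS,  # config-object equivalent of bypass_cache=True
    )
    async with AsyncWebCrawler(verbose=True) as crawler:
        results = await crawler.arun_many(
            urls=["https://example.com", "https://python.org"],
            config=config,
        )
        for result in results:
            print(result.url, "ok" if result.success else result.error_message)


asyncio.run(main())
```
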
/docs/examples/c4a_script/amazon_example/generated_product_schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Amazon Product Search Results",
3 | "baseSelector": "div[data-component-type='s-impression-counter']",
4 | "fields": [
5 | {
6 | "name": "title",
7 | "selector": "h2.a-size-base-plus.a-spacing-none.a-color-base.a-text-normal span",
8 | "type": "text"
9 | },
10 | {
11 | "name": "price",
12 | "selector": "span.a-price > span.a-offscreen",
13 | "type": "text"
14 | },
15 | {
16 | "name": "rating",
17 | "selector": "i.a-icon-star-small span.a-icon-alt",
18 | "type": "text"
19 | },
20 | {
21 | "name": "number_of_reviews",
22 | "selector": "a.a-link-normal.s-underline-text span.a-size-base",
23 | "type": "text"
24 | },
25 | {
26 | "name": "delivery_info",
27 | "selector": "div[data-cy='delivery-recipe'] span.a-color-base",
28 | "type": "text"
29 | },
30 | {
31 | "name": "product_url",
32 | "selector": "a.a-link-normal.s-no-outline",
33 | "type": "attribute",
34 | "attribute": "href"
35 | },
36 | {
37 | "name": "sponsored",
38 | "selector": "span.puis-label-popover-default span.a-color-secondary",
39 | "type": "text"
40 | },
41 | {
42 | "name": "small_business_badge",
43 | "selector": "span.a-size-base.a-color-base",
44 | "type": "text"
45 | }
46 | ]
47 | }
--------------------------------------------------------------------------------
/docs/examples/c4a_script/amazon_example/generated_search_script.js:
--------------------------------------------------------------------------------
1 | const searchBox = document.querySelector('#twotabsearchtextbox');
2 | const searchButton = document.querySelector('#nav-search-submit-button');
3 |
4 | if (searchBox && searchButton) {
5 | searchBox.focus();
6 | searchBox.value = '';
7 | searchBox.value = 'r2d2';
8 | searchButton.click();
9 | }
--------------------------------------------------------------------------------
/docs/examples/c4a_script/c4a_script_hello_world.py:
--------------------------------------------------------------------------------
1 | """
2 | C4A-Script Hello World
3 | A concise example showing how to use the C4A-Script compiler
4 | """
5 |
6 | from crawl4ai.script.c4a_compile import compile
7 |
8 | # Define your C4A-Script
9 | script = """
10 | GO https://example.com
11 | WAIT `#content` 5
12 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
13 | CLICK `button.submit`
14 | """
15 |
16 | # Compile the script
17 | result = compile(script)
18 |
19 | # Check if compilation was successful
20 | if result.success:
21 | # Success! Use the generated JavaScript
22 | print("✅ Compilation successful!")
23 | print(f"Generated {len(result.js_code)} JavaScript statements:\n")
24 |
25 | for i, js in enumerate(result.js_code, 1):
26 | print(f"{i}. {js}\n")
27 |
28 | # In real usage, you'd pass result.js_code to Crawl4AI:
29 | # config = CrawlerRunConfig(js_code=result.js_code)
30 |
31 | else:
32 | # Error! Handle the compilation error
33 | print("❌ Compilation failed!")
34 |
35 | # Get the first error (there might be multiple)
36 | error = result.first_error
37 |
38 | # Show error details
39 | print(f"Error at line {error.line}, column {error.column}")
40 | print(f"Message: {error.message}")
41 |
42 | # Show the problematic code
43 | print(f"\nCode: {error.source_line}")
44 | print(" " * (6 + error.column) + "^")
45 |
46 | # Show suggestions if available
47 | if error.suggestions:
48 | print("\n💡 How to fix:")
49 | for suggestion in error.suggestions:
50 | print(f" {suggestion.message}")
51 |
52 | # For debugging or logging, you can also get JSON
53 | # error_json = result.to_json()
--------------------------------------------------------------------------------
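The comment near the end of the example above points at the next step: handing `result.js_code` to the crawler. A minimal sketch of that wiring (the target URL and the C4A snippet are placeholders, and error handling is reduced to a single raise):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.script.c4a_compile import compile as c4a_compile


async def main():
    # Compile a small C4A snippet (selectors are illustrative)
    result = c4a_compile('IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`')
    if not result.success:
        raise RuntimeError(result.first_error.message)

    # Hand the generated JavaScript to the crawler, as the comment above suggests
    config = CrawlerRunConfig(js_code=result.js_code)
    async with AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(url="https://example.com", config=config)
        print("Crawl succeeded:", crawl_result.success)


asyncio.run(main())
```
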
/docs/examples/c4a_script/c4a_script_hello_world_error.py:
--------------------------------------------------------------------------------
1 | """
2 | C4A-Script Hello World - Error Example
3 | Shows how error handling works
4 | """
5 |
6 | from crawl4ai.script.c4a_compile import compile
7 |
8 | # Define a script with an error (missing THEN)
9 | script = """
10 | GO https://example.com
11 | WAIT `#content` 5
12 | IF (EXISTS `.cookie-banner`) CLICK `.accept`
13 | CLICK `button.submit`
14 | """
15 |
16 | # Compile the script
17 | result = compile(script)
18 |
19 | # Check if compilation was successful
20 | if result.success:
21 | # Success! Use the generated JavaScript
22 | print("✅ Compilation successful!")
23 | print(f"Generated {len(result.js_code)} JavaScript statements:\n")
24 |
25 | for i, js in enumerate(result.js_code, 1):
26 | print(f"{i}. {js}\n")
27 |
28 | # In real usage, you'd pass result.js_code to Crawl4AI:
29 | # config = CrawlerRunConfig(js_code=result.js_code)
30 |
31 | else:
32 | # Error! Handle the compilation error
33 | print("❌ Compilation failed!")
34 |
35 | # Get the first error (there might be multiple)
36 | error = result.first_error
37 |
38 | # Show error details
39 | print(f"Error at line {error.line}, column {error.column}")
40 | print(f"Message: {error.message}")
41 |
42 | # Show the problematic code
43 | print(f"\nCode: {error.source_line}")
44 | print(" " * (6 + error.column) + "^")
45 |
46 | # Show suggestions if available
47 | if error.suggestions:
48 | print("\n💡 How to fix:")
49 | for suggestion in error.suggestions:
50 | print(f" {suggestion.message}")
51 |
52 | # For debugging or logging, you can also get JSON
53 | # error_json = result.to_json()
--------------------------------------------------------------------------------
/docs/examples/c4a_script/generate_script_hello_world.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Hello World Example: LLM-Generated C4A-Script
4 |
5 | This example shows how to use the new generate_script() function to automatically
6 | create C4A-Script automation from natural language descriptions and HTML.
7 | """
8 |
9 | from crawl4ai.script.c4a_compile import C4ACompiler
10 |
11 | def main():
12 | print("🤖 C4A-Script Generation Hello World")
13 | print("=" * 50)
14 |
15 | # Example 1: Simple login form
16 | html = """
17 | <html>
18 | <body>
19 | <form id="login">
20 | <input id="email" type="email" placeholder="Email">
21 | <input id="password" type="password" placeholder="Password">
22 | <button id="submit">Login</button>
23 | </form>
24 | </body>
25 | </html>
26 | """
27 |
28 | goal = "Fill in email 'user@example.com', password 'secret123', and submit the form"
29 |
30 | print("📝 Goal:", goal)
31 | print("🌐 HTML: Simple login form")
32 | print()
33 |
34 | # Generate C4A-Script
35 | print("🔧 Generated C4A-Script:")
36 | print("-" * 30)
37 | c4a_script = C4ACompiler.generate_script(
38 | html=html,
39 | query=goal,
40 | mode="c4a"
41 | )
42 | print(c4a_script)
43 | print()
44 |
45 | # Generate JavaScript
46 | print("🔧 Generated JavaScript:")
47 | print("-" * 30)
48 | js_script = C4ACompiler.generate_script(
49 | html=html,
50 | query=goal,
51 | mode="js"
52 | )
53 | print(js_script)
54 | print()
55 |
56 | # Example 2: Simple button click
57 | html2 = """
58 | <html>
59 | <body>
60 | <div class="content">
61 | <h1>Welcome!</h1>
62 | <button id="start-btn" class="primary">Get Started</button>
63 | </div>
64 | </body>
65 | </html>
66 | """
67 |
68 | goal2 = "Click the 'Get Started' button"
69 |
70 | print("=" * 50)
71 | print("📝 Goal:", goal2)
72 | print("🌐 HTML: Simple button")
73 | print()
74 |
75 | print("🔧 Generated C4A-Script:")
76 | print("-" * 30)
77 | c4a_script2 = C4ACompiler.generate_script(
78 | html=html2,
79 | query=goal2,
80 | mode="c4a"
81 | )
82 | print(c4a_script2)
83 | print()
84 |
85 | print("✅ Done! The LLM automatically converted natural language goals")
86 | print(" into executable automation scripts.")
87 |
88 | if __name__ == "__main__":
89 | main()
--------------------------------------------------------------------------------
/docs/examples/c4a_script/github_search/generated_result_schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "GitHub Repository Cards",
3 | "baseSelector": "div.Box-sc-g0xbh4-0.iwUbcA",
4 | "fields": [
5 | {
6 | "name": "repository_name",
7 | "selector": "div.search-title a span",
8 | "type": "text",
9 | "transform": "strip"
10 | },
11 | {
12 | "name": "repository_owner",
13 | "selector": "div.search-title a span",
14 | "type": "text",
15 | "transform": "split",
16 | "pattern": "/"
17 | },
18 | {
19 | "name": "repository_url",
20 | "selector": "div.search-title a",
21 | "type": "attribute",
22 | "attribute": "href",
23 | "transform": "prepend",
24 | "pattern": "https://github.com"
25 | },
26 | {
27 | "name": "description",
28 | "selector": "div.dcdlju span",
29 | "type": "text"
30 | },
31 | {
32 | "name": "primary_language",
33 | "selector": "ul.bZkODq li span[aria-label]",
34 | "type": "text"
35 | },
36 | {
37 | "name": "star_count",
38 | "selector": "ul.bZkODq li a[href*='stargazers'] span",
39 | "type": "text",
40 | "transform": "strip"
41 | },
42 | {
43 | "name": "topics",
44 | "type": "list",
45 | "selector": "div.jgRnBg div a",
46 | "fields": [
47 | {
48 | "name": "topic_name",
49 | "selector": "a",
50 | "type": "text"
51 | }
52 | ]
53 | },
54 | {
55 | "name": "last_updated",
56 | "selector": "ul.bZkODq li span[title]",
57 | "type": "text"
58 | },
59 | {
60 | "name": "has_sponsor_button",
61 | "selector": "button[aria-label*='Sponsor']",
62 | "type": "text",
63 | "transform": "exists"
64 | }
65 | ]
66 | }
--------------------------------------------------------------------------------
/docs/examples/c4a_script/github_search/generated_search_script.js:
--------------------------------------------------------------------------------
1 | (async () => {
2 | const waitForElement = (selector, timeout = 10000) => new Promise((resolve, reject) => {
3 | const el = document.querySelector(selector);
4 | if (el) return resolve(el);
5 | const observer = new MutationObserver(() => {
6 | const el = document.querySelector(selector);
7 | if (el) {
8 | observer.disconnect();
9 | resolve(el);
10 | }
11 | });
12 | observer.observe(document.body, { childList: true, subtree: true });
13 | setTimeout(() => {
14 | observer.disconnect();
15 | reject(new Error(`Timeout waiting for ${selector}`));
16 | }, timeout);
17 | });
18 |
19 | try {
20 | const searchInput = await waitForElement('#adv_code_search input[type="text"]');
21 | searchInput.value = 'crawl4AI';
22 | searchInput.dispatchEvent(new Event('input', { bubbles: true }));
23 |
24 | const languageSelect = await waitForElement('#search_language');
25 | languageSelect.value = 'Python';
26 | languageSelect.dispatchEvent(new Event('change', { bubbles: true }));
27 |
28 | const starsInput = await waitForElement('#search_stars');
29 | starsInput.value = '>10000';
30 | starsInput.dispatchEvent(new Event('input', { bubbles: true }));
31 |
32 | const searchButton = await waitForElement('#adv_code_search button[type="submit"]');
33 | searchButton.click();
34 |
35 | await waitForElement('.codesearch-results, #search-results');
36 | } catch (e) {
37 | console.error('Search script failed:', e.message);
38 | }
39 | })();
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/add_to_cart.c4a:
--------------------------------------------------------------------------------
1 | GO https://store.example.com/product/laptop
2 | WAIT `.product-details` 8
3 | CLICK `button.add-to-cart`
4 | WAIT `.cart-notification` 3
5 | CLICK `.cart-icon`
6 | WAIT `.checkout-btn` 5
7 | CLICK `.checkout-btn`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/advanced_control_flow.c4a:
--------------------------------------------------------------------------------
1 | # Advanced control flow with IF, EXISTS, and REPEAT
2 |
3 | # Define reusable procedures
4 | PROC handle_cookie_banner
5 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`
6 | IF (EXISTS `.privacy-notice`) THEN CLICK `.dismiss-privacy`
7 | ENDPROC
8 |
9 | PROC scroll_to_load
10 | SCROLL DOWN 500
11 | WAIT 0.5
12 | ENDPROC
13 |
14 | PROC try_login
15 | CLICK `#email`
16 | TYPE "user@example.com"
17 | CLICK `#password`
18 | TYPE "secure123"
19 | CLICK `button[type="submit"]`
20 | WAIT 2
21 | ENDPROC
22 |
23 | # Main script
24 | GO https://example.com
25 | WAIT 2
26 |
27 | # Handle popups
28 | handle_cookie_banner
29 |
30 | # Conditional navigation based on login state
31 | IF (EXISTS `.user-menu`) THEN CLICK `.dashboard-link` ELSE try_login
32 |
33 | # Repeat scrolling based on content count
34 | REPEAT (scroll_to_load, 5)
35 |
36 | # Load more content while button exists
37 | REPEAT (CLICK `.load-more`, `document.querySelector('.load-more') && !document.querySelector('.no-more-content')`)
38 |
39 | # Process items conditionally
40 | IF (`document.querySelectorAll('.item').length > 10`) THEN EVAL `console.log('Found ' + document.querySelectorAll('.item').length + ' items')`
41 |
42 | # Complex condition with viewport check
43 | IF (`window.innerWidth < 768 && document.querySelector('.mobile-menu')`) THEN CLICK `.mobile-menu-toggle`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/conditional_login.c4a:
--------------------------------------------------------------------------------
1 | GO https://myapp.com
2 | WAIT 2
3 | IF (EXISTS `.user-avatar`) THEN CLICK `.logout` ELSE CLICK `.login`
4 | WAIT `#auth-form` 5
5 | IF (EXISTS `#auth-form`) THEN TYPE "user@example.com"
6 | IF (EXISTS `#auth-form`) THEN PRESS Tab
7 | IF (EXISTS `#auth-form`) THEN TYPE "password123"
8 | IF (EXISTS `#auth-form`) THEN CLICK `button[type="submit"]`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/data_extraction.c4a:
--------------------------------------------------------------------------------
1 | # Data extraction example
2 | # Scrapes product information from an e-commerce site
3 |
4 | # Navigate to products page
5 | GO https://shop.example.com/products
6 | WAIT `.product-list` 10
7 |
8 | # Scroll to load lazy-loaded content
9 | SCROLL DOWN 500
10 | WAIT 1
11 | SCROLL DOWN 500
12 | WAIT 1
13 | SCROLL DOWN 500
14 | WAIT 2
15 |
16 | # Extract product data
17 | EVAL `
18 | // Extract all product information
19 | const products = Array.from(document.querySelectorAll('.product-card')).map((card, index) => {
20 | return {
21 | id: index + 1,
22 | name: card.querySelector('.product-title')?.textContent?.trim() || 'N/A',
23 | price: card.querySelector('.price')?.textContent?.trim() || 'N/A',
24 | rating: card.querySelector('.rating')?.textContent?.trim() || 'N/A',
25 | availability: card.querySelector('.in-stock') ? 'In Stock' : 'Out of Stock',
26 | image: card.querySelector('img')?.src || 'N/A'
27 | };
28 | });
29 |
30 | // Log results
31 | console.log('=== Product Extraction Results ===');
32 | console.log('Total products found:', products.length);
33 | console.log(JSON.stringify(products, null, 2));
34 |
35 | // Save to localStorage for retrieval
36 | localStorage.setItem('scraped_products', JSON.stringify(products));
37 | `
38 |
39 | # Optional: Click on first product for details
40 | CLICK `.product-card:first-child`
41 | WAIT `.product-details` 5
42 |
43 | # Extract detailed information
44 | EVAL `
45 | const details = {
46 | description: document.querySelector('.product-description')?.textContent?.trim(),
47 | specifications: Array.from(document.querySelectorAll('.spec-item')).map(spec => ({
48 | label: spec.querySelector('.spec-label')?.textContent,
49 | value: spec.querySelector('.spec-value')?.textContent
50 | })),
51 | reviews: document.querySelector('.review-count')?.textContent
52 | };
53 |
54 | console.log('=== Product Details ===');
55 | console.log(JSON.stringify(details, null, 2));
56 | `
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/fill_contact.c4a:
--------------------------------------------------------------------------------
1 | GO https://company.com/contact
2 | WAIT `form#contact` 10
3 | TYPE "John Smith"
4 | PRESS Tab
5 | TYPE "john@email.com"
6 | PRESS Tab
7 | TYPE "Need help with my order"
8 | CLICK `button[type="submit"]`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/load_more_content.c4a:
--------------------------------------------------------------------------------
1 | GO https://news.example.com
2 | WAIT `.article-list` 5
3 | REPEAT (SCROLL DOWN 500, 3)
4 | WAIT 1
5 | REPEAT (CLICK `.load-more`, `document.querySelector('.load-more') !== null`)
6 | WAIT 2
7 | IF (`document.querySelectorAll('.article').length > 20`) THEN EVAL `console.log('Loaded enough articles')`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/login_flow.c4a:
--------------------------------------------------------------------------------
1 | # Login flow with error handling
2 | # Demonstrates procedures, variables, and conditional checks
3 |
4 | # Define login procedure
5 | PROC perform_login
6 | CLICK `input#email`
7 | TYPE $email
8 | CLICK `input#password`
9 | TYPE $password
10 | CLICK `button.login-submit`
11 | ENDPROC
12 |
13 | # Set credentials
14 | SET email = "user@example.com"
15 | SET password = "securePassword123"
16 |
17 | # Navigate to login page
18 | GO https://app.example.com/login
19 | WAIT `.login-container` 15
20 |
21 | # Attempt login
22 | perform_login
23 |
24 | # Wait for page to load
25 | WAIT 3
26 |
27 | # Check if login was successful
28 | EVAL `
29 | if (document.querySelector('.dashboard')) {
30 | console.log('Login successful - on dashboard');
31 | } else if (document.querySelector('.error-message')) {
32 | console.log('Login failed:', document.querySelector('.error-message').textContent);
33 | } else {
34 | console.log('Unknown state after login');
35 | }
36 | `
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/navigate_tabs.c4a:
--------------------------------------------------------------------------------
1 | GO https://app.example.com
2 | WAIT `.nav-menu` 8
3 | CLICK `a[href="/products"]`
4 | WAIT 2
5 | CLICK `a[href="/about"]`
6 | WAIT 2
7 | BACK
8 | WAIT 1
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/quick_login.c4a:
--------------------------------------------------------------------------------
1 | GO https://myapp.com/login
2 | WAIT `input#email` 5
3 | CLICK `input#email`
4 | TYPE "user@example.com"
5 | PRESS Tab
6 | TYPE "password123"
7 | CLICK `button.login-btn`
8 | WAIT `.dashboard` 10
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/responsive_actions.c4a:
--------------------------------------------------------------------------------
1 | GO https://responsive.site.com
2 | WAIT 2
3 | IF (`window.innerWidth < 768`) THEN CLICK `.mobile-menu`
4 | IF (`window.innerWidth < 768`) THEN WAIT `.mobile-nav` 3
5 | IF (`window.innerWidth >= 768`) THEN CLICK `.desktop-menu li:nth-child(2)`
6 | REPEAT (CLICK `.next-slide`, 5)
7 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/scroll_and_click.c4a:
--------------------------------------------------------------------------------
1 | GO https://news.site.com
2 | WAIT `.article-list` 10
3 | SCROLL DOWN 500
4 | WAIT 1
5 | SCROLL DOWN 500
6 | WAIT 1
7 | CLICK `.article:nth-child(5)`
8 | WAIT `.article-content` 5
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/search_product.c4a:
--------------------------------------------------------------------------------
1 | GO https://shop.example.com
2 | WAIT `.search-bar` 10
3 | CLICK `.search-bar`
4 | TYPE "wireless headphones"
5 | PRESS Enter
6 | WAIT `.results` 5
7 | CLICK `.product-card:first-child`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/simple_form.c4a:
--------------------------------------------------------------------------------
1 | # Simple form submission example
2 | # This script fills out a contact form and submits it
3 |
4 | GO https://example.com/contact
5 | WAIT `form#contact-form` 10
6 |
7 | # Fill out the form fields
8 | CLICK `input[name="name"]`
9 | TYPE "Alice Smith"
10 | PRESS Tab
11 | TYPE "alice@example.com"
12 | PRESS Tab
13 | TYPE "I'd like to learn more about your services"
14 |
15 | # Submit the form
16 | CLICK `button[type="submit"]`
17 |
18 | # Wait for success message
19 | WAIT "Thank you for your message" 5
--------------------------------------------------------------------------------
/docs/examples/c4a_script/script_samples/smart_form_fill.c4a:
--------------------------------------------------------------------------------
1 | PROC fill_field
2 | TYPE "test@example.com"
3 | PRESS Tab
4 | ENDPROC
5 |
6 | GO https://forms.example.com
7 | WAIT `form` 5
8 | IF (EXISTS `input[type="email"]`) THEN CLICK `input[type="email"]`
9 | IF (EXISTS `input[type="email"]`) THEN fill_field
10 | REPEAT (PRESS Tab, `document.activeElement.type !== 'submit'`)
11 | CLICK `button[type="submit"]`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Bold.woff2
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Italic.woff2
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/c4a_script/tutorial/assets/DankMono-Regular.woff2
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/blockly-demo.c4a:
--------------------------------------------------------------------------------
1 | # Demo: Login Flow with Blockly
2 | # This script can be created visually using Blockly blocks
3 |
4 | GO https://example.com/login
5 | WAIT `#login-form` 5
6 |
7 | # Check if already logged in
8 | IF (EXISTS `.user-avatar`) THEN GO https://example.com/dashboard
9 |
10 | # Fill login form
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | CLICK `#password`
14 | TYPE "password123"
15 |
16 | # Submit form
17 | CLICK `button[type="submit"]`
18 | WAIT `.dashboard` 10
19 |
20 | # Success message
21 | EVAL `console.log('Login successful!')`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/requirements.txt:
--------------------------------------------------------------------------------
1 | flask>=2.3.0
2 | flask-cors>=4.0.0
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/01-basic-interaction.c4a:
--------------------------------------------------------------------------------
1 | # Basic Page Interaction
2 | # This script demonstrates basic C4A commands
3 |
4 | # Navigate to the playground
5 | GO http://127.0.0.1:8080/playground/
6 |
7 | # Wait for page to load
8 | WAIT `body` 2
9 |
10 | # Handle cookie banner if present
11 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
12 |
13 | # Close newsletter popup if it appears
14 | WAIT 3
15 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
16 |
17 | # Click the start tutorial button
18 | CLICK `#start-tutorial`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/02-login-flow.c4a:
--------------------------------------------------------------------------------
1 | # Complete Login Flow
2 | # Demonstrates form interaction and authentication
3 |
4 | # Click login button
5 | CLICK `#login-btn`
6 |
7 | # Wait for login modal
8 | WAIT `.login-form` 3
9 |
10 | # Fill in credentials
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 |
14 | CLICK `#password`
15 | TYPE "demo123"
16 |
17 | # Check remember me
18 | IF (EXISTS `#remember-me`) THEN CLICK `#remember-me`
19 |
20 | # Submit form
21 | CLICK `button[type="submit"]`
22 |
23 | # Wait for success
24 | WAIT `.welcome-message` 5
25 |
26 | # Verify login succeeded
27 | IF (EXISTS `.user-info`) THEN EVAL `console.log('✅ Login successful!')`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/03-infinite-scroll.c4a:
--------------------------------------------------------------------------------
1 | # Infinite Scroll Product Loading
2 | # Load all products using scroll automation
3 |
4 | # Navigate to catalog
5 | CLICK `#catalog-link`
6 | WAIT `.product-grid` 3
7 |
8 | # Switch to infinite scroll mode
9 | CLICK `#infinite-scroll-btn`
10 |
11 | # Define scroll procedure
12 | PROC load_more_products
13 | # Get current product count
14 | EVAL `window.initialCount = document.querySelectorAll('.product-card').length`
15 |
16 | # Scroll down
17 | SCROLL DOWN 1000
18 | WAIT 2
19 |
20 | # Check if more products loaded
21 | EVAL `
22 | const newCount = document.querySelectorAll('.product-card').length;
23 | console.log('Products loaded: ' + newCount);
24 | window.moreLoaded = newCount > window.initialCount;
25 | `
26 | ENDPROC
27 |
28 | # Load products until no more
29 | REPEAT (load_more_products, `window.moreLoaded !== false`)
30 |
31 | # Final count
32 | EVAL `console.log('✅ Total products: ' + document.querySelectorAll('.product-card').length)`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/04-multi-step-form.c4a:
--------------------------------------------------------------------------------
1 | # Multi-step Form Wizard
2 | # Complete a complex form with multiple steps
3 |
4 | # Navigate to forms section
5 | CLICK `a[href="#forms"]`
6 | WAIT `#survey-form` 2
7 |
8 | # Step 1: Basic Information
9 | CLICK `#full-name`
10 | TYPE "John Doe"
11 |
12 | CLICK `#survey-email`
13 | TYPE "john.doe@example.com"
14 |
15 | # Go to next step
16 | CLICK `.next-step`
17 | WAIT 1
18 |
19 | # Step 2: Select Interests
20 | # Select multiple options
21 | CLICK `#interests`
22 | CLICK `option[value="tech"]`
23 | CLICK `option[value="music"]`
24 | CLICK `option[value="travel"]`
25 |
26 | # Continue to final step
27 | CLICK `.next-step`
28 | WAIT 1
29 |
30 | # Step 3: Review and Submit
31 | # Verify we're on the last step
32 | IF (EXISTS `#submit-survey`) THEN EVAL `console.log('📋 On final step')`
33 |
34 | # Submit the form
35 | CLICK `#submit-survey`
36 |
37 | # Wait for success message
38 | WAIT `.success-message` 5
39 |
40 | # Verify submission
41 | IF (EXISTS `.success-message`) THEN EVAL `console.log('✅ Survey submitted successfully!')`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/scripts/05-complex-workflow.c4a:
--------------------------------------------------------------------------------
1 | # Complete E-commerce Workflow
2 | # Login, browse products, and interact with various elements
3 |
4 | # Define reusable procedures
5 | PROC handle_popups
6 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
7 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
8 | ENDPROC
9 |
10 | PROC login_user
11 | CLICK `#login-btn`
12 | WAIT `.login-form` 2
13 | CLICK `#email`
14 | TYPE "demo@example.com"
15 | CLICK `#password`
16 | TYPE "demo123"
17 | CLICK `button[type="submit"]`
18 | WAIT `.welcome-message` 5
19 | ENDPROC
20 |
21 | PROC browse_products
22 | # Go to catalog
23 | CLICK `#catalog-link`
24 | WAIT `.product-grid` 3
25 |
26 | # Apply filters
27 | CLICK `.collapsible`
28 | WAIT 0.5
29 | CLICK `input[type="checkbox"]`
30 |
31 | # Load some products
32 | SCROLL DOWN 500
33 | WAIT 1
34 | SCROLL DOWN 500
35 | WAIT 1
36 | ENDPROC
37 |
38 | # Main workflow
39 | GO http://127.0.0.1:8080/playground/
40 | WAIT `body` 2
41 |
42 | # Handle initial popups
43 | handle_popups
44 |
45 | # Login if not already
46 | IF (NOT EXISTS `.user-info`) THEN login_user
47 |
48 | # Browse products
49 | browse_products
50 |
51 | # Navigate to tabs demo
52 | CLICK `a[href="#tabs"]`
53 | WAIT `.tabs-container` 2
54 |
55 | # Interact with tabs
56 | CLICK `button[data-tab="reviews"]`
57 | WAIT 1
58 |
59 | # Load comments
60 | IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
61 | WAIT `.comments-section` 2
62 |
63 | # Check specifications
64 | CLICK `button[data-tab="specs"]`
65 | WAIT 1
66 |
67 | # Final navigation to data tables
68 | CLICK `a[href="#data"]`
69 | WAIT `.data-table` 2
70 |
71 | # Search in table
72 | CLICK `.search-input`
73 | TYPE "User"
74 |
75 | # Load more rows
76 | CLICK `.load-more-rows`
77 | WAIT 1
78 |
79 | # Export data
80 | CLICK `#export-btn`
81 |
82 | EVAL `console.log('✅ Workflow completed successfully!')`
--------------------------------------------------------------------------------
/docs/examples/c4a_script/tutorial/test_blockly.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
6 | <title>Blockly Test</title>
7 | <style>
8 | body {
9 | margin: 0;
10 | padding: 20px;
11 | background: #0e0e10;
12 | color: #e0e0e0;
13 | font-family: monospace;
14 | }
15 | #blocklyDiv {
16 | height: 600px;
17 | width: 100%;
18 | border: 1px solid #2a2a2c;
19 | }
20 | #output {
21 | margin-top: 20px;
22 | padding: 15px;
23 | background: #1a1a1b;
24 | border: 1px solid #2a2a2c;
25 | white-space: pre-wrap;
26 | }
27 | </style>
28 | </head>
29 | <body>
30 | <h1>C4A-Script Blockly Test</h1>
31 | <div id="blocklyDiv"></div>
32 | <div id="output">
33 | <h3>Generated C4A-Script:</h3>
34 | <pre id="code-output"></pre>
35 | </div>
36 |
37 | <script src="https://unpkg.com/blockly/blockly.min.js"></script>
38 | <script src="assets/c4a-blocks.js"></script>
39 | <script>
40 | // Simple test
41 | const workspace = Blockly.inject('blocklyDiv', {
42 | toolbox: `
43 | <xml>
44 | <category name="Test" colour="#1E88E5">
45 | <block type="c4a_go"></block>
46 | <block type="c4a_wait_time"></block>
47 | <block type="c4a_click"></block>
48 | </category>
49 | </xml>
50 | `,
51 | theme: Blockly.Theme.defineTheme('dark', {
52 | 'base': Blockly.Themes.Classic,
53 | 'componentStyles': {
54 | 'workspaceBackgroundColour': '#0e0e10',
55 | 'toolboxBackgroundColour': '#1a1a1b',
56 | 'toolboxForegroundColour': '#e0e0e0',
57 | 'flyoutBackgroundColour': '#1a1a1b',
58 | 'flyoutForegroundColour': '#e0e0e0',
59 | }
60 | })
61 | });
62 |
63 | workspace.addChangeListener((event) => {
64 | const code = Blockly.JavaScript.workspaceToCode(workspace);
65 | document.getElementById('code-output').textContent = code;
66 | });
67 | </script>
68 | </body>
69 | </html>
--------------------------------------------------------------------------------
/docs/examples/chainlit.md:
--------------------------------------------------------------------------------
1 | # Welcome to Crawl4AI! 🚀🤖
2 |
3 | Hi there, Developer! 👋 This is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages is used as context for the response.
--------------------------------------------------------------------------------
/docs/examples/cli/browser.yml:
--------------------------------------------------------------------------------
1 | browser_type: "chromium"
2 | headless: true
3 | viewport_width: 1280
4 | viewport_height: 800
5 | user_agent_mode: "random"
6 | verbose: true
7 | text_mode: false
8 | light_mode: false
9 | ignore_https_errors: true
10 | java_script_enabled: true
11 | extra_args:
12 | - "--disable-gpu"
13 | - "--no-sandbox"
--------------------------------------------------------------------------------
/docs/examples/cli/crawler.yml:
--------------------------------------------------------------------------------
1 | cache_mode: "bypass"
2 | wait_until: "networkidle"
3 | page_timeout: 30000
4 | delay_before_return_html: 0.5
5 | word_count_threshold: 100
6 | scan_full_page: true
7 | scroll_delay: 0.3
8 | process_iframes: false
9 | remove_overlay_elements: true
10 | magic: true
11 | verbose: true
12 | exclude_external_links: true
13 | exclude_social_media_links: true
--------------------------------------------------------------------------------
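For reference, the two CLI config files above map onto constructor arguments of `BrowserConfig` and `CrawlerRunConfig`. A rough programmatic equivalent, written as a sketch rather than the CLI's actual loading code:

```python
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig

# Programmatic counterpart of browser.yml above
browser_config = BrowserConfig(
    browser_type="chromium",
    headless=True,
    viewport_width=1280,
    viewport_height=800,
    user_agent_mode="random",
    ignore_https_errors=True,
    java_script_enabled=True,
    extra_args=["--disable-gpu", "--no-sandbox"],
    verbose=True,
)

# Programmatic counterpart of crawler.yml above
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    wait_until="networkidle",
    page_timeout=30000,
    delay_before_return_html=0.5,
    word_count_threshold=100,
    scan_full_page=True,
    scroll_delay=0.3,
    process_iframes=False,
    remove_overlay_elements=True,
    magic=True,
    exclude_external_links=True,
    exclude_social_media_links=True,
    verbose=True,
)
```
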
/docs/examples/cli/css_schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ArticleExtractor",
3 | "baseSelector": ".cards[data-tax=news] .card__data",
4 | "fields": [
5 | {
6 | "name": "title",
7 | "selector": "h4.card__title",
8 | "type": "text"
9 | },
10 | {
11 | "name": "link",
12 | "selector": "h4.card__title a",
13 | "type": "attribute",
14 | "attribute": "href"
15 | },
16 | {
17 | "name": "details",
18 | "selector": ".card__details",
19 | "type": "text"
20 | },
21 | {
22 | "name": "topics",
23 | "selector": ".card__topics.topics",
24 | "type": "text"
25 | }
26 | ]
27 | }
--------------------------------------------------------------------------------
/docs/examples/cli/extract.yml:
--------------------------------------------------------------------------------
1 | type: "llm"
2 | provider: "openai/gpt-4o-mini"
3 | api_token: "env:OPENAI_API_KEY"
4 | instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
5 | params:
6 | chunk_token_threshold: 4096
7 | overlap_rate: 0.1
8 | word_token_rate: 0.75
9 | temperature: 0.3
10 | max_tokens: 1000
11 | verbose: true
--------------------------------------------------------------------------------
/docs/examples/cli/extract_css.yml:
--------------------------------------------------------------------------------
1 | type: "json-css"
2 | params:
3 | verbose: true
--------------------------------------------------------------------------------
/docs/examples/cli/llm_schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "NewsArticle",
3 | "type": "object",
4 | "properties": {
5 | "title": {
6 | "type": "string",
7 | "description": "The title/headline of the news article"
8 | },
9 | "link": {
10 | "type": "string",
11 | "description": "The URL or link to the full article"
12 | },
13 | "details": {
14 | "type": "string",
15 | "description": "Brief summary or details about the article content"
16 | },
17 | "topics": {
18 | "type": "array",
19 | "items": {
20 | "type": "string"
21 | },
22 | "description": "List of topics or categories associated with the article"
23 | }
24 | },
25 | "required": ["title", "details"]
26 | }
--------------------------------------------------------------------------------
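The same schema can be produced from a Pydantic model, which is how the LLM extraction examples later in this repo build theirs. A sketch of an equivalent model (field ordering and schema metadata may differ slightly from the hand-written JSON above):

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class NewsArticle(BaseModel):
    """Pydantic counterpart of llm_schema.json above; title and details are required."""

    title: str = Field(..., description="The title/headline of the news article")
    link: Optional[str] = Field(None, description="The URL or link to the full article")
    details: str = Field(..., description="Brief summary or details about the article content")
    topics: Optional[List[str]] = Field(
        None, description="List of topics or categories associated with the article"
    )


# Produces a JSON Schema equivalent to the hand-written file above
print(NewsArticle.model_json_schema())
```
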
/docs/examples/docker_python_sdk.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai.docker_client import Crawl4aiDockerClient
3 | from crawl4ai import (
4 | BrowserConfig,
5 | CrawlerRunConfig
6 | )
7 |
8 | async def main():
9 | async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
10 | # If jwt is enabled, authenticate first
11 | # await client.authenticate("test@example.com")
12 |
13 | # Non-streaming crawl
14 | results = await client.crawl(
15 | ["https://example.com", "https://python.org"],
16 | browser_config=BrowserConfig(headless=True),
17 | crawler_config=CrawlerRunConfig()
18 | )
19 | print(f"Non-streaming results: {results}")
20 |
21 | # Streaming crawl
22 | crawler_config = CrawlerRunConfig(stream=True)
23 | async for result in await client.crawl(
24 | ["https://example.com", "https://python.org"],
25 | browser_config=BrowserConfig(headless=True),
26 | crawler_config=crawler_config
27 | ):
28 | print(f"Streamed result: {result}")
29 |
30 | # Get schema
31 | schema = await client.get_schema()
32 | print(f"Schema: {schema}")
33 |
34 | if __name__ == "__main__":
35 | asyncio.run(main())
--------------------------------------------------------------------------------
/docs/examples/hello_world.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import (
3 | AsyncWebCrawler,
4 | BrowserConfig,
5 | CrawlerRunConfig,
6 | DefaultMarkdownGenerator,
7 | PruningContentFilter,
8 | CrawlResult
9 | )
10 |
11 |
12 | async def main():
13 | browser_config = BrowserConfig(
14 | headless=False,
15 | verbose=True,
16 | )
17 | async with AsyncWebCrawler(config=browser_config) as crawler:
18 | crawler_config = CrawlerRunConfig(
19 | markdown_generator=DefaultMarkdownGenerator(
20 | content_filter=PruningContentFilter()
21 | ),
22 | )
23 | result: CrawlResult = await crawler.arun(
24 | url="https://www.helloworld.org", config=crawler_config
25 | )
26 | print(result.markdown.raw_markdown[:500])
27 |
28 | if __name__ == "__main__":
29 | asyncio.run(main())
30 |
--------------------------------------------------------------------------------
/docs/examples/hello_world_undetected.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import (
3 | AsyncWebCrawler,
4 | BrowserConfig,
5 | CrawlerRunConfig,
6 | DefaultMarkdownGenerator,
7 | PruningContentFilter,
8 | CrawlResult,
9 | UndetectedAdapter
10 | )
11 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
12 |
13 |
14 | async def main():
15 | # Create browser config
16 | browser_config = BrowserConfig(
17 | headless=False,
18 | verbose=True,
19 | )
20 |
21 | # Create the undetected adapter
22 | undetected_adapter = UndetectedAdapter()
23 |
24 | # Create the crawler strategy with the undetected adapter
25 | crawler_strategy = AsyncPlaywrightCrawlerStrategy(
26 | browser_config=browser_config,
27 | browser_adapter=undetected_adapter
28 | )
29 |
30 | # Create the crawler with our custom strategy
31 | async with AsyncWebCrawler(
32 | crawler_strategy=crawler_strategy,
33 | config=browser_config
34 | ) as crawler:
35 | # Configure the crawl
36 | crawler_config = CrawlerRunConfig(
37 | markdown_generator=DefaultMarkdownGenerator(
38 | content_filter=PruningContentFilter()
39 | ),
40 | capture_console_messages=True, # Enable console capture to test adapter
41 | )
42 |
43 | # Test on a site that typically detects bots
44 | print("Testing undetected adapter...")
45 | result: CrawlResult = await crawler.arun(
46 | url="https://www.helloworld.org",
47 | config=crawler_config
48 | )
49 |
50 | print(f"Status: {result.status_code}")
51 | print(f"Success: {result.success}")
52 | print(f"Console messages captured: {len(result.console_messages or [])}")
53 | print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
54 |
55 |
56 | if __name__ == "__main__":
57 | asyncio.run(main())
--------------------------------------------------------------------------------
/docs/examples/language_support_example.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
3 |
4 |
5 | async def main():
6 | # Example 1: Setting language when creating the crawler
7 | crawler1 = AsyncWebCrawler(
8 | crawler_strategy=AsyncPlaywrightCrawlerStrategy(
9 | headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
10 | )
11 | )
12 | result1 = await crawler1.arun("https://www.example.com")
13 | print(
14 | "Example 1 result:", result1.extracted_content[:100]
15 | ) # Print first 100 characters
16 |
17 | # Example 2: Setting language before crawling
18 | crawler2 = AsyncWebCrawler()
19 | crawler2.crawler_strategy.headers[
20 | "Accept-Language"
21 | ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
22 | result2 = await crawler2.arun("https://www.example.com")
23 | print("Example 2 result:", result2.extracted_content[:100])
24 |
25 | # Example 3: Setting language when calling arun method
26 | crawler3 = AsyncWebCrawler()
27 | result3 = await crawler3.arun(
28 | "https://www.example.com",
29 | headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
30 | )
31 | print("Example 3 result:", result3.extracted_content[:100])
32 |
33 | # Example 4: Crawling multiple pages with different languages
34 | urls = [
35 | ("https://www.example.com", "fr-FR,fr;q=0.9"),
36 | ("https://www.example.org", "es-ES,es;q=0.9"),
37 | ("https://www.example.net", "de-DE,de;q=0.9"),
38 | ]
39 |
40 | crawler4 = AsyncWebCrawler()
41 | results = await asyncio.gather(
42 | *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls]
43 | )
44 |
45 | for url, result in zip([u for u, _ in urls], results):
46 | print(f"Result for {url}:", result.extracted_content[:100])
47 |
48 |
49 | if __name__ == "__main__":
50 | asyncio.run(main())
51 |
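52 | # Hedged alternative (not wired into main() above): newer run configs can set the
53 | # browser locale directly, which also drives the Accept-Language header; see
54 | # use_geo_location.py for the same parameter in context. Names assume a current
55 | # crawl4ai version, so adjust to what you have installed.
56 | async def locale_config_example():
57 |     from crawl4ai import CrawlerRunConfig
58 |     async with AsyncWebCrawler() as crawler:
59 |         config = CrawlerRunConfig(locale="fr-FR")
60 |         result = await crawler.arun("https://www.example.com", config=config)
61 |         print("Locale config result:", result.markdown.raw_markdown[:100])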
--------------------------------------------------------------------------------
/docs/examples/llm_extraction_openai_pricing.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from pydantic import BaseModel, Field
3 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
4 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
5 | from typing import Dict
6 | import os
7 |
8 |
9 | class OpenAIModelFee(BaseModel):
10 | model_name: str = Field(..., description="Name of the OpenAI model.")
11 | input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
12 | output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
13 |
14 |
15 | async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
16 | print(f"\n--- Extracting Structured Data with {provider} ---")
17 |
18 | if api_token is None and provider != "ollama":
19 | print(f"API token is required for {provider}. Skipping this example.")
20 | return
21 |
22 | browser_config = BrowserConfig(headless=True)
23 |
24 | extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
25 | if extra_headers:
26 | extra_args["extra_headers"] = extra_headers
27 |
28 | crawler_config = CrawlerRunConfig(
29 | cache_mode=CacheMode.BYPASS,
30 | word_count_threshold=1,
31 | page_timeout=80000,
32 | extraction_strategy=LLMExtractionStrategy(
33 | llm_config=LLMConfig(provider=provider, api_token=api_token),
34 | schema=OpenAIModelFee.model_json_schema(),
35 | extraction_type="schema",
36 | instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
37 | Do not miss any models in the entire content.""",
38 | extra_args=extra_args,
39 | ),
40 | )
41 |
42 | async with AsyncWebCrawler(config=browser_config) as crawler:
43 | result = await crawler.arun(
44 | url="https://openai.com/api/pricing/",
45 | config=crawler_config
46 | )
47 | print(result.extracted_content)
48 |
49 |
50 | if __name__ == "__main__":
51 | asyncio.run(
52 | extract_structured_data_using_llm(
53 | provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
54 | )
55 | )
56 |
--------------------------------------------------------------------------------
/docs/examples/markdown/content_source_short_example.py:
--------------------------------------------------------------------------------
1 | """
2 | Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
3 | """
4 |
5 | import asyncio
6 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
7 |
8 | async def demo_markdown_source_config():
9 | print("\n=== Demo: Configuring Markdown Source ===")
10 |
11 | # Example 1: Generate markdown from cleaned HTML (default behavior)
12 | cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
13 | config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
14 |
15 | async with AsyncWebCrawler() as crawler:
16 | result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
17 | print("Markdown from Cleaned HTML (default):")
18 | print(f" Length: {len(result_cleaned.markdown.raw_markdown)}")
19 | print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...")
20 |
21 | # Example 2: Generate markdown directly from raw HTML
22 | raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
23 | config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
24 |
25 | async with AsyncWebCrawler() as crawler:
26 | result_raw = await crawler.arun(url="https://example.com", config=config_raw)
27 | print("\nMarkdown from Raw HTML:")
28 | print(f" Length: {len(result_raw.markdown.raw_markdown)}")
29 | print(f" Start: {result_raw.markdown.raw_markdown[:100]}...")
30 |
31 | # Example 3: Generate markdown from preprocessed 'fit' HTML
32 | fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
33 | config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
34 |
35 | async with AsyncWebCrawler() as crawler:
36 | result_fit = await crawler.arun(url="https://example.com", config=config_fit)
37 | print("\nMarkdown from Fit HTML:")
38 | print(f" Length: {len(result_fit.markdown.raw_markdown)}")
39 | print(f" Start: {result_fit.markdown.raw_markdown[:100]}...")
40 |
41 | if __name__ == "__main__":
42 | asyncio.run(demo_markdown_source_config())
--------------------------------------------------------------------------------
/docs/examples/rest_call.py:
--------------------------------------------------------------------------------
1 | import requests, base64, os
2 |
3 | data = {
4 | "urls": ["https://www.nbcnews.com/business"],
5 | "screenshot": True,
6 | }
7 |
8 | response = requests.post("https://crawl4ai.com/crawl", json=data)
9 | result = response.json()["results"][0]
10 | print(result.keys())
11 | # dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
12 | # 'links', 'screenshot', 'markdown', 'extracted_content',
13 | # 'metadata', 'error_message'])
14 | with open("screenshot.png", "wb") as f:
15 | f.write(base64.b64decode(result["screenshot"]))
16 |
17 | # Example of filtering the content using CSS selectors
18 | data = {
19 | "urls": ["https://www.nbcnews.com/business"],
20 | "css_selector": "article",
21 | "screenshot": True,
22 | }
23 |
24 | # Example of executing a JS script on the page before extracting the content
25 | data = {
26 | "urls": ["https://www.nbcnews.com/business"],
27 | "screenshot": True,
28 | "js": [
29 | """
30 | const loadMoreButton = Array.from(document.querySelectorAll('button')).
31 | find(button => button.textContent.includes('Load More'));
32 | loadMoreButton && loadMoreButton.click();
33 | """
34 | ],
35 | }
36 |
37 | # Example of using a custom extraction strategy
38 | data = {
39 | "urls": ["https://www.nbcnews.com/business"],
40 | "extraction_strategy": "CosineStrategy",
41 | "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
42 | }
43 |
44 | # Example of using LLM to extract content
45 | data = {
46 | "urls": ["https://www.nbcnews.com/business"],
47 | "extraction_strategy": "LLMExtractionStrategy",
48 | "extraction_strategy_args": {
49 | "provider": "groq/llama3-8b-8192",
50 | "api_token": os.environ.get("GROQ_API_KEY"),
51 | "instruction": """I am interested in only financial news,
52 | and translate them in French.""",
53 | },
54 | }
55 |
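56 | # Minimal sketch of actually sending one of the payload variants above (here the
57 | # LLM one). It assumes the same /crawl endpoint and response shape as the first
58 | # request in this file; uncomment to run (requires GROQ_API_KEY to be set).
59 | # response = requests.post("https://crawl4ai.com/crawl", json=data)
60 | # result = response.json()["results"][0]
61 | # print(result["extracted_content"])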
--------------------------------------------------------------------------------
/docs/examples/session_id_example.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import (
3 | AsyncWebCrawler,
4 | BrowserConfig,
5 | CrawlerRunConfig,
6 | DefaultMarkdownGenerator,
7 | PruningContentFilter,
8 | CrawlResult
9 | )
10 |
11 |
12 |
13 | async def main():
14 | browser_config = BrowserConfig(
15 | headless=False,
16 | verbose=True,
17 | )
18 | async with AsyncWebCrawler(config=browser_config) as crawler:
19 | crawler_config = CrawlerRunConfig(
20 |             session_id="hello_world",  # This lets us reuse the same page across calls
21 | )
22 |         result: CrawlResult = await crawler.arun(
23 | url="https://www.helloworld.org", config=crawler_config
24 | )
25 |         # Add a breakpoint here, and you will see that the page is still open and the browser has not closed
26 | print(result.markdown.raw_markdown[:500])
27 |
28 | new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True)
29 |         result: CrawlResult = await crawler.arun(  # No new fetch this time; this only executes JS in the already-open page
30 |             url="https://www.helloworld.org", config=new_config
31 | )
32 | print(result.js_execution_result) # You should see {'data':'hello'} in the console
33 |
34 |         # Get direct access to the Playwright page object. This works only if you reuse the same session_id and pass the same config
35 | page, context = crawler.crawler_strategy.get_page(new_config)
36 |
37 | if __name__ == "__main__":
38 | asyncio.run(main())
39 |
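40 | # Hedged note: when you are finished with a reused session, you can close it from
41 | # inside the `async with` block. The crawler strategy exposes a kill-session call
42 | # for this; the exact name below is an assumption, so verify it against your
43 | # installed version:
44 | #     await crawler.crawler_strategy.kill_session("hello_world")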
--------------------------------------------------------------------------------
/docs/examples/simple_anti_bot_examples.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
3 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
4 |
5 | # Example 1: Stealth Mode
6 | async def stealth_mode_example():
7 | browser_config = BrowserConfig(
8 | enable_stealth=True,
9 | headless=False
10 | )
11 |
12 | async with AsyncWebCrawler(config=browser_config) as crawler:
13 | result = await crawler.arun("https://example.com")
14 | return result.html[:500]
15 |
16 | # Example 2: Undetected Browser
17 | async def undetected_browser_example():
18 | browser_config = BrowserConfig(
19 | headless=False
20 | )
21 |
22 | adapter = UndetectedAdapter()
23 | strategy = AsyncPlaywrightCrawlerStrategy(
24 | browser_config=browser_config,
25 | browser_adapter=adapter
26 | )
27 |
28 | async with AsyncWebCrawler(
29 | crawler_strategy=strategy,
30 | config=browser_config
31 | ) as crawler:
32 | result = await crawler.arun("https://example.com")
33 | return result.html[:500]
34 |
35 | # Example 3: Both Combined
36 | async def combined_example():
37 | browser_config = BrowserConfig(
38 | enable_stealth=True,
39 | headless=False
40 | )
41 |
42 | adapter = UndetectedAdapter()
43 | strategy = AsyncPlaywrightCrawlerStrategy(
44 | browser_config=browser_config,
45 | browser_adapter=adapter
46 | )
47 |
48 | async with AsyncWebCrawler(
49 | crawler_strategy=strategy,
50 | config=browser_config
51 | ) as crawler:
52 | result = await crawler.arun("https://example.com")
53 | return result.html[:500]
54 |
55 | # Run examples
56 | if __name__ == "__main__":
57 | asyncio.run(stealth_mode_example())
58 | asyncio.run(undetected_browser_example())
59 | asyncio.run(combined_example())
--------------------------------------------------------------------------------
/docs/examples/ssl_example.py:
--------------------------------------------------------------------------------
1 | """Example showing how to work with SSL certificates in Crawl4AI."""
2 |
3 | import asyncio
4 | import os
5 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
6 |
7 | # Create tmp directory if it doesn't exist
8 | parent_dir = os.path.dirname(
9 | os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10 | )
11 | tmp_dir = os.path.join(parent_dir, "tmp")
12 | os.makedirs(tmp_dir, exist_ok=True)
13 |
14 |
15 | async def main():
16 | # Configure crawler to fetch SSL certificate
17 | config = CrawlerRunConfig(
18 | fetch_ssl_certificate=True,
19 | cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
20 | )
21 |
22 | async with AsyncWebCrawler() as crawler:
23 | result = await crawler.arun(url="https://example.com", config=config)
24 |
25 | if result.success and result.ssl_certificate:
26 | cert = result.ssl_certificate
27 |
28 | # 1. Access certificate properties directly
29 | print("\nCertificate Information:")
30 | print(f"Issuer: {cert.issuer.get('CN', '')}")
31 | print(f"Valid until: {cert.valid_until}")
32 | print(f"Fingerprint: {cert.fingerprint}")
33 |
34 | # 2. Export certificate in different formats
35 | cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
36 | print("\nCertificate exported to:")
37 | print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
38 |
39 | pem_data = cert.to_pem(
40 | os.path.join(tmp_dir, "certificate.pem")
41 | ) # For web servers
42 | print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
43 |
44 | der_data = cert.to_der(
45 | os.path.join(tmp_dir, "certificate.der")
46 | ) # For Java apps
47 | print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
48 |
49 |
50 | if __name__ == "__main__":
51 | asyncio.run(main())
52 |
--------------------------------------------------------------------------------
/docs/examples/stealth_test_simple.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple test to verify stealth mode is working
3 | """
4 |
5 | import asyncio
6 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
7 |
8 |
9 | async def test_stealth():
10 | """Test stealth mode effectiveness"""
11 |
12 | # Test WITHOUT stealth
13 | print("=== WITHOUT Stealth ===")
14 | config1 = BrowserConfig(
15 | headless=False,
16 | enable_stealth=False
17 | )
18 |
19 | async with AsyncWebCrawler(config=config1) as crawler:
20 | result = await crawler.arun(
21 | url="https://bot.sannysoft.com",
22 | config=CrawlerRunConfig(
23 | wait_until="networkidle",
24 | screenshot=True
25 | )
26 | )
27 | print(f"Success: {result.success}")
28 |         # Save the screenshot if one was captured
29 | if result.screenshot:
30 | with open("without_stealth.png", "wb") as f:
31 | import base64
32 | f.write(base64.b64decode(result.screenshot))
33 | print("Screenshot saved: without_stealth.png")
34 |
35 | # Test WITH stealth
36 | print("\n=== WITH Stealth ===")
37 | config2 = BrowserConfig(
38 | headless=False,
39 | enable_stealth=True
40 | )
41 |
42 | async with AsyncWebCrawler(config=config2) as crawler:
43 | result = await crawler.arun(
44 | url="https://bot.sannysoft.com",
45 | config=CrawlerRunConfig(
46 | wait_until="networkidle",
47 | screenshot=True
48 | )
49 | )
50 | print(f"Success: {result.success}")
51 |         # Save the screenshot if one was captured
52 | if result.screenshot:
53 | with open("with_stealth.png", "wb") as f:
54 | import base64
55 | f.write(base64.b64decode(result.screenshot))
56 | print("Screenshot saved: with_stealth.png")
57 |
58 | print("\nCheck the screenshots to see the difference in bot detection results!")
59 |
60 |
61 | if __name__ == "__main__":
62 | asyncio.run(test_stealth())
--------------------------------------------------------------------------------
/docs/examples/summarize_page.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from crawl4ai.web_crawler import WebCrawler
4 | from crawl4ai.chunking_strategy import *
5 | from crawl4ai import *
6 | from crawl4ai.crawler_strategy import *
7 |
8 | url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
9 |
10 | crawler = WebCrawler()
11 | crawler.warmup()
12 |
13 | from pydantic import BaseModel, Field
14 |
15 |
16 | class PageSummary(BaseModel):
17 | title: str = Field(..., description="Title of the page.")
18 | summary: str = Field(..., description="Summary of the page.")
19 | brief_summary: str = Field(..., description="Brief summary of the page.")
20 | keywords: list = Field(..., description="Keywords assigned to the page.")
21 |
22 |
23 | result = crawler.run(
24 | url=url,
25 | word_count_threshold=1,
26 | extraction_strategy=LLMExtractionStrategy(
27 | provider="openai/gpt-4o",
28 | api_token=os.getenv("OPENAI_API_KEY"),
29 | schema=PageSummary.model_json_schema(),
30 | extraction_type="schema",
31 | apply_chunking=False,
32 | instruction="From the crawled content, extract the following details: "
33 | "1. Title of the page "
34 | "2. Summary of the page, which is a detailed summary "
35 | "3. Brief summary of the page, which is a paragraph text "
36 | "4. Keywords assigned to the page, which is a list of keywords. "
37 | "The extracted JSON format should look like this: "
38 | '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
39 | ),
40 | bypass_cache=True,
41 | )
42 |
43 | page_summary = json.loads(result.extracted_content)
44 |
45 | print(page_summary)
46 |
47 | with open(".data/page_summary.json", "w", encoding="utf-8") as f:
48 | f.write(result.extracted_content)
49 |
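50 | # Hedged sketch: the same page summary using the current async API, modeled on
51 | # llm_extraction_openai_pricing.py in this folder. Wrapping provider/api_token in
52 | # an LLMConfig is assumed to match your installed crawl4ai version. Defined but
53 | # not invoked; await it from an async context (or run it with asyncio.run).
54 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
55 | from crawl4ai.extraction_strategy import LLMExtractionStrategy
56 |
57 | async def summarize_async(target_url: str) -> dict:
58 |     config = CrawlerRunConfig(
59 |         extraction_strategy=LLMExtractionStrategy(
60 |             llm_config=LLMConfig(provider="openai/gpt-4o",
61 |                                  api_token=os.getenv("OPENAI_API_KEY")),
62 |             schema=PageSummary.model_json_schema(),
63 |             extraction_type="schema",
64 |             instruction="Extract the title, a detailed summary, a brief summary, "
65 |                         "and the keywords of the page.",
66 |         )
67 |     )
68 |     async with AsyncWebCrawler() as crawler:
69 |         result = await crawler.arun(url=target_url, config=config)
70 |         return json.loads(result.extracted_content)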
--------------------------------------------------------------------------------
/docs/examples/undetectability/undetected_basic_test.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic Undetected Browser Test
3 | Simple example to test if undetected mode works
4 | """
5 |
6 | import asyncio
7 | from crawl4ai import AsyncWebCrawler, BrowserConfig
8 |
9 | async def test_regular_mode():
10 | """Test with regular browser"""
11 | print("Testing Regular Browser Mode...")
12 | browser_config = BrowserConfig(
13 | headless=False,
14 | verbose=True
15 | )
16 |
17 | async with AsyncWebCrawler(config=browser_config) as crawler:
18 | result = await crawler.arun(url="https://www.example.com")
19 | print(f"Regular Mode - Success: {result.success}")
20 | print(f"Regular Mode - Status: {result.status_code}")
21 | print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}")
22 | print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
23 | return result.success
24 |
25 | async def test_undetected_mode():
26 | """Test with undetected browser"""
27 | print("\nTesting Undetected Browser Mode...")
28 | from crawl4ai import UndetectedAdapter
29 | from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
30 |
31 | browser_config = BrowserConfig(
32 | headless=False,
33 | verbose=True
34 | )
35 |
36 | # Create undetected adapter
37 | undetected_adapter = UndetectedAdapter()
38 |
39 | # Create strategy with undetected adapter
40 | crawler_strategy = AsyncPlaywrightCrawlerStrategy(
41 | browser_config=browser_config,
42 | browser_adapter=undetected_adapter
43 | )
44 |
45 | async with AsyncWebCrawler(
46 | crawler_strategy=crawler_strategy,
47 | config=browser_config
48 | ) as crawler:
49 | result = await crawler.arun(url="https://www.example.com")
50 | print(f"Undetected Mode - Success: {result.success}")
51 | print(f"Undetected Mode - Status: {result.status_code}")
52 | print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}")
53 | print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
54 | return result.success
55 |
56 | async def main():
57 | """Run both tests"""
58 | print("🤖 Crawl4AI Basic Adapter Test\n")
59 |
60 | # Test regular mode
61 | regular_success = await test_regular_mode()
62 |
63 | # Test undetected mode
64 | undetected_success = await test_undetected_mode()
65 |
66 | # Summary
67 | print("\n" + "="*50)
68 | print("Summary:")
69 | print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}")
70 | print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}")
71 | print("="*50)
72 |
73 | if __name__ == "__main__":
74 | asyncio.run(main())
--------------------------------------------------------------------------------
/docs/examples/use_geo_location.py:
--------------------------------------------------------------------------------
1 | # use_geo_location.py
2 | """
3 | Example: override locale, timezone, and geolocation using Crawl4ai patterns.
4 |
5 | This demo uses `AsyncWebCrawler.arun()` to fetch a page with
6 | browser context primed for specific locale, timezone, and GPS,
7 | and saves a screenshot for visual verification.
8 | """
9 |
10 | import asyncio
11 | import base64
12 | from pathlib import Path
13 | from typing import List
14 | from crawl4ai import (
15 | AsyncWebCrawler,
16 | CrawlerRunConfig,
17 | BrowserConfig,
18 | GeolocationConfig,
19 | CrawlResult,
20 | )
21 |
22 | async def demo_geo_override():
23 | """Demo: Crawl a geolocation-test page with overrides and screenshot."""
24 | print("\n=== Geo-Override Crawl ===")
25 |
26 | # 1) Browser setup: use Playwright-managed contexts
27 | browser_cfg = BrowserConfig(
28 | headless=False,
29 | viewport_width=1280,
30 | viewport_height=720,
31 | use_managed_browser=False,
32 | )
33 |
34 | # 2) Run config: include locale, timezone_id, geolocation, and screenshot
35 | run_cfg = CrawlerRunConfig(
36 | url="https://browserleaks.com/geo", # test page that shows your location
37 | locale="en-US", # Accept-Language & UI locale
38 | timezone_id="America/Los_Angeles", # JS Date()/Intl timezone
39 | geolocation=GeolocationConfig( # override GPS coords
40 | latitude=34.0522,
41 | longitude=-118.2437,
42 | accuracy=10.0,
43 | ),
44 | screenshot=True, # capture screenshot after load
45 | session_id="geo_test", # reuse context if rerunning
46 | delay_before_return_html=5
47 | )
48 |
49 | async with AsyncWebCrawler(config=browser_cfg) as crawler:
50 | # 3) Run crawl (returns list even for single URL)
51 | results: List[CrawlResult] = await crawler.arun(
52 | url=run_cfg.url,
53 | config=run_cfg,
54 | )
55 | result = results[0]
56 |
57 | # 4) Save screenshot and report path
58 | if result.screenshot:
59 | __current_dir = Path(__file__).parent
60 | out_dir = __current_dir / "tmp"
61 | out_dir.mkdir(exist_ok=True)
62 | shot_path = out_dir / "geo_test.png"
63 | with open(shot_path, "wb") as f:
64 | f.write(base64.b64decode(result.screenshot))
65 | print(f"Saved screenshot to {shot_path}")
66 | else:
67 | print("No screenshot captured, check configuration.")
68 |
69 | if __name__ == "__main__":
70 | asyncio.run(demo_geo_override())
71 |
--------------------------------------------------------------------------------
/docs/examples/website-to-api/app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Startup script for the Web Scraper API with frontend interface.
4 | """
5 |
6 | import os
7 | import sys
8 | import uvicorn
9 | from pathlib import Path
10 |
11 | def main():
12 | # Check if static directory exists
13 | static_dir = Path("static")
14 | if not static_dir.exists():
15 | print("❌ Static directory not found!")
16 | print("Please make sure the 'static' directory exists with the frontend files.")
17 | sys.exit(1)
18 |
19 | # Check if required frontend files exist
20 | required_files = ["index.html", "styles.css", "script.js"]
21 | missing_files = []
22 |
23 | for file in required_files:
24 | if not (static_dir / file).exists():
25 | missing_files.append(file)
26 |
27 | if missing_files:
28 | print(f"❌ Missing frontend files: {', '.join(missing_files)}")
29 | print("Please make sure all frontend files are present in the static directory.")
30 | sys.exit(1)
31 |
32 | print("🚀 Starting Web Scraper API with Frontend Interface")
33 | print("=" * 50)
34 | print("📁 Static files found and ready to serve")
35 | print("🌐 Frontend will be available at: http://localhost:8000")
36 | print("🔌 API endpoints available at: http://localhost:8000/docs")
37 | print("=" * 50)
38 |
39 | # Start the server
40 | uvicorn.run(
41 | "api_server:app",
42 | host="0.0.0.0",
43 | port=8000,
44 | reload=True,
45 | log_level="info"
46 | )
47 |
48 | if __name__ == "__main__":
49 | main()
--------------------------------------------------------------------------------
/docs/examples/website-to-api/assets/crawl4ai_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/examples/website-to-api/assets/crawl4ai_logo.jpg
--------------------------------------------------------------------------------
/docs/examples/website-to-api/requirements.txt:
--------------------------------------------------------------------------------
1 | crawl4ai
2 | fastapi
3 | uvicorn
4 | pydantic
5 | litellm
--------------------------------------------------------------------------------
/docs/examples/website-to-api/test_api.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from web_scraper_lib import scrape_website
3 | import os
4 |
5 | async def test_library():
6 | """Test the mini library directly."""
7 | print("=== Testing Mini Library ===")
8 |
9 | # Test 1: Scrape with a custom model
10 | url = "https://marketplace.mainstreet.co.in/collections/adidas-yeezy/products/adidas-yeezy-boost-350-v2-yecheil-non-reflective"
11 | query = "Extract the following data: Product name, Product price, Product description, Product size. DO NOT EXTRACT ANYTHING ELSE."
12 | if os.path.exists("models"):
13 | model_name = os.listdir("models")[0].split(".")[0]
14 | else:
15 | raise Exception("No models found in models directory")
16 |
17 | print(f"Scraping: {url}")
18 | print(f"Query: {query}")
19 |
20 | try:
21 | result = await scrape_website(url, query, model_name)
22 | print("✅ Library test successful!")
23 | print(f"Extracted data: {result['extracted_data']}")
24 | except Exception as e:
25 | print(f"❌ Library test failed: {e}")
26 |
27 | if __name__ == "__main__":
28 | asyncio.run(test_library())
--------------------------------------------------------------------------------
/docs/examples/website-to-api/test_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Test script for the new model management functionality.
4 | This script demonstrates how to save and use custom model configurations.
5 | """
6 |
7 | import asyncio
8 | import requests
9 | import json
10 |
11 | # API base URL
12 | BASE_URL = "http://localhost:8000"
13 |
14 | def test_model_management():
15 | """Test the model management endpoints."""
16 |
17 | print("=== Testing Model Management ===")
18 |
19 | # 1. List current models
20 | print("\n1. Listing current models:")
21 | response = requests.get(f"{BASE_URL}/models")
22 | print(f"Status: {response.status_code}")
23 | print(f"Response: {json.dumps(response.json(), indent=2)}")
24 |
25 |
26 |     # 2. Save a model configuration (OpenAI example)
27 | print("\n2. Saving OpenAI model configuration:")
28 | openai_config = {
29 | "model_name": "my-openai",
30 | "provider": "openai",
31 | "api_token": "your-openai-api-key-here"
32 | }
33 |
34 | response = requests.post(f"{BASE_URL}/models", json=openai_config)
35 | print(f"Status: {response.status_code}")
36 | print(f"Response: {json.dumps(response.json(), indent=2)}")
37 |
38 |     # 3. List models again to see the new one
39 |     print("\n3. Listing models after adding the new one:")
40 | response = requests.get(f"{BASE_URL}/models")
41 | print(f"Status: {response.status_code}")
42 | print(f"Response: {json.dumps(response.json(), indent=2)}")
43 |
44 | # 4. Delete a model configuration
45 | print("\n4. Deleting a model configuration:")
46 | response = requests.delete(f"{BASE_URL}/models/my-openai")
47 | print(f"Status: {response.status_code}")
48 | print(f"Response: {json.dumps(response.json(), indent=2)}")
49 |
50 | # 5. Final list of models
51 | print("\n5. Final list of models:")
52 | response = requests.get(f"{BASE_URL}/models")
53 | print(f"Status: {response.status_code}")
54 | print(f"Response: {json.dumps(response.json(), indent=2)}")
55 |
56 | if __name__ == "__main__":
57 | print("Model Management Test Script")
58 | print("Make sure the API server is running on http://localhost:8000")
59 | print("=" * 50)
60 |
61 | try:
62 | test_model_management()
63 | except requests.exceptions.ConnectionError:
64 | print("Error: Could not connect to the API server.")
65 | print("Make sure the server is running with: python api_server.py")
66 | except Exception as e:
67 | print(f"Error: {e}")
--------------------------------------------------------------------------------
/docs/md_v2/advanced/crawl-dispatcher.md:
--------------------------------------------------------------------------------
1 | # Crawl Dispatcher
2 |
3 | We’re excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler’s status, memory usage, and overall progress.
4 |
5 | Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X.
6 |
7 | Below is a **sample** of how the dispatcher’s performance monitor might look in action:
8 |
9 | 
10 |
11 |
12 | We can’t wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates!
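13 |
14 | In the meantime, here is a minimal sketch of the direction this is heading. It relies only on the existing `arun_many()` entry point; the dispatcher itself (and its configuration surface) is not final, so treat the comment in the snippet as intent rather than a finished API.
15 |
16 | ```python
17 | import asyncio
18 | from crawl4ai import AsyncWebCrawler
19 |
20 | async def main():
21 |     urls = [f"https://example.com/page/{i}" for i in range(20)]
22 |     async with AsyncWebCrawler() as crawler:
23 |         # The dispatcher will sit behind calls like arun_many(), throttling
24 |         # concurrency based on available memory/CPU and reporting progress.
25 |         results = await crawler.arun_many(urls)
26 |         print(f"Crawled {len(results)} pages")
27 |
28 | asyncio.run(main())
29 | ```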
--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Bold.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Italic.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/assets/DankMono-Regular.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Bold.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Italic.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/c4a-script/assets/DankMono-Regular.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/blockly-demo.c4a:
--------------------------------------------------------------------------------
1 | # Demo: Login Flow with Blockly
2 | # This script can be created visually using Blockly blocks
3 |
4 | GO https://example.com/login
5 | WAIT `#login-form` 5
6 |
7 | # Check if already logged in
8 | IF (EXISTS `.user-avatar`) THEN GO https://example.com/dashboard
9 |
10 | # Fill login form
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 | CLICK `#password`
14 | TYPE "password123"
15 |
16 | # Submit form
17 | CLICK `button[type="submit"]`
18 | WAIT `.dashboard` 10
19 |
20 | # Success message
21 | EVAL `console.log('Login successful!')`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/requirements.txt:
--------------------------------------------------------------------------------
1 | flask>=2.3.0
2 | flask-cors>=4.0.0
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/01-basic-interaction.c4a:
--------------------------------------------------------------------------------
1 | # Basic Page Interaction
2 | # This script demonstrates basic C4A commands
3 |
4 | # Navigate to the playground
5 | GO http://127.0.0.1:8080/playground/
6 |
7 | # Wait for page to load
8 | WAIT `body` 2
9 |
10 | # Handle cookie banner if present
11 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
12 |
13 | # Close newsletter popup if it appears
14 | WAIT 3
15 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
16 |
17 | # Click the start tutorial button
18 | CLICK `#start-tutorial`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/02-login-flow.c4a:
--------------------------------------------------------------------------------
1 | # Complete Login Flow
2 | # Demonstrates form interaction and authentication
3 |
4 | # Click login button
5 | CLICK `#login-btn`
6 |
7 | # Wait for login modal
8 | WAIT `.login-form` 3
9 |
10 | # Fill in credentials
11 | CLICK `#email`
12 | TYPE "demo@example.com"
13 |
14 | CLICK `#password`
15 | TYPE "demo123"
16 |
17 | # Check remember me
18 | IF (EXISTS `#remember-me`) THEN CLICK `#remember-me`
19 |
20 | # Submit form
21 | CLICK `button[type="submit"]`
22 |
23 | # Wait for success
24 | WAIT `.welcome-message` 5
25 |
26 | # Verify login succeeded
27 | IF (EXISTS `.user-info`) THEN EVAL `console.log('✅ Login successful!')`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/03-infinite-scroll.c4a:
--------------------------------------------------------------------------------
1 | # Infinite Scroll Product Loading
2 | # Load all products using scroll automation
3 |
4 | # Navigate to catalog
5 | CLICK `#catalog-link`
6 | WAIT `.product-grid` 3
7 |
8 | # Switch to infinite scroll mode
9 | CLICK `#infinite-scroll-btn`
10 |
11 | # Define scroll procedure
12 | PROC load_more_products
13 | # Get current product count
14 | EVAL `window.initialCount = document.querySelectorAll('.product-card').length`
15 |
16 | # Scroll down
17 | SCROLL DOWN 1000
18 | WAIT 2
19 |
20 | # Check if more products loaded
21 | EVAL `
22 | const newCount = document.querySelectorAll('.product-card').length;
23 | console.log('Products loaded: ' + newCount);
24 | window.moreLoaded = newCount > window.initialCount;
25 | `
26 | ENDPROC
27 |
28 | # Load products until no more
29 | REPEAT (load_more_products, `window.moreLoaded !== false`)
30 |
31 | # Final count
32 | EVAL `console.log('✅ Total products: ' + document.querySelectorAll('.product-card').length)`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/04-multi-step-form.c4a:
--------------------------------------------------------------------------------
1 | # Multi-step Form Wizard
2 | # Complete a complex form with multiple steps
3 |
4 | # Navigate to forms section
5 | CLICK `a[href="#forms"]`
6 | WAIT `#survey-form` 2
7 |
8 | # Step 1: Basic Information
9 | CLICK `#full-name`
10 | TYPE "John Doe"
11 |
12 | CLICK `#survey-email`
13 | TYPE "john.doe@example.com"
14 |
15 | # Go to next step
16 | CLICK `.next-step`
17 | WAIT 1
18 |
19 | # Step 2: Select Interests
20 | # Select multiple options
21 | CLICK `#interests`
22 | CLICK `option[value="tech"]`
23 | CLICK `option[value="music"]`
24 | CLICK `option[value="travel"]`
25 |
26 | # Continue to final step
27 | CLICK `.next-step`
28 | WAIT 1
29 |
30 | # Step 3: Review and Submit
31 | # Verify we're on the last step
32 | IF (EXISTS `#submit-survey`) THEN EVAL `console.log('📋 On final step')`
33 |
34 | # Submit the form
35 | CLICK `#submit-survey`
36 |
37 | # Wait for success message
38 | WAIT `.success-message` 5
39 |
40 | # Verify submission
41 | IF (EXISTS `.success-message`) THEN EVAL `console.log('✅ Survey submitted successfully!')`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/scripts/05-complex-workflow.c4a:
--------------------------------------------------------------------------------
1 | # Complete E-commerce Workflow
2 | # Login, browse products, and interact with various elements
3 |
4 | # Define reusable procedures
5 | PROC handle_popups
6 | IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
7 | IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`
8 | ENDPROC
9 |
10 | PROC login_user
11 | CLICK `#login-btn`
12 | WAIT `.login-form` 2
13 | CLICK `#email`
14 | TYPE "demo@example.com"
15 | CLICK `#password`
16 | TYPE "demo123"
17 | CLICK `button[type="submit"]`
18 | WAIT `.welcome-message` 5
19 | ENDPROC
20 |
21 | PROC browse_products
22 | # Go to catalog
23 | CLICK `#catalog-link`
24 | WAIT `.product-grid` 3
25 |
26 | # Apply filters
27 | CLICK `.collapsible`
28 | WAIT 0.5
29 | CLICK `input[type="checkbox"]`
30 |
31 | # Load some products
32 | SCROLL DOWN 500
33 | WAIT 1
34 | SCROLL DOWN 500
35 | WAIT 1
36 | ENDPROC
37 |
38 | # Main workflow
39 | GO http://127.0.0.1:8080/playground/
40 | WAIT `body` 2
41 |
42 | # Handle initial popups
43 | handle_popups
44 |
45 | # Login if not already
46 | IF (NOT EXISTS `.user-info`) THEN login_user
47 |
48 | # Browse products
49 | browse_products
50 |
51 | # Navigate to tabs demo
52 | CLICK `a[href="#tabs"]`
53 | WAIT `.tabs-container` 2
54 |
55 | # Interact with tabs
56 | CLICK `button[data-tab="reviews"]`
57 | WAIT 1
58 |
59 | # Load comments
60 | IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
61 | WAIT `.comments-section` 2
62 |
63 | # Check specifications
64 | CLICK `button[data-tab="specs"]`
65 | WAIT 1
66 |
67 | # Final navigation to data tables
68 | CLICK `a[href="#data"]`
69 | WAIT `.data-table` 2
70 |
71 | # Search in table
72 | CLICK `.search-input`
73 | TYPE "User"
74 |
75 | # Load more rows
76 | CLICK `.load-more-rows`
77 | WAIT 1
78 |
79 | # Export data
80 | CLICK `#export-btn`
81 |
82 | EVAL `console.log('✅ Workflow completed successfully!')`
--------------------------------------------------------------------------------
/docs/md_v2/apps/c4a-script/test_blockly.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
6 | <title>Blockly Test</title>
7 | <style>
8 | body {
9 | margin: 0;
10 | padding: 20px;
11 | background: #0e0e10;
12 | color: #e0e0e0;
13 | font-family: monospace;
14 | }
15 | #blocklyDiv {
16 | height: 600px;
17 | width: 100%;
18 | border: 1px solid #2a2a2c;
19 | }
20 | #output {
21 | margin-top: 20px;
22 | padding: 15px;
23 | background: #1a1a1b;
24 | border: 1px solid #2a2a2c;
25 | white-space: pre-wrap;
26 | }
27 | </style>
28 | </head>
29 | <body>
30 | <h1>C4A-Script Blockly Test</h1>
31 | <div id="blocklyDiv"></div>
32 | <div id="output">
33 | <h3>Generated C4A-Script:</h3>
34 | <pre id="code-output"></pre>
35 | </div>
36 |
37 | <script src="https://unpkg.com/blockly/blockly.min.js"></script>
38 | <script src="assets/c4a-blocks.js"></script>
39 | <script>
40 | // Simple test
41 | const workspace = Blockly.inject('blocklyDiv', {
42 | toolbox: `
43 | <xml>
44 | <category name="Test" colour="#1E88E5">
45 | <block type="c4a_go"></block>
46 | <block type="c4a_wait_time"></block>
47 | <block type="c4a_click"></block>
48 | </category>
49 | </xml>
50 | `,
51 | theme: Blockly.Theme.defineTheme('dark', {
52 | 'base': Blockly.Themes.Classic,
53 | 'componentStyles': {
54 | 'workspaceBackgroundColour': '#0e0e10',
55 | 'toolboxBackgroundColour': '#1a1a1b',
56 | 'toolboxForegroundColour': '#e0e0e0',
57 | 'flyoutBackgroundColour': '#1a1a1b',
58 | 'flyoutForegroundColour': '#e0e0e0',
59 | }
60 | })
61 | });
62 |
63 | workspace.addChangeListener((event) => {
64 | const code = Blockly.JavaScript.workspaceToCode(workspace);
65 | document.getElementById('code-output').textContent = code;
66 | });
67 | </script>
68 | </body>
69 | </html>
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Bold.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Italic.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/assets/DankMono-Regular.woff2
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/background/service-worker.js:
--------------------------------------------------------------------------------
1 | // Service worker for Crawl4AI Assistant
2 |
3 | // Handle messages from content script
4 | chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
5 | if (message.action === 'downloadCode' || message.action === 'downloadScript') {
6 | try {
7 | // Create a data URL for the Python code
8 | const dataUrl = 'data:text/plain;charset=utf-8,' + encodeURIComponent(message.code);
9 |
10 | // Download the file
11 | chrome.downloads.download({
12 | url: dataUrl,
13 | filename: message.filename || 'crawl4ai_schema.py',
14 | saveAs: true
15 | }, (downloadId) => {
16 | if (chrome.runtime.lastError) {
17 | console.error('Download failed:', chrome.runtime.lastError);
18 | sendResponse({ success: false, error: chrome.runtime.lastError.message });
19 | } else {
20 | console.log('Download started with ID:', downloadId);
21 | sendResponse({ success: true, downloadId: downloadId });
22 | }
23 | });
24 | } catch (error) {
25 | console.error('Error creating download:', error);
26 | sendResponse({ success: false, error: error.message });
27 | }
28 |
29 | return true; // Keep the message channel open for async response
30 | }
31 |
32 | return false;
33 | });
34 |
35 | // Clean up on extension install/update
36 | chrome.runtime.onInstalled.addListener(() => {
37 | // Clear any stored state
38 | chrome.storage.local.clear();
39 | });
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/content/content.js:
--------------------------------------------------------------------------------
1 | // Main content script for Crawl4AI Assistant
2 | // Coordinates between Click2Crawl, ScriptBuilder, and MarkdownExtraction
3 |
4 | let activeBuilder = null;
5 |
6 | // Listen for messages from popup
7 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
8 | if (request.action === 'startCapture') {
9 | if (activeBuilder) {
10 | console.log('Stopping existing capture session');
11 | activeBuilder.stop();
12 | activeBuilder = null;
13 | }
14 |
15 | if (request.mode === 'schema') {
16 | console.log('Starting Click2Crawl');
17 | activeBuilder = new Click2Crawl();
18 | activeBuilder.start();
19 | } else if (request.mode === 'script') {
20 | console.log('Starting Script Builder');
21 | activeBuilder = new ScriptBuilder();
22 | activeBuilder.start();
23 | }
24 |
25 | sendResponse({ success: true });
26 | } else if (request.action === 'stopCapture') {
27 | if (activeBuilder) {
28 | activeBuilder.stop();
29 | activeBuilder = null;
30 | }
31 | sendResponse({ success: true });
32 | } else if (request.action === 'startSchemaCapture') {
33 | if (activeBuilder) {
34 | activeBuilder.deactivate?.();
35 | activeBuilder = null;
36 | }
37 | console.log('Starting Click2Crawl');
38 | activeBuilder = new Click2Crawl();
39 | activeBuilder.start();
40 | sendResponse({ success: true });
41 | } else if (request.action === 'startScriptCapture') {
42 | if (activeBuilder) {
43 | activeBuilder.deactivate?.();
44 | activeBuilder = null;
45 | }
46 | console.log('Starting Script Builder');
47 | activeBuilder = new ScriptBuilder();
48 | activeBuilder.start();
49 | sendResponse({ success: true });
50 | } else if (request.action === 'startClick2Crawl') {
51 | if (activeBuilder) {
52 | activeBuilder.deactivate?.();
53 | activeBuilder = null;
54 | }
55 | console.log('Starting Markdown Extraction');
56 | activeBuilder = new MarkdownExtraction();
57 | sendResponse({ success: true });
58 | } else if (request.action === 'generateCode') {
59 | if (activeBuilder && activeBuilder.generateCode) {
60 | activeBuilder.generateCode();
61 | }
62 | sendResponse({ success: true });
63 | }
64 | });
65 |
66 | // Cleanup on page unload
67 | window.addEventListener('beforeunload', () => {
68 | if (activeBuilder) {
69 | if (activeBuilder.deactivate) {
70 | activeBuilder.deactivate();
71 | } else if (activeBuilder.stop) {
72 | activeBuilder.stop();
73 | }
74 | activeBuilder = null;
75 | }
76 | });
77 |
78 | console.log('Crawl4AI Assistant content script loaded');
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/favicon.ico
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-128.png
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-16.png
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/icons/icon-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/icons/icon-48.png
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "manifest_version": 3,
3 | "name": "Crawl4AI Assistant",
4 | "version": "1.3.0",
5 | "description": "Visual schema and script builder for Crawl4AI - Build extraction schemas and automation scripts by clicking and recording actions",
6 | "permissions": [
7 | "activeTab",
8 | "storage",
9 | "downloads"
10 | ],
11 | "host_permissions": [
12 | "<all_urls>"
13 | ],
14 | "action": {
15 | "default_popup": "popup/popup.html",
16 | "default_icon": {
17 | "16": "icons/icon-16.png",
18 | "48": "icons/icon-48.png",
19 | "128": "icons/icon-128.png"
20 | }
21 | },
22 | "content_scripts": [
23 | {
24 | "matches": ["<all_urls>"],
25 | "js": [
26 | "libs/marked.min.js",
27 | "content/shared/utils.js",
28 | "content/markdownPreviewModal.js",
29 | "content/click2crawl.js",
30 | "content/scriptBuilder.js",
31 | "content/contentAnalyzer.js",
32 | "content/markdownConverter.js",
33 | "content/markdownExtraction.js",
34 | "content/content.js"
35 | ],
36 | "css": ["content/overlay.css"],
37 | "run_at": "document_idle"
38 | }
39 | ],
40 | "background": {
41 | "service_worker": "background/service-worker.js"
42 | },
43 | "icons": {
44 | "16": "icons/icon-16.png",
45 | "48": "icons/icon-48.png",
46 | "128": "icons/icon-128.png"
47 | },
48 | "web_accessible_resources": [
49 | {
50 | "resources": ["icons/*", "assets/*"],
51 | "matches": ["<all_urls>"]
52 | }
53 | ]
54 | }
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/favicon.ico
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-128.png
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-16.png
--------------------------------------------------------------------------------
/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/apps/crawl4ai-assistant/popup/icons/icon-48.png
--------------------------------------------------------------------------------
/docs/md_v2/ask_ai/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
6 | <title>Crawl4AI Assistant</title>
7 | <!-- Link main styles first for variable access -->
8 | <link rel="stylesheet" href="../assets/layout.css">
9 | <link rel="stylesheet" href="../assets/styles.css">
10 | <!-- Link specific AI styles -->
11 | <link rel="stylesheet" href="../assets/highlight.css">
12 | <link rel="stylesheet" href="ask-ai.css">
13 | </head>
14 | <body>
15 | <div class="ai-assistant-container">
16 |
17 | <!-- Left Sidebar: Conversation History -->
18 | <aside id="history-panel" class="sidebar left-sidebar">
19 | <header>
20 | <h3>History</h3>
21 | <button id="new-chat-button" class="btn btn-sm">New Chat</button>
22 | </header>
23 | <ul id="history-list">
24 | <!-- History items populated by JS -->
25 | </ul>
26 | </aside>
27 |
28 | <!-- Main Area: Chat Interface -->
29 | <main id="chat-panel">
30 | <div id="chat-messages">
31 | <!-- Chat messages populated by JS -->
32 | <div class="message ai-message welcome-message">
33 | Welcome to the Crawl4AI Assistant! How can I help you today?
34 | </div>
35 | </div>
36 | <div id="chat-input-area">
37 | <!-- Loading indicator for general waiting (optional) -->
38 | <!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
39 | <textarea id="chat-input" placeholder="We will roll out this feature very soon." rows="2" disabled></textarea>
40 | <button id="send-button">Send</button>
41 | </div>
42 | </main>
43 |
44 | <!-- Right Sidebar: Citations / Context -->
45 | <aside id="citations-panel" class="sidebar right-sidebar">
46 | <header>
47 | <h3>Citations</h3>
48 | </header>
49 | <ul id="citations-list">
50 | <!-- Citations populated by JS -->
51 | <li class="no-citations">No citations for this response yet.</li>
52 | </ul>
53 | </aside>
54 |
55 | </div>
56 |
57 | <!-- Include Marked.js library -->
58 | <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
59 | <script src="../assets/highlight.min.js"></script>
60 |
61 | <!-- Your AI Assistant Logic -->
62 | <script src="ask-ai.js"></script>
63 | </body>
64 | </html>
--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Bold.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Bold.woff2
--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Italic.woff2
--------------------------------------------------------------------------------
/docs/md_v2/assets/DankMono-Regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/DankMono-Regular.woff2
--------------------------------------------------------------------------------
/docs/md_v2/assets/Monaco.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/Monaco.woff
--------------------------------------------------------------------------------
/docs/md_v2/assets/copy_code.js:
--------------------------------------------------------------------------------
1 | // ==== File: docs/assets/copy_code.js ====
2 |
3 | document.addEventListener('DOMContentLoaded', () => {
4 | // Target specifically code blocks within the main content area
5 | const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
6 |
7 | codeBlocks.forEach((codeElement) => {
8 | const preElement = codeElement.parentElement; // The <pre> tag
9 |
10 | // Ensure the <pre> tag can contain a positioned button
11 | if (window.getComputedStyle(preElement).position === 'static') {
12 | preElement.style.position = 'relative';
13 | }
14 |
15 | // Create the button
16 | const copyButton = document.createElement('button');
17 | copyButton.className = 'copy-code-button';
18 | copyButton.type = 'button';
19 | copyButton.setAttribute('aria-label', 'Copy code to clipboard');
20 | copyButton.title = 'Copy code to clipboard';
21 | copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
22 |
23 | // Append the button to the <pre> element
24 | preElement.appendChild(copyButton);
25 |
26 | // Add click event listener
27 | copyButton.addEventListener('click', () => {
28 | copyCodeToClipboard(codeElement, copyButton);
29 | });
30 | });
31 |
32 | async function copyCodeToClipboard(codeElement, button) {
33 | // Use innerText to get the rendered text content, preserving line breaks
34 | const textToCopy = codeElement.innerText;
35 |
36 | try {
37 | await navigator.clipboard.writeText(textToCopy);
38 |
39 | // Visual feedback
40 | button.innerHTML = 'Copied!';
41 | button.classList.add('copied');
42 | button.disabled = true; // Temporarily disable
43 |
44 | // Revert button state after a short delay
45 | setTimeout(() => {
46 | button.innerHTML = 'Copy';
47 | button.classList.remove('copied');
48 | button.disabled = false;
49 | }, 2000); // Show "Copied!" for 2 seconds
50 |
51 | } catch (err) {
52 | console.error('Failed to copy code: ', err);
53 | // Optional: Provide error feedback on the button
54 | button.innerHTML = 'Error';
55 | setTimeout(() => {
56 | button.innerHTML = 'Copy';
57 | }, 2000);
58 | }
59 | }
60 |
61 | console.log("Copy Code Button script loaded.");
62 | });
--------------------------------------------------------------------------------
/docs/md_v2/assets/crawl4ai-skill.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/crawl4ai-skill.zip
--------------------------------------------------------------------------------
/docs/md_v2/assets/docs.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/docs.zip
--------------------------------------------------------------------------------
/docs/md_v2/assets/feedback-overrides.css:
--------------------------------------------------------------------------------
1 | /* docs/assets/feedback-overrides.css */
2 | :root {
3 | /* brand */
4 | --feedback-primary-color: #09b5a5;
5 | --feedback-highlight-color: #fed500; /* stars etc */
6 |
7 | /* modal shell / text */
8 | --feedback-modal-content-bg-color: var(--background-color);
9 | --feedback-modal-content-text-color: var(--font-color);
10 | --feedback-modal-content-border-color: var(--primary-dimmed-color);
11 | --feedback-modal-content-border-radius: 4px;
12 |
13 | /* overlay */
14 | --feedback-overlay-bg-color: rgba(0,0,0,.75);
15 |
16 | /* rating buttons */
17 | --feedback-modal-rating-button-color: var(--secondary-color);
18 | --feedback-modal-rating-button-selected-color: var(--primary-color);
19 |
20 | /* inputs */
21 | --feedback-modal-input-bg-color: var(--code-bg-color);
22 | --feedback-modal-input-text-color: var(--font-color);
23 | --feedback-modal-input-border-color: var(--primary-dimmed-color);
24 | --feedback-modal-input-border-color-focused: var(--primary-color);
25 |
26 | /* submit / secondary buttons */
27 | --feedback-modal-button-submit-bg-color: var(--primary-color);
28 | --feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
29 | --feedback-modal-button-submit-text-color: var(--invert-font-color);
30 |
31 | --feedback-modal-button-bg-color: transparent; /* screenshot btn */
32 | --feedback-modal-button-border-color: var(--primary-color);
33 | --feedback-modal-button-icon-color: var(--primary-color);
34 | }
35 |
36 | /* optional: keep the “Powered by” link subtle */
37 | .feedback-logo a{color:var(--secondary-color);}
38 |
--------------------------------------------------------------------------------
/docs/md_v2/assets/floating_ask_ai_button.js:
--------------------------------------------------------------------------------
1 | // ==== File: docs/assets/floating_ask_ai_button.js ====
2 |
3 | document.addEventListener('DOMContentLoaded', () => {
4 | const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
5 | const currentPath = window.location.pathname;
6 |
7 | // Determine the base URL for constructing the link correctly,
8 | // especially if deployed in a sub-directory.
9 | // This assumes a simple structure; adjust if needed.
10 | const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
11 |
12 |
13 | // Check if the current page IS the Ask AI page
14 | // Use includes() for flexibility (handles trailing slash or .html)
15 | if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
16 | console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
17 | return; // Don't add the button on the target page
18 | }
19 |
20 | // --- Create the button ---
21 | const fabLink = document.createElement('a');
22 | fabLink.className = 'floating-ask-ai-button';
23 | fabLink.href = askAiPagePath; // Construct the correct URL
24 | fabLink.title = 'Ask Crawl4AI Assistant';
25 | fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
26 |
27 | // Add content (using SVG icon for better visuals)
28 | fabLink.innerHTML = `
29 | <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
30 | <path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
31 | </svg>
32 | <span>Ask AI</span>
33 | `;
34 |
35 | // Append to body
36 | document.body.appendChild(fabLink);
37 |
38 | console.log("Floating Ask AI Button added.");
39 | });
--------------------------------------------------------------------------------
/docs/md_v2/assets/gtag.js:
--------------------------------------------------------------------------------
1 | window.dataLayer = window.dataLayer || [];
2 | function gtag(){dataLayer.push(arguments);}
3 | gtag('js', new Date());
4 |
5 | gtag('config', 'G-58W0K2ZQ25');
6 |
--------------------------------------------------------------------------------
/docs/md_v2/assets/highlight.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/highlight.css
--------------------------------------------------------------------------------
/docs/md_v2/assets/highlight_init.js:
--------------------------------------------------------------------------------
1 | document.addEventListener('DOMContentLoaded', (event) => {
2 | document.querySelectorAll('pre code').forEach((block) => {
3 | hljs.highlightBlock(block);
4 | });
5 | });
6 |
--------------------------------------------------------------------------------
/docs/md_v2/assets/images/dispatcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/images/dispatcher.png
--------------------------------------------------------------------------------
/docs/md_v2/assets/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/assets/images/logo.png
--------------------------------------------------------------------------------
/docs/md_v2/blog/releases/0.7.1.md:
--------------------------------------------------------------------------------
1 | # 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
2 |
3 | *July 17, 2025 • 2 min read*
4 |
5 | ---
6 |
7 | A small maintenance release that removes unused code and improves documentation.
8 |
9 | ## 🎯 What's Changed
10 |
11 | - **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
12 | - **Updated documentation** with better examples and parameter explanations
13 | - **Fixed virtual scroll configuration** examples in docs
14 |
15 | ## 🧹 Code Cleanup
16 |
17 | Removed the unused `StealthConfig` import and its configuration; neither was referenced anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
18 |
19 | ```python
20 | # Removed unused code:
21 | from playwright_stealth import StealthConfig
22 | stealth_config = StealthConfig(...) # This was never used
23 | ```
24 |
25 | ## 📖 Documentation Updates
26 |
27 | - Fixed adaptive crawling parameter examples
28 | - Updated session management documentation
29 | - Corrected virtual scroll configuration examples
30 |
31 | ## 🚀 Installation
32 |
33 | ```bash
34 | pip install crawl4ai==0.7.1
35 | ```
36 |
37 | No breaking changes - upgrade directly from v0.7.0.
38 |
39 | ---
40 |
41 | Questions? Issues?
42 | - GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
43 | - Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
--------------------------------------------------------------------------------
/docs/md_v2/core/cache-modes.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI Cache System and Migration Guide
2 |
3 | ## Overview
4 | Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
5 |
6 | ## Old vs New Approach
7 |
8 | ### Old Way (Deprecated)
9 | The old system used multiple boolean flags:
10 | - `bypass_cache`: Skip cache entirely
11 | - `disable_cache`: Disable all caching
12 | - `no_cache_read`: Don't read from cache
13 | - `no_cache_write`: Don't write to cache
14 |
15 | ### New Way (Recommended)
16 | The new system uses a single `CacheMode` enum:
17 | - `CacheMode.ENABLED`: Normal caching (read/write)
18 | - `CacheMode.DISABLED`: No caching at all
19 | - `CacheMode.READ_ONLY`: Only read from cache
20 | - `CacheMode.WRITE_ONLY`: Only write to cache
21 | - `CacheMode.BYPASS`: Skip cache for this operation
22 |
23 | ## Migration Example
24 |
25 | ### Old Code (Deprecated)
26 | ```python
27 | import asyncio
28 | from crawl4ai import AsyncWebCrawler
29 |
30 | async def use_proxy():
31 | async with AsyncWebCrawler(verbose=True) as crawler:
32 | result = await crawler.arun(
33 | url="https://www.nbcnews.com/business",
34 | bypass_cache=True # Old way
35 | )
36 | print(len(result.markdown))
37 |
38 | async def main():
39 | await use_proxy()
40 |
41 | if __name__ == "__main__":
42 | asyncio.run(main())
43 | ```
44 |
45 | ### New Code (Recommended)
46 | ```python
47 | import asyncio
48 | from crawl4ai import AsyncWebCrawler, CacheMode
49 | from crawl4ai.async_configs import CrawlerRunConfig
50 |
51 | async def use_proxy():
52 | # Use CacheMode in CrawlerRunConfig
53 | config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
54 | async with AsyncWebCrawler(verbose=True) as crawler:
55 | result = await crawler.arun(
56 | url="https://www.nbcnews.com/business",
57 | config=config # Pass the configuration object
58 | )
59 | print(len(result.markdown))
60 |
61 | async def main():
62 | await use_proxy()
63 |
64 | if __name__ == "__main__":
65 | asyncio.run(main())
66 | ```
67 |
68 | ## Common Migration Patterns
69 |
70 | | Old Flag | New Mode |
71 | |-----------------------|---------------------------------|
72 | | `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` |
73 | | `disable_cache=True` | `cache_mode=CacheMode.DISABLED`|
74 | | `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` |
75 | | `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
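
The table above covers the flag-for-flag replacements. As a quick illustration of the two read/write-only modes, here is a minimal sketch (not part of the original guide) that reuses the same `AsyncWebCrawler` / `CrawlerRunConfig` API shown in the examples above; the URL is an arbitrary placeholder.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig

async def demo_cache_modes():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Populate the cache without reading from it (old: no_cache_read=True)
        warmed = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=CrawlerRunConfig(cache_mode=CacheMode.WRITE_ONLY),
        )
        # Serve the same page from the cache only (old: no_cache_write=True)
        cached = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=CrawlerRunConfig(cache_mode=CacheMode.READ_ONLY),
        )
        print(len(warmed.markdown), len(cached.markdown))

if __name__ == "__main__":
    asyncio.run(demo_cache_modes())
```

`CacheMode.ENABLED` restores the normal read/write behaviour described at the top of this guide.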
--------------------------------------------------------------------------------
/docs/md_v2/core/llmtxt.md:
--------------------------------------------------------------------------------
1 | <div class="llmtxt-container">
2 | <iframe id="llmtxt-frame" src="../../llmtxt/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI LLM Context Builder"></iframe>
3 | </div>
4 |
5 | <script>
6 | // Iframe height adjustment
7 | function resizeLLMtxtIframe() {
8 | const iframe = document.getElementById('llmtxt-frame');
9 | if (iframe) {
10 | const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
11 | const topOffset = headerHeight + 20;
12 | const availableHeight = window.innerHeight - topOffset;
13 | iframe.style.height = Math.max(800, availableHeight) + 'px';
14 | }
15 | }
16 |
17 | // Run immediately and on resize/load
18 | resizeLLMtxtIframe();
19 | let resizeTimer;
20 | window.addEventListener('load', resizeLLMtxtIframe);
21 | window.addEventListener('resize', () => {
22 | clearTimeout(resizeTimer);
23 | resizeTimer = setTimeout(resizeLLMtxtIframe, 150);
24 | });
25 |
26 | // Remove Footer & HR from parent page
27 | document.addEventListener('DOMContentLoaded', () => {
28 | setTimeout(() => {
29 | const footer = window.parent.document.querySelector('footer');
30 | if (footer) {
31 | const hrBeforeFooter = footer.previousElementSibling;
32 | if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
33 | hrBeforeFooter.remove();
34 | }
35 | footer.remove();
36 | resizeLLMtxtIframe();
37 | }
38 | }, 100);
39 | });
40 | </script>
41 |
42 | <style>
43 | #terminal-mkdocs-main-content {
44 | padding: 0 !important;
45 | margin: 0;
46 | width: 100%;
47 | height: 100%;
48 | overflow: hidden;
49 | }
50 |
51 | #terminal-mkdocs-main-content .llmtxt-container {
52 | margin: 0;
53 | padding: 0;
54 | max-width: none;
55 | overflow: hidden;
56 | }
57 |
58 | #terminal-mkdocs-toc-panel {
59 | display: none !important;
60 | }
61 | </style>
--------------------------------------------------------------------------------
/docs/md_v2/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/favicon.ico
--------------------------------------------------------------------------------
/docs/md_v2/img/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon-32x32.png
--------------------------------------------------------------------------------
/docs/md_v2/img/favicon-x-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon-x-32x32.png
--------------------------------------------------------------------------------
/docs/md_v2/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/md_v2/img/favicon.ico
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/README.md:
--------------------------------------------------------------------------------
1 | # Crawl4AI Marketplace
2 |
3 | A terminal-themed marketplace for tools, integrations, and resources related to Crawl4AI.
4 |
5 | ## Setup
6 |
7 | ### Backend
8 |
9 | 1. Install dependencies:
10 | ```bash
11 | cd backend
12 | pip install -r requirements.txt
13 | ```
14 |
15 | 2. Generate dummy data:
16 | ```bash
17 | python dummy_data.py
18 | ```
19 |
20 | 3. Run the server:
21 | ```bash
22 | python server.py
23 | ```
24 |
25 | The API will be available at http://localhost:8100
26 |
27 | ### Frontend
28 |
29 | 1. Open `frontend/index.html` in your browser
30 | 2. Or serve via MkDocs as part of the documentation site
31 |
32 | ## Database Schema
33 |
34 | The marketplace uses SQLite with automatic migration from `schema.yaml`. Tables include:
35 | - **apps**: Tools and integrations
36 | - **articles**: Reviews, tutorials, and news
37 | - **categories**: App categories
38 | - **sponsors**: Sponsored content
39 |
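If you want to poke at the data directly, the tables can be queried with Python's built-in `sqlite3` module. This is only a sketch, assuming the database has already been generated (e.g. via `dummy_data.py`) at the default `./marketplace.db` path and using column names from `backend/schema.yaml`:

```python
import sqlite3

# Connect to the generated marketplace database (default path per backend/config.py)
conn = sqlite3.connect("marketplace.db")
conn.row_factory = sqlite3.Row  # access columns by name

# List featured apps, highest-rated first
rows = conn.execute(
    "SELECT name, category, rating FROM apps WHERE featured = 1 ORDER BY rating DESC"
).fetchall()

for row in rows:
    print(f"{row['name']} ({row['category']}) - rating {row['rating']}")

conn.close()
```
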
40 | ## API Endpoints
41 |
42 | - `GET /api/apps` - List apps with filters
43 | - `GET /api/articles` - List articles
44 | - `GET /api/categories` - Get all categories
45 | - `GET /api/sponsors` - Get active sponsors
46 | - `GET /api/search?q=query` - Search across content
47 | - `GET /api/stats` - Marketplace statistics
48 |
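As a quick sanity check of the endpoints above, here is a minimal client sketch. It assumes the backend is running locally on port 8100 (as in the setup section) and uses the `requests` package, which is not part of the backend requirements:

```python
import requests

BASE = "http://localhost:8100"  # assumed local dev URL from the setup section

# List apps and run a search; response shapes depend on the server implementation
apps = requests.get(f"{BASE}/api/apps", timeout=10).json()
results = requests.get(f"{BASE}/api/search", params={"q": "crawler"}, timeout=10).json()

print(apps)
print(results)
```
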
49 | ## Features
50 |
51 | - **Smart caching**: LocalStorage with TTL (1 hour)
52 | - **Terminal theme**: Consistent with Crawl4AI branding
53 | - **Responsive design**: Works on all devices
54 | - **Fast search**: Debounced with 300ms delay
55 | - **CORS protected**: Only crawl4ai.com and localhost
56 |
57 | ## Admin Panel
58 |
59 | Coming soon. For now, edit the database directly or modify `dummy_data.py`.
60 |
61 | ## Deployment
62 |
63 | For production deployment on EC2:
64 | 1. Update `API_BASE` in `marketplace.js` to production URL
65 | 2. Run FastAPI with proper production settings (use gunicorn/uvicorn)
66 | 3. Set up nginx proxy if needed
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/.env.example:
--------------------------------------------------------------------------------
1 | # Marketplace Configuration
2 | # Copy this to .env and update with your values
3 |
4 | # Admin password (required)
5 | MARKETPLACE_ADMIN_PASSWORD=change_this_password
6 |
7 | # JWT secret key (required) - generate with: python3 -c "import secrets; print(secrets.token_urlsafe(32))"
8 | MARKETPLACE_JWT_SECRET=change_this_to_a_secure_random_key
9 |
10 | # Database path (optional, defaults to ./marketplace.db)
11 | MARKETPLACE_DB_PATH=./marketplace.db
12 |
13 | # Token expiry in hours (optional, defaults to 4)
14 | MARKETPLACE_TOKEN_EXPIRY=4
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Marketplace Configuration - Loads from .env file
3 | """
4 | import os
5 | import sys
6 | import hashlib
7 | from pathlib import Path
8 | from dotenv import load_dotenv
9 |
10 | # Load .env file
11 | env_path = Path(__file__).parent / '.env'
12 | if not env_path.exists():
13 | print("\n❌ ERROR: No .env file found!")
14 | print("Please copy .env.example to .env and update with your values:")
15 | print(f" cp {Path(__file__).parent}/.env.example {Path(__file__).parent}/.env")
16 | print("\nThen edit .env with your secure values.")
17 | sys.exit(1)
18 |
19 | load_dotenv(env_path)
20 |
21 | # Required environment variables
22 | required_vars = ['MARKETPLACE_ADMIN_PASSWORD', 'MARKETPLACE_JWT_SECRET']
23 | missing_vars = [var for var in required_vars if not os.getenv(var)]
24 |
25 | if missing_vars:
26 | print(f"\n❌ ERROR: Missing required environment variables: {', '.join(missing_vars)}")
27 | print("Please check your .env file and ensure all required variables are set.")
28 | sys.exit(1)
29 |
30 | class Config:
31 | """Configuration loaded from environment variables"""
32 |
33 | # Admin authentication - hashed from password in .env
34 | ADMIN_PASSWORD_HASH = hashlib.sha256(
35 | os.getenv('MARKETPLACE_ADMIN_PASSWORD').encode()
36 | ).hexdigest()
37 |
38 | # JWT secret for token generation
39 | JWT_SECRET_KEY = os.getenv('MARKETPLACE_JWT_SECRET')
40 |
41 | # Database path
42 | DATABASE_PATH = os.getenv('MARKETPLACE_DB_PATH', './marketplace.db')
43 |
44 | # Token expiry in hours
45 | TOKEN_EXPIRY_HOURS = int(os.getenv('MARKETPLACE_TOKEN_EXPIRY', '4'))
46 |
47 | # CORS origins - hardcoded as they don't contain secrets
48 | ALLOWED_ORIGINS = [
49 | "http://localhost:8000",
50 | "http://localhost:8080",
51 | "http://localhost:8100",
52 | "http://127.0.0.1:8000",
53 | "http://127.0.0.1:8080",
54 | "http://127.0.0.1:8100",
55 | "https://crawl4ai.com",
56 | "https://www.crawl4ai.com",
57 | "https://docs.crawl4ai.com",
58 | "https://market.crawl4ai.com"
59 | ]
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | uvicorn
3 | pyyaml
4 | python-multipart
5 | python-dotenv
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/schema.yaml:
--------------------------------------------------------------------------------
1 | database:
2 | name: marketplace.db
3 |
4 | tables:
5 | apps:
6 | columns:
7 | id: {type: INTEGER, primary: true, autoincrement: true}
8 | name: {type: TEXT, required: true}
9 | slug: {type: TEXT, unique: true}
10 | description: {type: TEXT}
11 | long_description: {type: TEXT}
12 | logo_url: {type: TEXT}
13 | image: {type: TEXT}
14 | screenshots: {type: JSON, default: '[]'}
15 | category: {type: TEXT}
16 | type: {type: TEXT, default: 'Open Source'}
17 | status: {type: TEXT, default: 'Active'}
18 | website_url: {type: TEXT}
19 | github_url: {type: TEXT}
20 | demo_url: {type: TEXT}
21 | video_url: {type: TEXT}
22 | documentation_url: {type: TEXT}
23 | support_url: {type: TEXT}
24 | discord_url: {type: TEXT}
25 | pricing: {type: TEXT}
26 | rating: {type: REAL, default: 0.0}
27 | downloads: {type: INTEGER, default: 0}
28 | featured: {type: BOOLEAN, default: 0}
29 | sponsored: {type: BOOLEAN, default: 0}
30 | integration_guide: {type: TEXT}
31 | documentation: {type: TEXT}
32 | examples: {type: TEXT}
33 | installation_command: {type: TEXT}
34 | requirements: {type: TEXT}
35 | changelog: {type: TEXT}
36 | tags: {type: JSON, default: '[]'}
37 | added_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
38 | updated_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
39 | contact_email: {type: TEXT}
40 | views: {type: INTEGER, default: 0}
41 |
42 | articles:
43 | columns:
44 | id: {type: INTEGER, primary: true, autoincrement: true}
45 | title: {type: TEXT, required: true}
46 | slug: {type: TEXT, unique: true}
47 | content: {type: TEXT}
48 | author: {type: TEXT, default: 'Crawl4AI Team'}
49 | category: {type: TEXT}
50 | related_apps: {type: JSON, default: '[]'}
51 | featured_image: {type: TEXT}
52 | published_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
53 | tags: {type: JSON, default: '[]'}
54 | views: {type: INTEGER, default: 0}
55 |
56 | categories:
57 | columns:
58 | id: {type: INTEGER, primary: true, autoincrement: true}
59 | name: {type: TEXT, unique: true}
60 | slug: {type: TEXT, unique: true}
61 | icon: {type: TEXT}
62 | description: {type: TEXT}
63 | order_index: {type: INTEGER, default: 0}
64 |
65 | sponsors:
66 | columns:
67 | id: {type: INTEGER, primary: true, autoincrement: true}
68 | company_name: {type: TEXT, required: true}
69 | logo_url: {type: TEXT}
70 | tier: {type: TEXT, default: 'Bronze'}
71 | banner_url: {type: TEXT}
72 | landing_url: {type: TEXT}
73 | active: {type: BOOLEAN, default: 1}
74 | start_date: {type: DATETIME}
75 | end_date: {type: DATETIME}
--------------------------------------------------------------------------------
/docs/md_v2/marketplace/backend/uploads/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/docs/md_v2/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% set extra_html_attrs = 'data-theme="dark"' %}
2 | {% extends "base.html" %}
3 |
4 | {% block extrahead %}
5 | {{ super() }}
6 | <script>
7 | document.documentElement.setAttribute("data-theme", "dark");
8 | </script>
9 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
10 |
11 | <style>
12 | :root {
13 | /* brand */
14 | --feedback-primary-color: #09b5a5;
15 | --feedback-highlight-color: #fed500;
16 |
17 |
18 | /* align with the value you really use in :root */
19 | --header-height: 65px;
20 |
21 | /* Push modal content down */
22 | --feedback-modal-content-position-top: var(--header-height);
23 |
24 | --feedback-modal-modal-wrapper-z-index: 1100;
25 | /* > header’s 1000 */
26 | --feedback-modal-content-z-index: 1101;
27 | }
28 |
29 | feedback-modal::part(overlay) {
30 | top: var(--header-height);
31 | /* start below header */
32 | height: calc(100vh - var(--header-height));
33 | /* fill the rest */
34 |
35 |
36 | }
37 | </style>
38 | <script type="module"
39 | src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
40 | {% endblock %}
41 |
42 | {% block footer %}
43 | <feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right" modal-position="sidebar-right">
44 |
45 | Feedback
46 | </feedback-button>
47 | {% endblock %}
--------------------------------------------------------------------------------
/docs/tutorials/coming_soon.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/docs/tutorials/coming_soon.md
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Note: These requirements are also specified in pyproject.toml
2 | # This file is kept for development environment setup and compatibility
3 | aiofiles>=24.1.0
4 | aiohttp>=3.11.11
5 | aiosqlite~=0.20
6 | anyio>=4.0.0
7 | lxml~=5.3
8 | litellm>=1.53.1
9 | numpy>=1.26.0,<3
10 | pillow>=10.4
11 | playwright>=1.49.0
12 | patchright>=1.49.0
13 | python-dotenv~=1.0
14 | requests~=2.26
15 | beautifulsoup4~=4.12
16 | tf-playwright-stealth>=1.1.0
17 | xxhash~=3.4
18 | rank-bm25~=0.2
19 | colorama~=0.4
20 | snowballstemmer~=2.2
21 | pydantic>=2.10
22 | pyOpenSSL>=24.3.0
23 | psutil>=6.1.1
24 | PyYAML>=6.0
25 | nltk>=3.9.1
26 | rich>=13.9.4
27 | cssselect>=1.2.0
28 | chardet>=5.2.0
29 | brotli>=1.1.0
30 | httpx[http2]>=0.27.2
31 | alphashape>=1.3.1
32 | shapely>=2.0.0
33 |
34 | fake-useragent>=2.2.0
35 | pdf2image>=1.17.0
36 | PyPDF2>=3.0.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [options]
2 | include_package_data = True
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import os
3 | from pathlib import Path
4 | import shutil
5 |
6 | # Note: Most configuration is now in pyproject.toml
7 | # This setup.py is kept for backwards compatibility
8 |
9 | # Create the .crawl4ai folder in the user's home directory if it doesn't exist
10 | # If the folder already exists, remove the cache folder
11 | base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
12 | crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
13 | crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
14 | cache_folder = crawl4ai_folder / "cache"
15 | content_folders = [
16 | "html_content",
17 | "cleaned_html",
18 | "markdown_content",
19 | "extracted_content",
20 | "screenshots",
21 | ]
22 |
23 | # Clean up old cache if exists
24 | if cache_folder.exists():
25 | shutil.rmtree(cache_folder)
26 |
27 | # Create new folder structure
28 | crawl4ai_folder.mkdir(exist_ok=True)
29 | cache_folder.mkdir(exist_ok=True)
30 | for folder in content_folders:
31 | (crawl4ai_folder / folder).mkdir(exist_ok=True)
32 |
33 | version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version
34 | try:
35 | with open("crawl4ai/__version__.py") as f:
36 | for line in f:
37 | if line.startswith("__version__"):
38 | version = line.split("=")[1].strip().strip('"')
39 | break
40 | except Exception:
41 | pass # Let pyproject.toml handle version
42 |
43 | setup(
44 | name="Crawl4AI",
45 | version=version,
46 | description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
47 | long_description=open("README.md", encoding="utf-8").read(),
48 | long_description_content_type="text/markdown",
49 | url="https://github.com/unclecode/crawl4ai",
50 | author="Unclecode",
51 | author_email="unclecode@kidocode.com",
52 | license="Apache-2.0",
53 | packages=find_packages(),
54 | package_data={"crawl4ai": ["js_snippet/*.js"]},
55 | classifiers=[
56 | "Development Status :: 3 - Alpha",
57 | "Intended Audience :: Developers",
58 | "Programming Language :: Python :: 3",
59 | "Programming Language :: Python :: 3.10",
60 | "Programming Language :: Python :: 3.11",
61 | "Programming Language :: Python :: 3.12",
62 | "Programming Language :: Python :: 3.13",
63 | ],
64 | python_requires=">=3.10",
65 | )
66 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unclecode/crawl4ai/40173eeb7374dd5d3ab84b355b28e88d43703ee0/tests/__init__.py
--------------------------------------------------------------------------------
/tests/async/test_crawler_strategy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pytest
4 |
5 | # Add the parent directory to the Python path
6 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7 | sys.path.append(parent_dir)
8 |
9 | from crawl4ai.async_webcrawler import AsyncWebCrawler
10 |
11 |
12 | @pytest.mark.asyncio
13 | async def test_custom_user_agent():
14 | async with AsyncWebCrawler(verbose=True) as crawler:
15 | custom_user_agent = "MyCustomUserAgent/1.0"
16 | crawler.crawler_strategy.update_user_agent(custom_user_agent)
17 | url = "https://httpbin.org/user-agent"
18 | result = await crawler.arun(url=url, bypass_cache=True)
19 | assert result.success
20 | assert custom_user_agent in result.html
21 |
22 |
23 | @pytest.mark.asyncio
24 | async def test_custom_headers():
25 | async with AsyncWebCrawler(verbose=True) as crawler:
26 | custom_headers = {"X-Test-Header": "TestValue"}
27 | crawler.crawler_strategy.set_custom_headers(custom_headers)
28 | url = "https://httpbin.org/headers"
29 | result = await crawler.arun(url=url, bypass_cache=True)
30 | assert result.success
31 | assert "X-Test-Header" in result.html
32 | assert "TestValue" in result.html
33 |
34 |
35 | @pytest.mark.asyncio
36 | async def test_javascript_execution():
37 | async with AsyncWebCrawler(verbose=True) as crawler:
38 | js_code = "document.body.innerHTML = '<h1>Modified by JS</h1>';"
39 | url = "https://www.example.com"
40 | result = await crawler.arun(url=url, bypass_cache=True, js_code=js_code)
41 | assert result.success
42 | assert "<h1>Modified by JS</h1>" in result.html
43 |
44 |
45 | @pytest.mark.asyncio
46 | async def test_hook_execution():
47 | async with AsyncWebCrawler(verbose=True) as crawler:
48 |
49 | async def test_hook(page):
50 | await page.evaluate("document.body.style.backgroundColor = 'red';")
51 | return page
52 |
53 | crawler.crawler_strategy.set_hook("after_goto", test_hook)
54 | url = "https://www.example.com"
55 | result = await crawler.arun(url=url, bypass_cache=True)
56 | assert result.success
57 | assert "background-color: red" in result.html
58 |
59 |
60 | @pytest.mark.asyncio
61 | async def test_screenshot():
62 | async with AsyncWebCrawler(verbose=True) as crawler:
63 | url = "https://www.example.com"
64 | result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
65 | assert result.success
66 | assert result.screenshot
67 | assert isinstance(result.screenshot, str)
68 | assert len(result.screenshot) > 0
69 |
70 |
71 | # Entry point for debugging
72 | if __name__ == "__main__":
73 | pytest.main([__file__, "-v"])
74 |
--------------------------------------------------------------------------------
/tests/async/test_performance.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pytest
4 | import time
5 |
6 | # Add the parent directory to the Python path
7 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
8 | sys.path.append(parent_dir)
9 |
10 | from crawl4ai.async_webcrawler import AsyncWebCrawler
11 |
12 |
13 | @pytest.mark.asyncio
14 | async def test_crawl_speed():
15 | async with AsyncWebCrawler(verbose=True) as crawler:
16 | url = "https://www.nbcnews.com/business"
17 | start_time = time.time()
18 | result = await crawler.arun(url=url, bypass_cache=True)
19 | end_time = time.time()
20 |
21 | assert result.success
22 | crawl_time = end_time - start_time
23 | print(f"Crawl time: {crawl_time:.2f} seconds")
24 |
25 | assert crawl_time < 10, f"Crawl took too long: {crawl_time:.2f} seconds"
26 |
27 |
28 | @pytest.mark.asyncio
29 | async def test_concurrent_crawling_performance():
30 | async with AsyncWebCrawler(verbose=True) as crawler:
31 | urls = [
32 | "https://www.nbcnews.com/business",
33 | "https://www.example.com",
34 | "https://www.python.org",
35 | "https://www.github.com",
36 | "https://www.stackoverflow.com",
37 | ]
38 |
39 | start_time = time.time()
40 | results = await crawler.arun_many(urls=urls, bypass_cache=True)
41 | end_time = time.time()
42 |
43 | total_time = end_time - start_time
44 | print(f"Total time for concurrent crawling: {total_time:.2f} seconds")
45 |
46 | assert all(result.success for result in results)
47 | assert len(results) == len(urls)
48 |
49 | assert (
50 | total_time < len(urls) * 5
51 | ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
52 |
53 |
54 | @pytest.mark.asyncio
55 | async def test_crawl_speed_with_caching():
56 | async with AsyncWebCrawler(verbose=True) as crawler:
57 | url = "https://www.nbcnews.com/business"
58 |
59 | start_time = time.time()
60 | result1 = await crawler.arun(url=url, bypass_cache=True)
61 | end_time = time.time()
62 | first_crawl_time = end_time - start_time
63 |
64 | start_time = time.time()
65 | result2 = await crawler.arun(url=url, bypass_cache=False)
66 | end_time = time.time()
67 | second_crawl_time = end_time - start_time
68 |
69 | assert result1.success and result2.success
70 | print(f"First crawl time: {first_crawl_time:.2f} seconds")
71 | print(f"Second crawl time (cached): {second_crawl_time:.2f} seconds")
72 |
73 | assert (
74 | second_crawl_time < first_crawl_time / 2
75 | ), "Cached crawl not significantly faster"
76 |
77 |
78 | if __name__ == "__main__":
79 | pytest.main([__file__, "-v"])
80 |
--------------------------------------------------------------------------------
/tests/browser/docker/__init__.py:
--------------------------------------------------------------------------------
1 | """Docker browser strategy tests.
2 |
3 | This package contains tests for the Docker browser strategy implementation.
4 | """
--------------------------------------------------------------------------------
/tests/browser/test_combined.py:
--------------------------------------------------------------------------------
1 | """Combined test runner for all browser module tests.
2 |
3 | This script runs all the browser module tests in sequence and
4 | provides a comprehensive summary.
5 | """
6 |
7 | import asyncio
8 | import os
9 | import sys
10 | import time
11 |
12 | # Add the project root to Python path if running directly
13 | if __name__ == "__main__":
14 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
15 |
16 | from crawl4ai.async_logger import AsyncLogger
17 |
18 | # Create a logger for clear terminal output
19 | logger = AsyncLogger(verbose=True, log_file=None)
20 |
21 | async def run_test_module(module_name, header):
22 | """Run all tests in a module and return results."""
23 | logger.info(f"\n{'-'*30}", tag="TEST")
24 | logger.info(f"RUNNING: {header}", tag="TEST")
25 | logger.info(f"{'-'*30}", tag="TEST")
26 |
27 | # Import the module dynamically
28 | module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
29 |
30 | # Track time for performance measurement
31 | start_time = time.time()
32 |
33 | # Run the tests
34 | await module.run_tests()
35 |
36 | # Calculate time taken
37 | time_taken = time.time() - start_time
38 | logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
39 |
40 | return time_taken
41 |
42 | async def main():
43 | """Run all test modules."""
44 | logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
45 |
46 | # List of test modules to run
47 | test_modules = [
48 | ("test_browser_manager", "Browser Manager Tests"),
49 | ("test_playwright_strategy", "Playwright Strategy Tests"),
50 | ("test_cdp_strategy", "CDP Strategy Tests"),
51 | ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
52 | ("test_profiles", "Profile Management Tests")
53 | ]
54 |
55 | # Run each test module
56 | timings = {}
57 | for module_name, header in test_modules:
58 | try:
59 | time_taken = await run_test_module(module_name, header)
60 | timings[module_name] = time_taken
61 | except Exception as e:
62 | logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
63 |
64 | # Print summary
65 | logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
66 | logger.info(f"{'-'*50}", tag="SUMMARY")
67 | for module_name, header in test_modules:
68 | if module_name in timings:
69 | logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
70 | else:
71 | logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
72 | logger.info(f"{'-'*50}", tag="SUMMARY")
73 | total_time = sum(timings.values())
74 | logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
75 |
76 | if __name__ == "__main__":
77 | asyncio.run(main())
78 |
--------------------------------------------------------------------------------
/tests/browser/test_launch_standalone.py:
--------------------------------------------------------------------------------
1 | from crawl4ai.browser_profiler import BrowserProfiler
2 | import asyncio
3 |
4 |
5 | if __name__ == "__main__":
6 | # Test launching a standalone browser
7 | async def test_standalone_browser():
8 | profiler = BrowserProfiler()
9 | cdp_url = await profiler.launch_standalone_browser(
10 | browser_type="chromium",
11 | user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
12 | debugging_port=9222,
13 | headless=False
14 | )
15 | print(f"CDP URL: {cdp_url}")
16 |
17 | asyncio.run(test_standalone_browser())
--------------------------------------------------------------------------------
/tests/docker/test_dockerclient.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai.docker_client import Crawl4aiDockerClient
3 | from crawl4ai import (
4 | BrowserConfig,
5 | CrawlerRunConfig
6 | )
7 |
8 | async def main():
9 | async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
10 | await client.authenticate("test@example.com")
11 |
12 | # Non-streaming crawl
13 | results = await client.crawl(
14 | ["https://example.com", "https://python.org"],
15 | browser_config=BrowserConfig(headless=True),
16 | crawler_config=CrawlerRunConfig()
17 | )
18 | print(f"Non-streaming results: {results}")
19 |
20 | # Streaming crawl
21 | crawler_config = CrawlerRunConfig(stream=True)
22 | async for result in await client.crawl(
23 | ["https://example.com", "https://python.org"],
24 | browser_config=BrowserConfig(headless=True),
25 | crawler_config=crawler_config
26 | ):
27 | print(f"Streamed result: {result}")
28 |
29 | # Get schema
30 | schema = await client.get_schema()
31 | print(f"Schema: {schema}")
32 |
33 | if __name__ == "__main__":
34 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import (
3 | AsyncWebCrawler,
4 | CrawlerRunConfig,
5 | HTTPCrawlerConfig,
6 | CacheMode,
7 | DefaultMarkdownGenerator,
8 | PruningContentFilter
9 | )
10 | from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
11 | from crawl4ai.async_logger import AsyncLogger
12 |
13 | async def main():
14 | # Initialize HTTP crawler strategy
15 | http_strategy = AsyncHTTPCrawlerStrategy(
16 | browser_config=HTTPCrawlerConfig(
17 | method="GET",
18 | verify_ssl=True,
19 | follow_redirects=True
20 | ),
21 | logger=AsyncLogger(verbose=True)
22 | )
23 |
24 | # Initialize web crawler with HTTP strategy
25 | async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
26 | crawler_config = CrawlerRunConfig(
27 | cache_mode=CacheMode.BYPASS,
28 | markdown_generator=DefaultMarkdownGenerator(
29 | content_filter=PruningContentFilter(
30 | threshold=0.48,
31 | threshold_type="fixed",
32 | min_word_threshold=0
33 | )
34 | )
35 | )
36 |
37 | # Test different URLs
38 | urls = [
39 | "https://example.com",
40 | "https://httpbin.org/get",
41 | "raw://<html><body>Test content</body></html>"
42 | ]
43 |
44 | for url in urls:
45 | print(f"\n=== Testing {url} ===")
46 | try:
47 | result = await crawler.arun(url=url, config=crawler_config)
48 | print(f"Status: {result.status_code}")
49 | print(f"Raw HTML length: {len(result.html)}")
50 | if hasattr(result, 'markdown'):
51 | print(f"Markdown length: {len(result.markdown.raw_markdown)}")
52 | except Exception as e:
53 | print(f"Error: {e}")
54 |
55 | if __name__ == "__main__":
56 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/general/test_advanced_deep_crawl.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import time
3 |
4 |
5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
6 | from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
8 | from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
9 | from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
10 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
11 |
12 |
13 | async def main():
14 | """Example deep crawl of documentation site."""
15 | filter_chain = FilterChain([
16 | URLPatternFilter(patterns=["*2025*"]),
17 | DomainFilter(allowed_domains=["techcrunch.com"]),
18 | ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
19 | ContentTypeFilter(allowed_types=["text/html","application/javascript"])
20 | ])
21 | config = CrawlerRunConfig(
22 | deep_crawl_strategy = BestFirstCrawlingStrategy(
23 | max_depth=2,
24 | include_external=False,
25 | filter_chain=filter_chain,
26 | url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
27 | ),
28 | stream=False,
29 | verbose=True,
30 | cache_mode=CacheMode.BYPASS,
31 | scraping_strategy=LXMLWebScrapingStrategy()
32 | )
33 |
34 | async with AsyncWebCrawler() as crawler:
35 | print("Starting deep crawl in streaming mode:")
36 | config.stream = True
37 | start_time = time.perf_counter()
38 | async for result in await crawler.arun(
39 | url="https://techcrunch.com",
40 | config=config
41 | ):
42 | print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
43 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
44 |
45 | if __name__ == "__main__":
46 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/general/test_crawlers.py:
--------------------------------------------------------------------------------
1 |
2 | # example_usage.py
3 | import asyncio
4 | from crawl4ai.crawlers import get_crawler
5 |
6 | async def main():
7 | # Get the registered crawler
8 | example_crawler = get_crawler("example_site.content")
9 |
10 | # Crawl example.com
11 | result = await example_crawler(url="https://example.com")
12 |
13 | print(result)
14 |
15 |
16 | if __name__ == "__main__":
17 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/general/test_deep_crawl.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import time
3 |
4 |
5 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
6 | from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
8 | # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
9 |
10 |
11 | async def main():
12 | """Example deep crawl of documentation site."""
13 | config = CrawlerRunConfig(
14 | deep_crawl_strategy = BFSDeepCrawlStrategy(
15 | max_depth=2,
16 | include_external=False
17 | ),
18 | stream=False,
19 | verbose=True,
20 | cache_mode=CacheMode.BYPASS,
21 | scraping_strategy=LXMLWebScrapingStrategy()
22 | )
23 |
24 | async with AsyncWebCrawler() as crawler:
25 | start_time = time.perf_counter()
26 | print("\nStarting deep crawl in batch mode:")
27 | results = await crawler.arun(
28 | url="https://docs.crawl4ai.com",
29 | config=config
30 | )
31 | print(f"Crawled {len(results)} pages")
32 | print(f"Example page: {results[0].url}")
33 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
34 |
35 | print("Starting deep crawl in streaming mode:")
36 | config.stream = True
37 | start_time = time.perf_counter()
38 | async for result in await crawler.arun(
39 | url="https://docs.crawl4ai.com",
40 | config=config
41 | ):
42 | print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
43 | print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
44 |
45 | if __name__ == "__main__":
46 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/general/test_download_file.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, BrowserConfig
3 | from pathlib import Path
4 | import os
5 |
6 | async def test_basic_download():
7 |
8 | # Custom folder (otherwise defaults to ~/.crawl4ai/downloads)
9 | downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
10 | os.makedirs(downloads_path, exist_ok=True)
11 | browser_config = BrowserConfig(
12 | accept_downloads=True,
13 | downloads_path=downloads_path
14 | )
15 | async with AsyncWebCrawler(config=browser_config) as crawler:
16 | run_config = CrawlerRunConfig(
17 | js_code="""
18 | const link = document.querySelector('a[href$=".exe"]');
19 | if (link) { link.click(); }
20 | """,
21 | delay_before_return_html=5
22 | )
23 | result = await crawler.arun("https://www.python.org/downloads/", config=run_config)
24 |
25 | if result.downloaded_files:
26 | print("Downloaded files:")
27 | for file_path in result.downloaded_files:
28 | print("•", file_path)
29 | else:
30 | print("No files downloaded.")
31 |
32 | if __name__ == "__main__":
33 | asyncio.run(test_basic_download())
34 |
--------------------------------------------------------------------------------
/tests/general/test_persistent_context.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | from crawl4ai.async_webcrawler import AsyncWebCrawler
4 | from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
5 |
6 | # Simple concurrency test for persistent context page creation
7 | # Usage: python tests/general/test_persistent_context.py
8 |
9 | URLS = [
10 | # "https://example.com",
11 | "https://httpbin.org/html",
12 | "https://www.python.org/",
13 | "https://www.rust-lang.org/",
14 | ]
15 |
16 | async def main():
17 | profile_dir = os.path.join(os.path.expanduser("~"), ".crawl4ai", "profiles", "test-persistent-profile")
18 | os.makedirs(profile_dir, exist_ok=True)
19 |
20 | browser_config = BrowserConfig(
21 | browser_type="chromium",
22 | headless=True,
23 | use_persistent_context=True,
24 | user_data_dir=profile_dir,
25 | use_managed_browser=True,
26 | verbose=True,
27 | )
28 |
29 | run_cfg = CrawlerRunConfig(
30 | cache_mode=CacheMode.BYPASS,
31 | stream=False,
32 | verbose=True,
33 | )
34 |
35 | async with AsyncWebCrawler(config=browser_config) as crawler:
36 | results = await crawler.arun_many(URLS, config=run_cfg)
37 | for r in results:
38 | print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
39 | # r = await crawler.arun(url=URLS[0], config=run_cfg)
40 | # print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
41 |
42 | if __name__ == "__main__":
43 | asyncio.run(main())
44 |
--------------------------------------------------------------------------------
/tests/general/test_stream.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | # append 2 parent directories to sys.path to import crawl4ai
3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
4 | sys.path.append(parent_dir)
5 | parent_parent_dir = os.path.dirname(parent_dir)
6 | sys.path.append(parent_parent_dir)
7 |
8 | import asyncio
9 | from crawl4ai import *
10 |
11 | async def test_crawler():
12 | # Setup configurations
13 | browser_config = BrowserConfig(headless=True, verbose=False)
14 | crawler_config = CrawlerRunConfig(
15 | cache_mode=CacheMode.BYPASS,
16 | markdown_generator=DefaultMarkdownGenerator(
17 | content_filter=PruningContentFilter(
18 | threshold=0.48,
19 | threshold_type="fixed",
20 | min_word_threshold=0
21 | )
22 | ),
23 | )
24 |
25 | # Test URLs - mix of different sites
26 | urls = [
27 | "http://example.com",
28 | "http://example.org",
29 | "http://example.net",
30 | ] * 10 # 15 total URLs
31 |
32 | async with AsyncWebCrawler(config=browser_config) as crawler:
33 | print("\n=== Testing Streaming Mode ===")
34 | async for result in await crawler.arun_many(
35 | urls=urls,
36 | config=crawler_config.clone(stream=True),
37 | ):
38 | print(f"Received result for: {result.url} - Success: {result.success}")
39 |
40 | print("\n=== Testing Batch Mode ===")
41 | results = await crawler.arun_many(
42 | urls=urls,
43 | config=crawler_config,
44 | )
45 | print(f"Received all {len(results)} results at once")
46 | for result in results:
47 | print(f"Batch result for: {result.url} - Success: {result.success}")
48 |
49 | if __name__ == "__main__":
50 | asyncio.run(test_crawler())
--------------------------------------------------------------------------------
/tests/general/test_stream_dispatch.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | # append 2 parent directories to sys.path to import crawl4ai
3 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
4 | sys.path.append(parent_dir)
5 | parent_parent_dir = os.path.dirname(parent_dir)
6 | sys.path.append(parent_parent_dir)
7 |
8 |
9 | import asyncio
10 | from typing import List
11 | from crawl4ai import *
12 | from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
13 |
14 | async def test_streaming():
15 | browser_config = BrowserConfig(headless=True, verbose=True)
16 | crawler_config = CrawlerRunConfig(
17 | cache_mode=CacheMode.BYPASS,
18 | markdown_generator=DefaultMarkdownGenerator(
19 | # content_filter=PruningContentFilter(
20 | # threshold=0.48,
21 | # threshold_type="fixed",
22 | # min_word_threshold=0
23 | # )
24 | ),
25 | )
26 |
27 | urls = ["http://example.com"] * 10
28 |
29 | async with AsyncWebCrawler(config=browser_config) as crawler:
30 | dispatcher = MemoryAdaptiveDispatcher(
31 | max_session_permit=5,
32 | check_interval=0.5
33 | )
34 |
35 | async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):
36 | print(f"Got result for {result.url} - Success: {result.result.success}")
37 |
38 | if __name__ == "__main__":
39 | asyncio.run(test_streaming())
--------------------------------------------------------------------------------
/tests/general/tets_robot.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai import *
3 |
4 | async def test_real_websites():
5 | print("\n=== Testing Real Website Robots.txt Compliance ===\n")
6 |
7 | browser_config = BrowserConfig(headless=True, verbose=True)
8 | async with AsyncWebCrawler(config=browser_config) as crawler:
9 |
10 | # Test cases with URLs
11 | test_cases = [
12 | # Public sites that should be allowed
13 | ("https://example.com", True), # Simple public site
14 | ("https://httpbin.org/get", True), # API endpoint
15 |
16 | # Sites with known strict robots.txt
17 | ("https://www.facebook.com/robots.txt", False), # Social media
18 | ("https://www.google.com/search", False), # Search pages
19 |
20 | # Edge cases
21 | ("https://api.github.com", True), # API service
22 | ("https://raw.githubusercontent.com", True), # Content delivery
23 |
24 | # Non-existent/error cases
25 | ("https://thisisnotarealwebsite.com", True), # Non-existent domain
26 | ("https://localhost:12345", True), # Invalid port
27 | ]
28 |
29 | for url, expected in test_cases:
30 | print(f"\nTesting: {url}")
31 | try:
32 | config = CrawlerRunConfig(
33 | cache_mode=CacheMode.BYPASS,
34 | check_robots_txt=True, # Enable robots.txt checking
35 | verbose=True
36 | )
37 |
38 | result = await crawler.arun(url=url, config=config)
39 | allowed = result.success and not result.error_message
40 |
41 | print(f"Expected: {'allowed' if expected else 'denied'}")
42 | print(f"Actual: {'allowed' if allowed else 'denied'}")
43 | print(f"Status Code: {result.status_code}")
44 | if result.error_message:
45 | print(f"Error: {result.error_message}")
46 |
47 | # Optional: Print robots.txt content if available
48 | if result.metadata and 'robots_txt' in result.metadata:
49 | print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")
50 |
51 | except Exception as e:
52 | print(f"Test failed with error: {str(e)}")
53 |
54 | async def main():
55 | try:
56 | await test_real_websites()
57 | except Exception as e:
58 | print(f"Test suite failed: {str(e)}")
59 | raise
60 |
61 | if __name__ == "__main__":
62 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/hub/test_simple.py:
--------------------------------------------------------------------------------
1 | # test.py
2 | from crawl4ai import CrawlerHub
3 | import json
4 |
5 | async def amazon_example():
6 | if (crawler_cls := CrawlerHub.get("amazon_product")) :
7 | crawler = crawler_cls()
8 | print(f"Crawler version: {crawler_cls.meta['version']}")
9 | print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
10 | print(await crawler.run("https://amazon.com/test"))
11 | else:
12 | print("Crawler not found!")
13 |
14 | async def google_example():
15 | # Get crawler dynamically
16 | crawler_cls = CrawlerHub.get("google_search")
17 | crawler = crawler_cls()
18 |
19 | # Text search
20 | text_results = await crawler.run(
21 | query="apple inc",
22 | search_type="text",
23 | schema_cache_path="/Users/unclecode/.crawl4ai"
24 | )
25 | print(json.dumps(json.loads(text_results), indent=4))
26 |
27 | # Image search
28 | # image_results = await crawler.run(query="apple inc", search_type="image")
29 | # print(image_results)
30 |
31 | if __name__ == "__main__":
32 | import asyncio
33 | # asyncio.run(amazon_example())
34 | asyncio.run(google_example())
--------------------------------------------------------------------------------
/tests/mcp/test_mcp_sse.py:
--------------------------------------------------------------------------------
1 | from mcp.client.sse import sse_client
2 | from mcp.client.session import ClientSession
3 |
4 | async def main():
5 | async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
6 | async with ClientSession(r, w) as sess:
7 | print(await sess.list_tools()) # now works
8 |
9 | if __name__ == "__main__":
10 | import asyncio
11 | asyncio.run(main())
12 |
--------------------------------------------------------------------------------
/tests/memory/cap_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
4 | """
5 |
6 | import asyncio, httpx, json, uuid, argparse
7 |
8 | API = "http://localhost:8020/crawl"
9 | URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
10 | CONCURRENT_CALLS = 20 # way above your cap
11 |
12 | payload_template = {
13 | "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
14 | "crawler_config": {
15 | "type": "CrawlerRunConfig",
16 | "params": {"cache_mode": "BYPASS", "verbose": False},
17 | }
18 | }
19 |
20 | async def one_call(client):
21 | payload = payload_template.copy()
22 | payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
23 | r = await client.post(API, json=payload)
24 | r.raise_for_status()
25 | return r.json()["server_peak_memory_mb"]
26 |
27 | async def main():
28 | async with httpx.AsyncClient(timeout=60) as client:
29 | tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
30 | mem_usages = await asyncio.gather(*tasks)
31 | print("Calls finished OK, server peaks reported:", mem_usages)
32 |
33 | if __name__ == "__main__":
34 | asyncio.run(main())
35 |
--------------------------------------------------------------------------------
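
The script above only proves that all calls complete and reports server-side memory peaks. A rough client-side signal that GLOBAL_SEM is actually throttling is the shape of the call durations: with N identical calls fired at once and a cap of K concurrent pages, durations should stratify into roughly N/K waves. A minimal companion sketch, reusing the endpoint and payload shape above (the wave interpretation assumes pages take roughly similar time):

    import asyncio, time, uuid, httpx

    API = "http://localhost:8020/crawl"
    N_CALLS = 20

    payload_template = {
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "verbose": False}},
    }

    async def timed_call(client):
        payload = dict(payload_template)
        payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
        start = time.monotonic()
        r = await client.post(API, json=payload)
        r.raise_for_status()
        return time.monotonic() - start

    async def main():
        async with httpx.AsyncClient(timeout=300) as client:
            durations = await asyncio.gather(*(timed_call(client) for _ in range(N_CALLS)))
        # Sorted durations make the "waves" easy to eyeball.
        for i, d in enumerate(sorted(durations), 1):
            print(f"{i:2d}: {d:6.2f}s")

    if __name__ == "__main__":
        asyncio.run(main())
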
/tests/memory/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas>=1.5.0
2 | matplotlib>=3.5.0
3 | seaborn>=0.12.0
4 | rich>=12.0.0
--------------------------------------------------------------------------------
/tests/memory/test_docker_config_gen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Quick sanity check for the /config/dump endpoint.
4 |
5 | Usage:
6 |     python test_docker_config_gen.py [http://localhost:11235]
7 |
8 | If the server isn’t running, start it first:
9 | uvicorn deploy.docker.server:app --port 8020
10 | """
11 |
12 | import sys, json, textwrap, requests
13 |
14 | # BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
15 | BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
16 | URL = f"{BASE.rstrip('/')}/config/dump"
17 |
18 | CASES = [
19 | # --- CrawlRunConfig variants ---
20 | "CrawlerRunConfig()",
21 | "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
22 | "CrawlerRunConfig(js_only=True, wait_until='networkidle')",
23 |
24 | # --- BrowserConfig variants ---
25 | "BrowserConfig()",
26 | "BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
27 | "BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
28 | ]
29 |
30 | for code in CASES:
31 | print("\n=== POST:", code)
32 | resp = requests.post(URL, json={"code": code}, timeout=15)
33 | if resp.ok:
34 | print(json.dumps(resp.json(), indent=2)[:400] + "...")
35 | else:
36 | print("ERROR", resp.status_code, resp.text[:200])
37 |
--------------------------------------------------------------------------------
/tests/profiler/test_create_profile.py:
--------------------------------------------------------------------------------
1 | from crawl4ai import BrowserProfiler
2 | import asyncio
3 |
4 |
5 | if __name__ == "__main__":
6 | # Example usage
7 | profiler = BrowserProfiler()
8 |
9 | # Create a new profile
10 | import os
11 | from pathlib import Path
12 | home_dir = Path.home()
13 | profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
14 |
15 | print(f"Profile created at: {profile_path}")
16 |
17 |
18 |
19 | # # Launch a standalone browser
20 | # asyncio.run(profiler.launch_standalone_browser())
21 |
22 | # # List profiles
23 | # profiles = profiler.list_profiles()
24 | # for profile in profiles:
25 | # print(f"Profile: {profile['name']}, Path: {profile['path']}")
26 |
27 | # # Delete a profile
28 | # success = profiler.delete_profile("my-profile")
29 | # if success:
30 | # print("Profile deleted successfully")
31 | # else:
32 | # print("Failed to delete profile")
--------------------------------------------------------------------------------
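
Once a profile has been created as above, it is typically handed to a crawl through the browser configuration. A minimal sketch, assuming BrowserConfig accepts user_data_dir and use_managed_browser for pointing the crawler at a persistent profile (adjust the parameter names if the API differs):

    import asyncio
    from pathlib import Path
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

    async def crawl_with_profile():
        profile_path = str(Path.home() / ".crawl4ai/profiles/test-profile")
        browser_config = BrowserConfig(
            headless=True,
            use_managed_browser=True,    # assumed parameter for persistent/managed profiles
            user_data_dir=profile_path,  # assumed parameter for the profile directory
        )
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(f"Success: {result.success}")

    if __name__ == "__main__":
        asyncio.run(crawl_with_profile())
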
/tests/profiler/test_keyboard_handle.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pytest
3 | import asyncio
4 | from unittest.mock import patch, MagicMock
5 | from crawl4ai.browser_profiler import BrowserProfiler
6 |
7 | @pytest.mark.asyncio
8 | @pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
9 | async def test_keyboard_input_handling():
10 | # Mock sequence of keystrokes: arrow key followed by 'q'
11 | mock_keys = [b'\x00K', b'q']
12 | mock_kbhit = MagicMock(side_effect=[True, True, False])
13 | mock_getch = MagicMock(side_effect=mock_keys)
14 |
15 | with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
16 | # profiler = BrowserProfiler()
17 | user_done_event = asyncio.Event()
18 |
19 | # Create a local async function to simulate the keyboard input handling
20 | async def test_listen_for_quit_command():
21 | if sys.platform == "win32":
22 | while True:
23 | try:
24 | if mock_kbhit():
25 | raw = mock_getch()
26 | try:
27 | key = raw.decode("utf-8")
28 | except UnicodeDecodeError:
29 | continue
30 |
31 | if len(key) != 1 or not key.isprintable():
32 | continue
33 |
34 | if key.lower() == "q":
35 | user_done_event.set()
36 | return
37 |
38 | await asyncio.sleep(0.1)
39 | except Exception as e:
40 | continue
41 |
42 | # Run the listener
43 | listener_task = asyncio.create_task(test_listen_for_quit_command())
44 |
45 | # Wait for the event to be set
46 | try:
47 | await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
48 | assert user_done_event.is_set()
49 | finally:
50 | if not listener_task.done():
51 | listener_task.cancel()
52 | try:
53 | await listener_task
54 | except asyncio.CancelledError:
55 | pass
--------------------------------------------------------------------------------
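
The test above exercises only the Windows (msvcrt) branch of the quit listener. An equivalent POSIX-side check could simulate the listener with select-based stdin polling; the sketch below is an illustrative stand-in, not the module's actual implementation:

    import asyncio
    import select
    import sys

    async def listen_for_quit_posix(user_done_event: asyncio.Event):
        """Poll stdin without blocking and set the event when 'q' is entered (POSIX only)."""
        while not user_done_event.is_set():
            # Zero timeout: returns immediately; we yield to the event loop instead of blocking.
            ready, _, _ = select.select([sys.stdin], [], [], 0)
            if ready:
                key = sys.stdin.read(1)
                if key.lower() == "q":
                    user_done_event.set()
                    return
            await asyncio.sleep(0.1)

A test for it could patch select.select and sys.stdin the same way msvcrt.kbhit and msvcrt.getch are patched above.
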
/tests/proxy/test_proxy_deprecation.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import pytest
4 |
5 | from crawl4ai.async_configs import BrowserConfig, ProxyConfig
6 |
7 |
8 | def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
9 | warnings.simplefilter("always", DeprecationWarning)
10 |
11 | proxy_str = "23.95.150.145:6114:username:password"
12 | with warnings.catch_warnings(record=True) as caught:
13 | cfg = BrowserConfig(proxy=proxy_str, headless=True)
14 |
15 | dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
16 | assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"
17 |
18 | assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
19 | assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
20 | assert cfg.proxy_config.username == "username"
21 | assert cfg.proxy_config.password == "password"
22 | assert cfg.proxy_config.server.startswith("http://")
23 | assert cfg.proxy_config.server.endswith(":6114")
24 |
25 |
26 | def test_browser_config_with_proxy_config_emits_no_deprecation():
27 | warnings.simplefilter("always", DeprecationWarning)
28 |
29 | with warnings.catch_warnings(record=True) as caught:
30 | cfg = BrowserConfig(
31 | headless=True,
32 | proxy_config={
33 | "server": "http://127.0.0.1:8080",
34 | "username": "u",
35 | "password": "p",
36 | },
37 | )
38 |
39 | dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
40 | assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"
41 | assert cfg.proxy is None
42 | assert isinstance(cfg.proxy_config, ProxyConfig)
43 |
--------------------------------------------------------------------------------
/tests/test_arun_many.py:
--------------------------------------------------------------------------------
1 | """
2 | Test example for multiple crawler configs feature
3 | """
4 | import asyncio
5 | import sys
6 | from pathlib import Path
7 |
8 | # Add parent directory to path for imports
9 | sys.path.insert(0, str(Path(__file__).parent.parent))
10 |
11 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
12 | from crawl4ai.processors.pdf import PDFContentScrapingStrategy
13 |
14 |
15 | async def test_run_many():
16 | default_config = CrawlerRunConfig(
17 | cache_mode=CacheMode.BYPASS,
18 | # scraping_strategy=PDFContentScrapingStrategy()
19 | )
20 |
21 | test_urls = [
22 | # "https://blog.python.org/", # Blog URL
23 | "https://www.python.org/", # Generic HTTPS page
24 | "https://www.kidocode.com/", # Generic HTTPS page
25 | "https://www.example.com/", # Generic HTTPS page
26 | # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
27 | ]
28 |
29 | async with AsyncWebCrawler() as crawler:
30 | # Single config - traditional usage still works
31 | print("Test 1: Single config (backwards compatible)")
32 | result = await crawler.arun_many(
33 | urls=test_urls[:2],
34 | config=default_config
35 | )
36 | print(f"Crawled {len(result)} URLs with single config\n")
37 | for item in result:
38 | print(f" {item.url} -> {item.status_code}")
39 |
40 |
41 | if __name__ == "__main__":
42 | asyncio.run(test_run_many())
43 |
--------------------------------------------------------------------------------
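
The docstring mentions the multiple-config feature, but only the single-config path is exercised here. A sketch of what the second case might look like, assuming arun_many accepts a list of configs and that CrawlerRunConfig exposes a url_matcher for routing URLs to a config (both names taken from the feature description and treated as assumptions):

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
    from crawl4ai.processors.pdf import PDFContentScrapingStrategy

    async def test_run_many_multi_config():
        pdf_config = CrawlerRunConfig(
            url_matcher="*.pdf",                        # assumed matcher pattern for PDF URLs
            scraping_strategy=PDFContentScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
        )
        default_config = CrawlerRunConfig(
            url_matcher="*",                            # assumed catch-all for everything else
            cache_mode=CacheMode.BYPASS,
        )
        urls = [
            "https://www.example.com/",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        ]
        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun_many(urls=urls, config=[pdf_config, default_config])
            for item in results:
                print(f"  {item.url} -> {item.status_code}")
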
/tests/test_cli_docs.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from crawl4ai.docs_manager import DocsManager
3 | from click.testing import CliRunner
4 | from crawl4ai.cli import cli
5 |
6 |
7 | def test_cli():
8 | """Test all CLI commands"""
9 | runner = CliRunner()
10 |
11 | print("\n1. Testing docs update...")
12 |     # Run the async docs fetch to completion before exercising the CLI
13 |     docs_manager = DocsManager()
14 |     asyncio.run(docs_manager.fetch_docs())
15 |
16 |
17 | # print("\n2. Testing listing...")
18 | # result = runner.invoke(cli, ['docs', 'list'])
19 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
20 | # print(result.output)
21 |
22 | # print("\n2. Testing index building...")
23 | # result = runner.invoke(cli, ['docs', 'index'])
24 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
25 | # print(f"Output: {result.output}")
26 |
27 | # print("\n3. Testing search...")
28 | # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index'])
29 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
30 | # print(f"First 200 chars: {result.output[:200]}...")
31 |
32 | # print("\n4. Testing combine with sections...")
33 | # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended'])
34 | # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
35 | # print(f"First 200 chars: {result.output[:200]}...")
36 |
37 | print("\n5. Testing combine all sections...")
38 | result = runner.invoke(cli, ["docs", "combine", "--mode", "condensed"])
39 | print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
40 | print(f"First 200 chars: {result.output[:200]}...")
41 |
42 |
43 | if __name__ == "__main__":
44 | test_cli()
45 |
--------------------------------------------------------------------------------
/tests/test_llmtxt.py:
--------------------------------------------------------------------------------
1 | from crawl4ai.llmtxt import AsyncLLMTextManager # Changed to AsyncLLMTextManager
2 | from crawl4ai.async_logger import AsyncLogger
3 | from pathlib import Path
4 | import asyncio
5 |
6 |
7 | async def main():
8 | current_file = Path(__file__).resolve()
9 | # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs"
10 | base_dir = current_file.parent.parent / "local/_docs/llm.txt"
11 | docs_dir = base_dir
12 |
13 | # Create directory if it doesn't exist
14 | docs_dir.mkdir(parents=True, exist_ok=True)
15 |
16 | # Initialize logger
17 | logger = AsyncLogger()
18 | # Updated initialization with default batching params
19 | # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2)
20 | manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2)
21 |
22 | # Let's first check what files we have
23 | print("\nAvailable files:")
24 | for f in docs_dir.glob("*.md"):
25 | print(f"- {f.name}")
26 |
27 | # Generate index files
28 | print("\nGenerating index files...")
29 | await manager.generate_index_files(
30 | force_generate_facts=False, clear_bm25_cache=False
31 | )
32 |
33 | # Test some relevant queries about Crawl4AI
34 | test_queries = [
35 | "How is using the `arun_many` method?",
36 | ]
37 |
38 | print("\nTesting search functionality:")
39 | for query in test_queries:
40 | print(f"\nQuery: {query}")
41 | results = manager.search(query, top_k=2)
42 | print(f"Results length: {len(results)} characters")
43 | if results:
44 | print(
45 | "First 200 chars of results:", results[:200].replace("\n", " "), "..."
46 | )
47 | else:
48 | print("No results found")
49 |
50 |
51 | if __name__ == "__main__":
52 | asyncio.run(main())
53 |
--------------------------------------------------------------------------------
/tests/test_memory_macos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Test script to verify macOS memory calculation accuracy."""
3 |
4 | import psutil
5 | import platform
6 | import time
7 | from crawl4ai.utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
8 |
9 |
10 | def test_memory_calculation():
11 | """Test and compare memory calculations."""
12 | print(f"Platform: {platform.system()}")
13 | print(f"Python version: {platform.python_version()}")
14 | print("-" * 60)
15 |
16 | # Get psutil's view
17 | vm = psutil.virtual_memory()
18 | psutil_percent = vm.percent
19 | psutil_available_gb = vm.available / (1024**3)
20 | total_gb = vm.total / (1024**3)
21 |
22 | # Get our corrected view
23 | true_percent = get_true_memory_usage_percent()
24 | true_available_gb = get_true_available_memory_gb()
25 | true_percent_calc, available_calc, total_calc = get_memory_stats()
26 |
27 | print("Memory Statistics Comparison:")
28 | print(f"Total Memory: {total_gb:.2f} GB")
29 | print()
30 |
31 | print("PSUtil (Standard) Calculation:")
32 | print(f" - Memory Used: {psutil_percent:.1f}%")
33 | print(f" - Available: {psutil_available_gb:.2f} GB")
34 | print()
35 |
36 | print("Platform-Aware Calculation:")
37 | print(f" - Memory Used: {true_percent:.1f}%")
38 | print(f" - Available: {true_available_gb:.2f} GB")
39 | print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
40 | print()
41 |
42 | # Show the impact on dispatcher behavior
43 | print("Impact on MemoryAdaptiveDispatcher:")
44 | thresholds = {
45 | "Normal": 90.0,
46 | "Critical": 95.0,
47 | "Recovery": 85.0
48 | }
49 |
50 | for name, threshold in thresholds.items():
51 | psutil_triggered = psutil_percent >= threshold
52 | true_triggered = true_percent >= threshold
53 | print(f" - {name} Threshold ({threshold}%):")
54 | print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
55 | print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
56 | if psutil_triggered != true_triggered:
57 | print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
58 | print()
59 |
60 | # Monitor for a few seconds
61 | print("Monitoring memory for 10 seconds...")
62 | for i in range(10):
63 | vm = psutil.virtual_memory()
64 | true_pct = get_true_memory_usage_percent()
65 | print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
66 | time.sleep(1)
67 | print("\n")
68 |
69 |
70 | if __name__ == "__main__":
71 | test_memory_calculation()
--------------------------------------------------------------------------------
/tests/test_scraping_strategy.py:
--------------------------------------------------------------------------------
1 | import nest_asyncio
2 |
3 | nest_asyncio.apply()
4 |
5 | import asyncio
6 | from crawl4ai import (
7 | AsyncWebCrawler,
8 | CrawlerRunConfig,
9 | LXMLWebScrapingStrategy,
10 | CacheMode,
11 | )
12 |
13 |
14 | async def main():
15 | config = CrawlerRunConfig(
16 | cache_mode=CacheMode.BYPASS,
17 | scraping_strategy=LXMLWebScrapingStrategy(), # Faster alternative to default BeautifulSoup
18 | )
19 | async with AsyncWebCrawler() as crawler:
20 | result = await crawler.arun(url="https://example.com", config=config)
21 | print(f"Success: {result.success}")
22 | print(f"Markdown length: {len(result.markdown.raw_markdown)}")
23 |
24 |
25 | if __name__ == "__main__":
26 | asyncio.run(main())
27 |
--------------------------------------------------------------------------------