├── .editorconfig ├── .github ├── CODEOWNERS ├── pull_request_template.md └── workflows │ ├── build_and_deploy_docs.yaml │ ├── check_pr_title.yaml │ ├── pre_release.yaml │ ├── release.yaml │ ├── run_code_checks.yaml │ ├── templates_e2e_tests.yaml │ └── update_new_issue.yaml ├── .gitignore ├── .markdownlint.yaml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── docs ├── deployment │ ├── apify_platform.mdx │ ├── code_examples │ │ ├── apify │ │ │ ├── crawler_as_actor_example.py │ │ │ ├── get_public_url.py │ │ │ ├── log_with_config_example.py │ │ │ ├── proxy_advanced_example.py │ │ │ └── proxy_example.py │ │ └── google │ │ │ ├── cloud_run_example.py │ │ │ └── google_example.py │ ├── google_cloud.mdx │ └── google_cloud_run.mdx ├── examples │ ├── add_data_to_dataset.mdx │ ├── beautifulsoup_crawler.mdx │ ├── capture_screenshot_using_playwright.mdx │ ├── capturing_page_snapshots_with_error_snapshotter.mdx │ ├── code_examples │ │ ├── adaptive_playwright_crawler.py │ │ ├── add_data_to_dataset_bs.py │ │ ├── add_data_to_dataset_dataset.py │ │ ├── add_data_to_dataset_pw.py │ │ ├── beautifulsoup_crawler.py │ │ ├── beautifulsoup_crawler_keep_alive.py │ │ ├── beautifulsoup_crawler_stop.py │ │ ├── capture_screenshot_using_playwright.py │ │ ├── configure_json_logging.py │ │ ├── crawl_all_links_on_website_bs.py │ │ ├── crawl_all_links_on_website_pw.py │ │ ├── crawl_multiple_urls_bs.py │ │ ├── crawl_multiple_urls_pw.py │ │ ├── crawl_specific_links_on_website_bs.py │ │ ├── crawl_specific_links_on_website_pw.py │ │ ├── crawl_website_with_relative_links_all_links.py │ │ ├── crawl_website_with_relative_links_same_domain.py │ │ ├── crawl_website_with_relative_links_same_hostname.py │ │ ├── crawl_website_with_relative_links_same_origin.py │ │ ├── export_entire_dataset_to_file_csv.py │ │ ├── export_entire_dataset_to_file_json.py │ │ ├── extract_and_add_specific_links_on_website_bs.py │ │ ├── extract_and_add_specific_links_on_website_pw.py │ │ ├── fill_and_submit_web_form_crawler.py │ │ ├── fill_and_submit_web_form_request.py │ │ ├── parsel_crawler.py │ │ ├── parsel_crawler_with_error_snapshotter.py │ │ ├── playwright_block_requests.py │ │ ├── playwright_crawler.py │ │ ├── playwright_crawler_with_camoufox.py │ │ ├── playwright_crawler_with_error_snapshotter.py │ │ ├── playwright_crawler_with_fingerprint_generator.py │ │ ├── respect_robots_on_skipped_request.py │ │ ├── respect_robots_txt_file.py │ │ └── resuming_paused_crawl.py │ ├── crawl_all_links_on_website.mdx │ ├── crawl_multiple_urls.mdx │ ├── crawl_specific_links_on_website.mdx │ ├── crawl_website_with_relative_links.mdx │ ├── crawler_keep_alive.mdx │ ├── crawler_stop.mdx │ ├── export_entire_dataset_to_file.mdx │ ├── fill_and_submit_web_form.mdx │ ├── json_logging.mdx │ ├── parsel_crawler.mdx │ ├── playwright_crawler.mdx │ ├── playwright_crawler_adaptive.mdx │ ├── playwright_crawler_with_block_requests.mdx │ ├── playwright_crawler_with_camoufox.mdx │ ├── playwright_crawler_with_fingerprint_generator.mdx │ ├── respect_robots_txt_file.mdx │ └── resuming_paused_crawl.mdx ├── guides │ ├── avoid_blocking.mdx │ ├── code_examples │ │ ├── avoid_blocking │ │ │ ├── default_fingerprint_generator_with_args.py │ │ │ └── playwright_with_fingerprint_generator.py │ │ ├── error_handling │ │ │ ├── change_handle_error_status.py │ │ │ ├── disable_retry.py │ │ │ └── handle_proxy_error.py │ │ ├── http_clients │ │ │ ├── curl_impersonate_example.py │ │ │ └── httpx_example.py │ │ ├── login_crawler │ │ │ ├── http_login.py │ │ │ 
└── playwright_login.py │ │ ├── playwright_crawler │ │ │ ├── browser_configuration_example.py │ │ │ ├── multiple_launch_example.py │ │ │ ├── plugin_browser_configuration_example.py │ │ │ └── pre_navigation_hook_example.py │ │ ├── playwright_crawler_adaptive │ │ │ ├── handler.py │ │ │ ├── init_beautifulsoup.py │ │ │ ├── init_parsel.py │ │ │ ├── init_prediction.py │ │ │ └── pre_nav_hooks.py │ │ ├── proxy_management │ │ │ ├── inspecting_bs_example.py │ │ │ ├── inspecting_pw_example.py │ │ │ ├── integration_bs_example.py │ │ │ ├── integration_pw_example.py │ │ │ ├── quick_start_example.py │ │ │ ├── session_bs_example.py │ │ │ ├── session_pw_example.py │ │ │ ├── tiers_bs_example.py │ │ │ └── tiers_pw_example.py │ │ ├── request_loaders │ │ │ ├── rl_basic_example.py │ │ │ ├── tandem_example.py │ │ │ └── tandem_example_explicit.py │ │ ├── running_in_web_server │ │ │ ├── __init__.py │ │ │ ├── crawler.py │ │ │ └── server.py │ │ ├── scaling_crawlers │ │ │ ├── max_tasks_per_minute_example.py │ │ │ └── min_and_max_concurrency_example.py │ │ ├── session_management │ │ │ ├── multi_sessions_http.py │ │ │ ├── one_session_http.py │ │ │ ├── sm_basic.py │ │ │ ├── sm_beautifulsoup.py │ │ │ ├── sm_http.py │ │ │ ├── sm_parsel.py │ │ │ ├── sm_playwright.py │ │ │ └── sm_standalone.py │ │ └── storages │ │ │ ├── cleaning_do_not_purge_example.py │ │ │ ├── cleaning_purge_explicitly_example.py │ │ │ ├── dataset_basic_example.py │ │ │ ├── dataset_with_crawler_example.py │ │ │ ├── dataset_with_crawler_explicit_example.py │ │ │ ├── helper_add_requests_example.py │ │ │ ├── helper_enqueue_links_example.py │ │ │ ├── kvs_basic_example.py │ │ │ ├── kvs_with_crawler_example.py │ │ │ ├── kvs_with_crawler_explicit_example.py │ │ │ ├── rq_basic_example.py │ │ │ ├── rq_with_crawler_example.py │ │ │ └── rq_with_crawler_explicit_example.py │ ├── crawler_login.mdx │ ├── error_handling.mdx │ ├── http_clients.mdx │ ├── http_crawlers.mdx │ ├── playwright_crawler.mdx │ ├── playwright_crawler_adaptive.mdx │ ├── proxy_management.mdx │ ├── request_loaders.mdx │ ├── running_in_web_server.mdx │ ├── scaling_crawlers.mdx │ ├── session_management.mdx │ └── storages.mdx ├── introduction │ ├── 01_setting_up.mdx │ ├── 02_first_crawler.mdx │ ├── 03_adding_more_urls.mdx │ ├── 04_real_world_project.mdx │ ├── 05_crawling.mdx │ ├── 06_scraping.mdx │ ├── 07_saving_data.mdx │ ├── 08_refactoring.mdx │ ├── 09_running_in_cloud.mdx │ ├── code_examples │ │ ├── 02_bs.py │ │ ├── 02_bs_better.py │ │ ├── 02_request_queue.py │ │ ├── 03_enqueue_strategy.py │ │ ├── 03_finding_new_links.py │ │ ├── 03_globs.py │ │ ├── 03_original_code.py │ │ ├── 03_transform_request.py │ │ ├── 04_sanity_check.py │ │ ├── 05_crawling_detail.py │ │ ├── 05_crawling_listing.py │ │ ├── 06_scraping.py │ │ ├── 07_final_code.py │ │ ├── 07_first_code.py │ │ ├── 08_main.py │ │ ├── 08_routes.py │ │ ├── 09_apify_sdk.py │ │ ├── __init__.py │ │ └── routes.py │ └── index.mdx ├── pyproject.toml ├── quick-start │ ├── code_examples │ │ ├── beautifulsoup_crawler_example.py │ │ ├── parsel_crawler_example.py │ │ ├── playwright_crawler_example.py │ │ └── playwright_crawler_headful_example.py │ └── index.mdx └── upgrading │ └── upgrading_to_v0x.md ├── pyproject.toml ├── renovate.json ├── src └── crawlee │ ├── __init__.py │ ├── _autoscaling │ ├── __init__.py │ ├── _types.py │ ├── autoscaled_pool.py │ ├── py.typed │ ├── snapshotter.py │ └── system_status.py │ ├── _browserforge_workaround.py │ ├── _cli.py │ ├── _consts.py │ ├── _log_config.py │ ├── _request.py │ ├── _service_locator.py │ ├── _types.py │ ├── 
_utils │ ├── __init__.py │ ├── blocked.py │ ├── byte_size.py │ ├── console.py │ ├── context.py │ ├── crypto.py │ ├── data_processing.py │ ├── docs.py │ ├── file.py │ ├── globs.py │ ├── html_to_text.py │ ├── measure_time.py │ ├── models.py │ ├── recoverable_state.py │ ├── recurring_task.py │ ├── requests.py │ ├── robots.py │ ├── system.py │ ├── try_import.py │ ├── urls.py │ ├── wait.py │ └── web.py │ ├── browsers │ ├── __init__.py │ ├── _browser_controller.py │ ├── _browser_plugin.py │ ├── _browser_pool.py │ ├── _playwright_browser.py │ ├── _playwright_browser_controller.py │ ├── _playwright_browser_plugin.py │ ├── _types.py │ └── py.typed │ ├── configuration.py │ ├── crawlers │ ├── __init__.py │ ├── _abstract_http │ │ ├── __init__.py │ │ ├── _abstract_http_crawler.py │ │ ├── _abstract_http_parser.py │ │ ├── _http_crawling_context.py │ │ └── py.typed │ ├── _adaptive_playwright │ │ ├── __init__.py │ │ ├── _adaptive_playwright_crawler.py │ │ ├── _adaptive_playwright_crawler_statistics.py │ │ ├── _adaptive_playwright_crawling_context.py │ │ ├── _rendering_type_predictor.py │ │ └── _result_comparator.py │ ├── _basic │ │ ├── __init__.py │ │ ├── _basic_crawler.py │ │ ├── _basic_crawling_context.py │ │ ├── _context_pipeline.py │ │ ├── _logging_utils.py │ │ └── py.typed │ ├── _beautifulsoup │ │ ├── __init__.py │ │ ├── _beautifulsoup_crawler.py │ │ ├── _beautifulsoup_crawling_context.py │ │ ├── _beautifulsoup_parser.py │ │ ├── _utils.py │ │ └── py.typed │ ├── _http │ │ ├── __init__.py │ │ ├── _http_crawler.py │ │ └── _http_parser.py │ ├── _parsel │ │ ├── __init__.py │ │ ├── _parsel_crawler.py │ │ ├── _parsel_crawling_context.py │ │ ├── _parsel_parser.py │ │ └── _utils.py │ ├── _playwright │ │ ├── __init__.py │ │ ├── _playwright_crawler.py │ │ ├── _playwright_crawling_context.py │ │ ├── _playwright_http_client.py │ │ ├── _playwright_pre_nav_crawling_context.py │ │ ├── _types.py │ │ └── _utils.py │ ├── _types.py │ └── py.typed │ ├── errors.py │ ├── events │ ├── __init__.py │ ├── _event_manager.py │ ├── _local_event_manager.py │ ├── _types.py │ └── py.typed │ ├── fingerprint_suite │ ├── __init__.py │ ├── _browserforge_adapter.py │ ├── _consts.py │ ├── _fingerprint_generator.py │ ├── _header_generator.py │ ├── _types.py │ └── py.typed │ ├── http_clients │ ├── __init__.py │ ├── _base.py │ ├── _curl_impersonate.py │ └── _httpx.py │ ├── project_template │ ├── cookiecutter.json │ ├── hooks │ │ ├── post_gen_project.py │ │ └── pre_gen_project.py │ ├── templates │ │ ├── main.py │ │ ├── main_beautifulsoup.py │ │ ├── main_parsel.py │ │ ├── main_playwright.py │ │ ├── main_playwright_camoufox.py │ │ ├── routes_beautifulsoup.py │ │ ├── routes_camoufox.py │ │ ├── routes_parsel.py │ │ ├── routes_playwright.py │ │ └── routes_playwright_camoufox.py │ └── {{cookiecutter.project_name}} │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ └── {{cookiecutter.__package_name}} │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── main.py │ │ └── routes.py │ ├── proxy_configuration.py │ ├── py.typed │ ├── request_loaders │ ├── __init__.py │ ├── _request_list.py │ ├── _request_loader.py │ ├── _request_manager.py │ └── _request_manager_tandem.py │ ├── router.py │ ├── sessions │ ├── __init__.py │ ├── _cookies.py │ ├── _models.py │ ├── _session.py │ ├── _session_pool.py │ └── py.typed │ ├── statistics │ ├── __init__.py │ ├── _error_snapshotter.py │ ├── _error_tracker.py │ ├── _models.py │ └── _statistics.py │ ├── storage_clients │ ├── __init__.py │ ├── _base │ │ ├── __init__.py 
│ │ ├── _dataset_client.py │ │ ├── _dataset_collection_client.py │ │ ├── _key_value_store_client.py │ │ ├── _key_value_store_collection_client.py │ │ ├── _request_queue_client.py │ │ ├── _request_queue_collection_client.py │ │ ├── _storage_client.py │ │ ├── _types.py │ │ └── py.typed │ ├── _memory │ │ ├── __init__.py │ │ ├── _creation_management.py │ │ ├── _dataset_client.py │ │ ├── _dataset_collection_client.py │ │ ├── _key_value_store_client.py │ │ ├── _key_value_store_collection_client.py │ │ ├── _memory_storage_client.py │ │ ├── _request_queue_client.py │ │ ├── _request_queue_collection_client.py │ │ └── py.typed │ ├── models.py │ └── py.typed │ └── storages │ ├── __init__.py │ ├── _base.py │ ├── _creation_management.py │ ├── _dataset.py │ ├── _key_value_store.py │ ├── _request_queue.py │ └── py.typed ├── tests ├── __init__.py ├── e2e │ ├── __init__.py │ ├── conftest.py │ └── project_template │ │ ├── test_static_crawlers_templates.py │ │ └── utils.py └── unit │ ├── README.md │ ├── __init__.py │ ├── _autoscaling │ ├── test_autoscaled_pool.py │ ├── test_snapshotter.py │ └── test_system_status.py │ ├── _statistics │ ├── test_error_tracker.py │ ├── test_periodic_logging.py │ └── test_persistence.py │ ├── _utils │ ├── test_byte_size.py │ ├── test_console.py │ ├── test_crypto.py │ ├── test_data_processing.py │ ├── test_file.py │ ├── test_globs.py │ ├── test_html_to_text.py │ ├── test_measure_time.py │ ├── test_recurring_task.py │ ├── test_requests.py │ ├── test_robots.py │ ├── test_system.py │ ├── test_timedelata_ms.py │ └── test_urls.py │ ├── browsers │ ├── test_browser_pool.py │ ├── test_playwright_browser.py │ ├── test_playwright_browser_controller.py │ └── test_playwright_browser_plugin.py │ ├── conftest.py │ ├── crawlers │ ├── _adaptive_playwright │ │ ├── test_adaptive_playwright_crawler.py │ │ ├── test_adaptive_playwright_crawler_statistics.py │ │ ├── test_adaptive_playwright_crawling_context.py │ │ └── test_predictor.py │ ├── _basic │ │ ├── test_basic_crawler.py │ │ └── test_context_pipeline.py │ ├── _beautifulsoup │ │ └── test_beautifulsoup_crawler.py │ ├── _http │ │ └── test_http_crawler.py │ ├── _parsel │ │ └── test_parsel_crawler.py │ └── _playwright │ │ └── test_playwright_crawler.py │ ├── events │ ├── test_event_manager.py │ └── test_local_event_manager.py │ ├── fingerprint_suite │ ├── test_adapters.py │ └── test_header_generator.py │ ├── http_clients │ ├── test_curl_impersonate.py │ └── test_httpx.py │ ├── proxy_configuration │ ├── test_new_proxy_info.py │ └── test_tiers.py │ ├── request_loaders │ └── test_request_list.py │ ├── server.py │ ├── server_endpoints.py │ ├── sessions │ ├── test_cookies.py │ ├── test_models.py │ ├── test_session.py │ └── test_session_pool.py │ ├── storage_clients │ └── _memory │ │ ├── test_creation_management.py │ │ ├── test_dataset_client.py │ │ ├── test_dataset_collection_client.py │ │ ├── test_key_value_store_client.py │ │ ├── test_key_value_store_collection_client.py │ │ ├── test_memory_storage_client.py │ │ ├── test_memory_storage_e2e.py │ │ ├── test_request_queue_client.py │ │ └── test_request_queue_collection_client.py │ ├── storages │ ├── test_dataset.py │ ├── test_key_value_store.py │ ├── test_request_manager_tandem.py │ └── test_request_queue.py │ ├── test_cli.py │ ├── test_configuration.py │ ├── test_log_config.py │ ├── test_router.py │ └── test_service_locator.py ├── uv.lock └── website ├── .eslintrc.json ├── .yarnrc.yml ├── babel.config.js ├── build_api_reference.sh ├── docusaurus.config.js ├── generate_module_shortcuts.py ├── 
package.json ├── patches ├── @docusaurus+core+3.4.0.patch └── @docusaurus+core+3.5.2.patch ├── roa-loader ├── index.js └── package.json ├── sidebars.js ├── src ├── components │ ├── ApiLink.jsx │ ├── Button.jsx │ ├── Button.module.css │ ├── CopyButton.jsx │ ├── CopyButton.module.css │ ├── Gradients.jsx │ ├── Highlights.jsx │ ├── Highlights.module.css │ ├── Homepage │ │ ├── HomepageCliExample.jsx │ │ ├── HomepageCliExample.module.css │ │ ├── HomepageCtaSection.jsx │ │ ├── HomepageCtaSection.module.css │ │ ├── HomepageHeroSection.jsx │ │ ├── HomepageHeroSection.module.css │ │ ├── LanguageInfoWidget.jsx │ │ ├── LanguageInfoWidget.module.css │ │ ├── LanguageSwitch.jsx │ │ ├── LanguageSwitch.module.css │ │ ├── RiverSection.jsx │ │ ├── RiverSection.module.css │ │ ├── ThreeCardsWithIcon.jsx │ │ ├── ThreeCardsWithIcon.module.css │ │ ├── animated-crawlee-logo-dark.svg │ │ └── animated-crawlee-logo-light.svg │ ├── RunnableCodeBlock.jsx │ └── RunnableCodeBlock.module.css ├── css │ └── custom.css ├── pages │ ├── home_page_example.py │ ├── index.js │ └── index.module.css └── theme │ ├── ColorModeToggle │ ├── dark-mode-icon.svg │ ├── index.js │ ├── light-mode-icon.svg │ └── styles.module.css │ ├── DocItem │ └── Layout │ │ ├── index.js │ │ └── styles.module.css │ ├── Footer │ ├── LinkItem │ │ ├── index.js │ │ └── index.module.css │ ├── index.js │ └── index.module.css │ ├── MDXComponents │ └── A.js │ ├── Navbar │ ├── Content │ │ ├── index.js │ │ └── styles.module.css │ ├── Logo │ │ ├── index.js │ │ └── index.module.css │ └── MobileSidebar │ │ ├── Header │ │ ├── index.js │ │ └── index.module.css │ │ ├── Layout │ │ └── index.js │ │ ├── PrimaryMenu │ │ └── index.js │ │ └── index.js │ └── NavbarItem │ └── ComponentTypes.js ├── static ├── .nojekyll ├── font │ ├── lota.woff │ └── lota.woff2 ├── img │ ├── API.png │ ├── apify_logo.svg │ ├── apify_og_SDK.png │ ├── apify_sdk.svg │ ├── apify_sdk_white.svg │ ├── arrow_right.svg │ ├── auto-scaling-dark.webp │ ├── auto-scaling-light.webp │ ├── check.svg │ ├── chrome-scrape-dark.gif │ ├── chrome-scrape-light.gif │ ├── cloud_icon.svg │ ├── community-dark-icon.svg │ ├── community-light-icon.svg │ ├── crawlee-dark-new.svg │ ├── crawlee-dark.svg │ ├── crawlee-javascript-dark.svg │ ├── crawlee-javascript-light.svg │ ├── crawlee-light-new.svg │ ├── crawlee-light.svg │ ├── crawlee-logo-monocolor.svg │ ├── crawlee-logo.svg │ ├── crawlee-python-dark.svg │ ├── crawlee-python-light.svg │ ├── crawlee-python-og.png │ ├── defaults-dark-icon.svg │ ├── defaults-light-icon.svg │ ├── discord-brand-dark.svg │ ├── discord-brand.svg │ ├── docusaurus.svg │ ├── external-link.svg │ ├── favicon.ico │ ├── favorite-tools-dark.webp │ ├── favorite-tools-light.webp │ ├── features │ │ ├── auto-scaling.svg │ │ ├── automate-everything.svg │ │ ├── fingerprints.svg │ │ ├── node-requests.svg │ │ ├── runs-on-py.svg │ │ ├── storage.svg │ │ └── works-everywhere.svg │ ├── fill-and-submit-web-form │ │ ├── 00.jpg │ │ ├── 01.jpg │ │ ├── 02.jpg │ │ └── 03.jpg │ ├── getting-started │ │ ├── current-price.jpg │ │ ├── scraping-practice.jpg │ │ ├── select-an-element.jpg │ │ ├── selected-element.jpg │ │ ├── sku.jpg │ │ └── title.jpg │ ├── github-brand-dark.svg │ ├── github-brand.svg │ ├── hearth copy.svg │ ├── hearth.svg │ ├── javascript_logo.svg │ ├── js_file.svg │ ├── logo-big.svg │ ├── logo-blur.png │ ├── logo-blur.svg │ ├── logo-zoom.svg │ ├── menu-arrows.svg │ ├── oss_logo.png │ ├── puppeteer-live-view-dashboard.png │ ├── puppeteer-live-view-detail.png │ ├── queue-dark-icon.svg │ ├── queue-light-icon.svg │ ├── 
resuming-paused-crawl │ │ ├── 00.webp │ │ └── 01.webp │ ├── robot.png │ ├── routing-dark-icon.svg │ ├── routing-light-icon.svg │ ├── scraping-utils-dark-icon.svg │ ├── scraping-utils-light-icon.svg │ ├── smart-proxy-dark.webp │ ├── smart-proxy-light.webp │ ├── source_code.png │ ├── system.svg │ ├── triangles_dark.svg │ ├── triangles_light.svg │ ├── workflow.svg │ ├── zero-setup-dark-icon.svg │ └── zero-setup-light-icon.svg ├── js │ └── custom.js └── robots.txt ├── tools ├── docs-prettier.config.js ├── utils │ └── externalLink.js └── website_gif │ ├── chrome-scrape-dark.gif │ ├── chrome-scrape-dark.mp4 │ ├── chrome-scrape-light.gif │ ├── chrome-scrape-light.mp4 │ └── website_gif.mjs ├── tsconfig.eslint.json └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | end_of_line = lf 10 | 11 | [Makefile] 12 | indent_style = tab 13 | 14 | [{*.yaml, *.yml}] 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Documentation codeowner 2 | 3 | /docs/*.md @TC-MO 4 | /docs/*.mdx @TC-MO 5 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | - TODO 6 | 7 | ### Issues 8 | 9 | 10 | 11 | - Closes: #TODO 12 | 13 | ### Testing 14 | 15 | 16 | 17 | - TODO 18 | 19 | ### Checklist 20 | 21 | - [ ] CI passed 22 | -------------------------------------------------------------------------------- /.github/workflows/check_pr_title.yaml: -------------------------------------------------------------------------------- 1 | name: Check PR title 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, edited, synchronize] 6 | 7 | jobs: 8 | check_pr_title: 9 | name: Check PR title 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: amannn/action-semantic-pull-request@v5.5.3 13 | env: 14 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | -------------------------------------------------------------------------------- /.github/workflows/run_code_checks.yaml: -------------------------------------------------------------------------------- 1 | name: Run code checks 2 | 3 | on: 4 | # Trigger code checks on opening a new pull request. 5 | # Secrets are only made available to the integration tests job, with a manual approval 6 | # step required for PRs from forks. This prevents their potential exposure. 7 | pull_request: 8 | 9 | # Pushing to the master branch triggers code checks 10 | push: 11 | branches: 12 | - master 13 | tags-ignore: 14 | - "**" # Ignore all tags to prevent duplicate checks when tags are pushed. 
15 | 16 | # It should also be possible to trigger checks manually 17 | workflow_dispatch: 18 | 19 | jobs: 20 | lint_check: 21 | name: Lint check 22 | uses: apify/workflows/.github/workflows/python_lint_check.yaml@main 23 | 24 | type_check: 25 | name: Type check 26 | uses: apify/workflows/.github/workflows/python_type_check.yaml@main 27 | 28 | unit_tests: 29 | name: Unit tests 30 | uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main 31 | secrets: 32 | httpbin_url: ${{ secrets.APIFY_HTTPBIN_TOKEN && format('https://httpbin.apify.actor?token={0}', secrets.APIFY_HTTPBIN_TOKEN) || 'https://httpbin.org'}} 33 | 34 | docs_check: 35 | name: Docs check 36 | uses: apify/workflows/.github/workflows/python_docs_check.yaml@main 37 | -------------------------------------------------------------------------------- /.github/workflows/update_new_issue.yaml: -------------------------------------------------------------------------------- 1 | name: Update new issue 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | 8 | jobs: 9 | label_issues: 10 | name: Label issues 11 | runs-on: ubuntu-latest 12 | permissions: 13 | issues: write 14 | 15 | steps: 16 | # Add the "t-tooling" label to all new issues 17 | - uses: actions/github-script@v7 18 | with: 19 | script: | 20 | github.rest.issues.addLabels({ 21 | issue_number: context.issue.number, 22 | owner: context.repo.owner, 23 | repo: context.repo.repo, 24 | labels: ["t-tooling"] 25 | }) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache 2 | __pycache__ 3 | .mypy_cache 4 | .pytest_cache 5 | .ruff_cache 6 | 7 | # Virtual envs 8 | .venv 9 | .direnv 10 | .envrc 11 | .python-version 12 | 13 | # Other Python tools 14 | .ropeproject 15 | 16 | # Mise 17 | mise.toml 18 | .mise.toml 19 | 20 | # Egg and build artifacts 21 | *.egg-info/ 22 | *.egg 23 | dist/ 24 | build/ 25 | 26 | # Coverage reports 27 | .coverage* 28 | htmlcov 29 | 30 | # IDE, editors 31 | .vscode 32 | .idea 33 | .DS_Store 34 | .nvim.lua 35 | Session.vim 36 | 37 | # Docs 38 | docs/changelog.md 39 | 40 | # Website build artifacts, node dependencies 41 | website/build 42 | website/node_modules 43 | website/.yarn 44 | website/.docusaurus 45 | website/api-typedoc-generated.json 46 | website/apify-shared-docspec-dump.jsonl 47 | website/docspec-dump.jsonl 48 | website/module_shortcuts.json 49 | website/typedoc-types* 50 | # npm lockfile (we use yarn) 51 | website/package-lock.json 52 | 53 | # Default directory for memory storage 54 | storage/ 55 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | default: true 2 | line-length: 3 | line_length: 120 4 | MD007: 5 | indent: 4 6 | MD004: 7 | style: dash 8 | no-inline-html: false 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: lint-check 5 | name: Lint check 6 | entry: make lint 7 | language: system 8 | pass_filenames: false 9 | 10 | - id: type-check 11 | name: Type check 12 | entry: make type-check 13 | language: system 14 | pass_filenames: false 15 | -------------------------------------------------------------------------------- 
/docs/deployment/code_examples/apify/crawler_as_actor_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 6 | 7 | 8 | async def main() -> None: 9 | # Wrap the crawler code in an Actor context manager. 10 | async with Actor: 11 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | data = { 17 | 'url': context.request.url, 18 | 'title': context.soup.title.string if context.soup.title else None, 19 | } 20 | await context.push_data(data) 21 | await context.enqueue_links() 22 | 23 | await crawler.run(['https://crawlee.dev']) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/get_public_url.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | store = await Actor.open_key_value_store() 9 | await store.set_value('your-file', {'foo': 'bar'}) 10 | url = store.get_public_url('your-file') 11 | Actor.log.info(f'KVS public URL: {url}') 12 | # https://api.apify.com/v2/key-value-stores//records/your-file 13 | 14 | 15 | if __name__ == '__main__': 16 | asyncio.run(main()) 17 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/log_with_config_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor, Configuration 4 | 5 | 6 | async def main() -> None: 7 | # Create a new configuration with your API key. You can find it at 8 | # https://console.apify.com/settings/integrations. It can be provided either 9 | # as a parameter "token" or as an environment variable "APIFY_TOKEN". 10 | config = Configuration( 11 | token='apify_api_YOUR_TOKEN', 12 | ) 13 | 14 | async with Actor(config): 15 | Actor.log.info('Hello from Apify platform!') 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/proxy_advanced_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | proxy_configuration = await Actor.create_proxy_configuration( 9 | password='apify_proxy_YOUR_PASSWORD', 10 | # Specify the proxy group to use. 11 | groups=['RESIDENTIAL'], 12 | # Set the country code for the proxy. 13 | country_code='US', 14 | ) 15 | 16 | # ... 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/proxy_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | # Create a new Apify Proxy configuration. 
The password can be found at 9 | # https://console.apify.com/proxy/http-settings and should be provided either 10 | # as a parameter "password" or as an environment variable "APIFY_PROXY_PASSWORD". 11 | proxy_configuration = await Actor.create_proxy_configuration( 12 | password='apify_proxy_YOUR_PASSWORD', 13 | ) 14 | 15 | if not proxy_configuration: 16 | Actor.log.warning('Failed to create proxy configuration.') 17 | return 18 | 19 | proxy_url = await proxy_configuration.new_url() 20 | Actor.log.info(f'Proxy URL: {proxy_url}') 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/examples/beautifulsoup_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: beautifulsoup-crawler 3 | title: BeautifulSoup crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py'; 10 | 11 | This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `
<h1>`, `<h2>` and `<h3>
` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/capture_screenshot_using_playwright.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: capture-screenshots-using-playwright 3 | title: Capture screenshots using Playwright 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py'; 10 | 11 | This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store. 12 | 13 | The `PlaywrightCrawler` is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method. 14 | 15 | The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page. 16 | 17 | 18 | {CaptureScreenshotExample} 19 | 20 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Extract data from the page. 15 | data = { 16 | 'url': context.request.url, 17 | 'title': context.soup.title.string if context.soup.title else None, 18 | 'html': str(context.soup)[:1000], 19 | } 20 | 21 | # Push the extracted data to the default dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run( 26 | [ 27 | 'https://crawlee.dev', 28 | 'https://apify.com', 29 | 'https://example.com', 30 | ] 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_dataset.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import Dataset 4 | 5 | 6 | async def main() -> None: 7 | # Open dataset manually using asynchronous constructor open(). 8 | dataset = await Dataset.open() 9 | 10 | # Interact with dataset directly. 
11 | await dataset.push_data({'key': 'value'}) 12 | 13 | 14 | if __name__ == '__main__': 15 | asyncio.run(main()) 16 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Extract data from the page. 15 | data = { 16 | 'url': context.request.url, 17 | 'title': await context.page.title(), 18 | 'html': str(await context.page.content())[:1000], 19 | } 20 | 21 | # Push the extracted data to the default dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run( 26 | [ 27 | 'https://crawlee.dev', 28 | 'https://apify.com', 29 | 'https://example.com', 30 | ] 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_all_links_on_website_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. 18 | await context.enqueue_links() 19 | 20 | # Run the crawler with the initial list of requests. 21 | await crawler.run(['https://crawlee.dev']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_all_links_on_website_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. 18 | await context.enqueue_links() 19 | 20 | # Run the crawler with the initial list of requests. 
21 | await crawler.run(['https://crawlee.dev']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_multiple_urls_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Run the crawler with the initial list of requests. 15 | await crawler.run( 16 | [ 17 | 'https://crawlee.dev', 18 | 'https://apify.com', 19 | 'https://example.com', 20 | ] 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_multiple_urls_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Run the crawler with the initial list of requests. 15 | await crawler.run( 16 | [ 17 | 'https://crawlee.dev', 18 | 'https://apify.com', 19 | 'https://example.com', 20 | ] 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_specific_links_on_website_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler( 9 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 10 | max_requests_per_crawl=10, 11 | ) 12 | 13 | # Define the default request handler, which will be called for every request. 14 | @crawler.router.default_handler 15 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # Enqueue all the documentation links found on the page, except for the examples. 19 | await context.enqueue_links( 20 | include=[Glob('https://crawlee.dev/docs/**')], 21 | exclude=[Glob('https://crawlee.dev/docs/examples')], 22 | ) 23 | 24 | # Run the crawler with the initial list of requests. 
25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_specific_links_on_website_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler( 9 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 10 | max_requests_per_crawl=10, 11 | ) 12 | 13 | # Define the default request handler, which will be called for every request. 14 | @crawler.router.default_handler 15 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # Enqueue all the documentation links found on the page, except for the examples. 19 | await context.enqueue_links( 20 | include=[Glob('https://crawlee.dev/docs/**')], 21 | exclude=[Glob('https://crawlee.dev/docs/examples')], 22 | ) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_all_links.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. Any URLs found will be matched by 18 | # this strategy, even if they go off the site you are currently crawling. 19 | await context.enqueue_links(strategy='all') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same domain will enqueue all links found that 18 | # are on the same hostname as request.loaded_url or request.url. 
19 | await context.enqueue_links(strategy='same-domain') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same hostname will enqueue all links found that are on 18 | # the same hostname (including subdomains) as request.loaded_url or request.url. 19 | await context.enqueue_links(strategy='same-hostname') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same origin will enqueue all links found that are on 18 | # the same origin as request.loaded_url or request.url. 19 | await context.enqueue_links(strategy='same-origin') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/export_entire_dataset_to_file_csv.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Extract data from the page. 
18 | data = { 19 | 'url': context.request.url, 20 | 'title': context.soup.title.string if context.soup.title else None, 21 | } 22 | 23 | # Enqueue all links found on the page. 24 | await context.enqueue_links() 25 | 26 | # Push the extracted data to the default dataset. 27 | await context.push_data(data) 28 | 29 | # Run the crawler with the initial list of URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the entire dataset to a CSV file. 33 | await crawler.export_data_csv(path='results.csv') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/examples/code_examples/export_entire_dataset_to_file_json.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Extract data from the page. 18 | data = { 19 | 'url': context.request.url, 20 | 'title': context.soup.title.string if context.soup.title else None, 21 | } 22 | 23 | # Enqueue all links found on the page. 24 | await context.enqueue_links() 25 | 26 | # Push the extracted data to the default dataset. 27 | await context.push_data(data) 28 | 29 | # Run the crawler with the initial list of URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the entire dataset to a JSON file. 33 | await crawler.export_data_json(path='results.json') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/examples/code_examples/fill_and_submit_web_form_request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from urllib.parse import urlencode 3 | 4 | from crawlee import Request 5 | 6 | 7 | async def main() -> None: 8 | # Prepare a POST request to the form endpoint. 
9 | request = Request.from_url( 10 | url='https://httpbin.org/post', 11 | method='POST', 12 | headers={'content-type': 'application/x-www-form-urlencoded'}, 13 | payload=urlencode( 14 | { 15 | 'custname': 'John Doe', 16 | 'custtel': '1234567890', 17 | 'custemail': 'johndoe@example.com', 18 | 'size': 'large', 19 | 'topping': ['bacon', 'cheese', 'mushroom'], 20 | 'delivery': '13:00', 21 | 'comments': 'Please ring the doorbell upon arrival.', 22 | } 23 | ).encode(), 24 | ) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from random import choice 3 | 4 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 5 | from crawlee.statistics import Statistics 6 | 7 | 8 | async def main() -> None: 9 | crawler = ParselCrawler( 10 | statistics=Statistics.with_default_state(save_error_snapshots=True) 11 | ) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: ParselCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Simulate various errors to demonstrate `ErrorSnapshotter` 17 | # saving only the first occurrence of unique error. 18 | await context.enqueue_links() 19 | random_number = choice(range(10)) 20 | if random_number == 1: 21 | raise KeyError('Some KeyError') 22 | if random_number == 2: 23 | raise ValueError('Some ValueError') 24 | if random_number == 3: 25 | raise RuntimeError('Some RuntimeError') 26 | 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/examples/code_examples/playwright_block_requests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | PlaywrightCrawler, 5 | PlaywrightCrawlingContext, 6 | PlaywrightPreNavCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | crawler = PlaywrightCrawler( 12 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 13 | max_requests_per_crawl=10, 14 | ) 15 | 16 | # Define the default request handler, which will be called for every request. 17 | @crawler.router.default_handler 18 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 19 | context.log.info(f'Processing {context.request.url} ...') 20 | 21 | await context.enqueue_links() 22 | 23 | # Define the hook, which will be called before every request. 24 | @crawler.pre_navigation_hook 25 | async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None: 26 | context.log.info(f'Navigating to {context.request.url} ...') 27 | 28 | # Block all requests to URLs that include `adsbygoogle.js` and also all defaults. 29 | await context.block_requests(extra_url_patterns=['adsbygoogle.js']) 30 | 31 | # Run the crawler with the initial list of URLs. 
32 | await crawler.run(['https://crawlee.dev/']) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from random import choice 3 | 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | from crawlee.statistics import Statistics 6 | 7 | 8 | async def main() -> None: 9 | crawler = PlaywrightCrawler( 10 | statistics=Statistics.with_default_state(save_error_snapshots=True) 11 | ) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Simulate various errors to demonstrate `ErrorSnapshotter` 17 | # saving only the first occurrence of unique error. 18 | await context.enqueue_links() 19 | random_number = choice(range(10)) 20 | if random_number == 1: 21 | raise KeyError('Some KeyError') 22 | if random_number == 2: 23 | raise ValueError('Some ValueError') 24 | if random_number == 3: 25 | raise RuntimeError('Some RuntimeError') 26 | 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/examples/code_examples/respect_robots_on_skipped_request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import SkippedReason 4 | from crawlee.crawlers import ( 5 | BeautifulSoupCrawler, 6 | BeautifulSoupCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | # Initialize the crawler with robots.txt compliance enabled 12 | crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) 13 | 14 | @crawler.router.default_handler 15 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # highlight-start 19 | # This handler is called when a request is skipped 20 | @crawler.on_skipped_request 21 | async def skipped_request_handler(url: str, reason: SkippedReason) -> None: 22 | # Check if the request was skipped due to robots.txt rules 23 | if reason == 'robots_txt': 24 | crawler.log.info(f'Skipped {url} due to robots.txt rules.') 25 | 26 | # highlight-end 27 | 28 | # Start the crawler with the specified URLs 29 | # The login URL will be skipped and handled by the skipped_request_handler 30 | await crawler.run( 31 | ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] 32 | ) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/examples/code_examples/respect_robots_txt_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | BeautifulSoupCrawler, 5 | BeautifulSoupCrawlingContext, 6 | ) 7 | 8 | 9 | async def main() -> None: 10 | # Initialize the crawler with robots.txt compliance enabled 11 | crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Start 
the crawler with the specified URLs 18 | # The crawler will check the robots.txt file before making requests 19 | # In this example, 'https://news.ycombinator.com/login' will be skipped 20 | # because it's disallowed in the site's robots.txt file 21 | await crawler.run( 22 | ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] 23 | ) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/examples/crawl_multiple_urls.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawl-multiple-urls 3 | title: Crawl multiple URLs 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import Tabs from '@theme/Tabs'; 8 | import TabItem from '@theme/TabItem'; 9 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 10 | 11 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py'; 12 | import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py'; 13 | 14 | This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently. 15 | 16 | 17 | 18 | 19 | {BeautifulSoupExample} 20 | 21 | 22 | 23 | 24 | {PlaywrightExample} 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/examples/crawler_keep_alive.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawler-keep-alive 3 | title: Keep a Crawler alive waiting for more requests 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py'; 10 | 11 | This example demonstrates how to keep crawler alive even when there are no requests at the moment by using `keep_alive=True` argument of `BasicCrawler.__init__`. This is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. To stop the crawler that was started with `keep_alive=True` you can call `crawler.stop()`. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/crawler_stop.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawler-stop 3 | title: Stopping a Crawler with stop method 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py'; 10 | 11 | This example demonstrates how to use `stop` method of `BasicCrawler` to stop crawler once the crawler finds what it is looking for. This method is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being concurrently processed are going to get finished. 
As shown in the sketch above, the `stop` method also accepts an optional `reason` argument, a string that is included in the logs and can improve their readability, especially if you have multiple different conditions for triggering `stop`. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/parsel_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: parsel-crawler 3 | title: Parsel crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py'; 10 | 11 | This example shows how to use `ParselCrawler` to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using the [Parsel](https://pypi.org/project/parsel/) library, which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract the page title, URL, and email addresses found on each page. The default handler will scrape data from the current page and enqueue all links found on it for continuous scraping. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before the request is sent. 12 | 13 | 14 | {ParselCrawlerExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/playwright_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: playwright-crawler 3 | title: Playwright crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py'; 10 | 11 | This example demonstrates how to use `PlaywrightCrawler` to recursively scrape the Hacker News website using headless Chromium and Playwright. 12 | 13 | The `PlaywrightCrawler` manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content. 14 | 15 | A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation.
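For example, a hook registered with the `pre_navigation_hook` decorator can adjust the page before navigation. The snippet below is a minimal sketch based on the pre-navigation hook example from the guides; the hook name and the specific timeout and viewport values are arbitrary.

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None:
    # Set a timeout for all navigation methods on the page.
    context.page.set_default_navigation_timeout(60_000)

    # Set the viewport size before navigating to the target URL.
    await context.page.set_viewport_size({'width': 1280, 'height': 1024})
```

The runnable example below shows how such a hook fits into a complete crawler.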
16 | 17 | 18 | {PlaywrightCrawlerExample} 19 | 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.fingerprint_suite import ( 4 | DefaultFingerprintGenerator, 5 | HeaderGeneratorOptions, 6 | ScreenOptions, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | fingerprint_generator = DefaultFingerprintGenerator( 12 | header_options=HeaderGeneratorOptions(browsers=['chromium']), 13 | screen_options=ScreenOptions(min_width=400), 14 | ) 15 | 16 | # ... 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Fingerprint generator is used by default. 8 | crawler = PlaywrightCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 11 | @crawler.router.default_handler 12 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Find a link to the next page and enqueue it if it exists. 16 | await context.enqueue_links(selector='.morelink') 17 | 18 | # Run the crawler with the initial list of URLs. 19 | await crawler.run(['https://news.ycombinator.com/']) 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.run(main()) 24 | -------------------------------------------------------------------------------- /docs/guides/code_examples/error_handling/disable_retry.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext 4 | from crawlee.errors import HttpStatusCodeError, SessionError 5 | 6 | 7 | async def main() -> None: 8 | crawler = HttpCrawler(max_request_retries=5) 9 | 10 | # Create a parsing error for demonstration 11 | @crawler.router.default_handler 12 | async def default_handler(context: HttpCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | raise ValueError('Simulated parsing error') 15 | 16 | # This handler runs before any retry attempts 17 | @crawler.error_handler 18 | async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None: 19 | context.log.error(f'Failed request {context.request.url}') 20 | # Only allow retries for network-related errors 21 | if not isinstance(error, (SessionError, HttpStatusCodeError)): 22 | context.log.error('Non-network error detected') 23 | # Stop further retry attempts for this `Request` 24 | context.request.no_retry = True 25 | 26 | await crawler.run(['https://crawlee.dev/']) 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin 4 | from 
crawlee.crawlers import PlaywrightCrawler 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler( 9 | browser_pool=BrowserPool( 10 | plugins=[ 11 | PlaywrightBrowserPlugin( 12 | browser_type='chromium', 13 | browser_launch_options={ 14 | 'headless': False, 15 | 'channel': 'msedge', 16 | 'slow_mo': 200, 17 | }, 18 | browser_new_context_options={ 19 | 'color_scheme': 'dark', 20 | 'extra_http_headers': { 21 | 'Custom-Header': 'my-header', 22 | 'Accept-Language': 'en', 23 | }, 24 | 'user_agent': 'My-User-Agent', 25 | }, 26 | ) 27 | ] 28 | ) 29 | ) 30 | 31 | # ... 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | PlaywrightCrawler, 5 | PlaywrightCrawlingContext, 6 | PlaywrightPreNavCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | crawler = PlaywrightCrawler(max_requests_per_crawl=10) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | await context.enqueue_links() 18 | 19 | @crawler.pre_navigation_hook 20 | async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: 21 | context.log.info(f'Navigating to {context.request.url} ...') 22 | 23 | # will set a timeout for all navigation methods 24 | context.page.set_default_navigation_timeout(600_000) 25 | 26 | # will set the page size before you go to the target URL 27 | await context.page.set_viewport_size({'width': 1280, 'height': 1024}) 28 | 29 | # Run the crawler with the initial list of URLs. 
30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | 33 | if __name__ == '__main__': 34 | asyncio.run(main()) 35 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/handler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | 4 | from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: 12 | # Locate element h2 within 5 seconds 13 | h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) 14 | # Do stuff with element found by the selector 15 | context.log.info(h2) 16 | 17 | await crawler.run(['https://crawlee.dev/']) 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import AdaptivePlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( 8 | # Arguments relevant only for PlaywrightCrawler 9 | playwright_crawler_specific_kwargs={ 10 | 'headless': False, 11 | 'browser_type': 'chromium', 12 | }, 13 | # Common arguments relevant to all crawlers 14 | max_crawl_depth=5, 15 | ) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import AdaptivePlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( 8 | # Arguments relevant only for PlaywrightCrawler 9 | playwright_crawler_specific_kwargs={ 10 | 'headless': False, 11 | 'browser_type': 'chromium', 12 | }, 13 | # Common arguments relevant to all crawlers 14 | max_crawl_depth=5, 15 | ) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/inspecting_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Log the proxy used for the current request. 
21 | context.log.info(f'Proxy for the current request: {context.proxy_info}') 22 | 23 | # Run the crawler with the initial list of requests. 24 | await crawler.run(['https://crawlee.dev/']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/inspecting_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 20 | # Log the proxy used for the current request. 21 | context.log.info(f'Proxy for the current request: {context.proxy_info}') 22 | 23 | # Run the crawler with the initial list of requests. 24 | await crawler.run(['https://crawlee.dev/']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/integration_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': context.soup.title.string if context.soup.title else None, 24 | } 25 | context.log.info(f'Extracted data: {data}') 26 | 27 | # Run the crawler with the initial list of requests. 28 | await crawler.run(['https://crawlee.dev/']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/integration_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 
9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': await context.page.title(), 24 | } 25 | context.log.info(f'Extracted data: {data}') 26 | 27 | # Run the crawler with the initial list of requests. 28 | await crawler.run(['https://crawlee.dev/']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/quick_start_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.proxy_configuration import ProxyConfiguration 4 | 5 | 6 | async def main() -> None: 7 | proxy_configuration = ProxyConfiguration( 8 | proxy_urls=[ 9 | 'http://proxy-1.com/', 10 | 'http://proxy-2.com/', 11 | ] 12 | ) 13 | 14 | # The proxy URLs are rotated in a round-robin. 15 | proxy_url_1 = await proxy_configuration.new_url() # http://proxy-1.com/ 16 | proxy_url_2 = await proxy_configuration.new_url() # http://proxy-2.com/ 17 | proxy_url_3 = await proxy_configuration.new_url() # http://proxy-1.com/ 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/session_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler( 16 | proxy_configuration=proxy_configuration, 17 | use_session_pool=True, 18 | ) 19 | 20 | # ... 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/session_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler( 16 | proxy_configuration=proxy_configuration, 17 | use_session_pool=True, 18 | ) 19 | 20 | # ... 
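    # A minimal illustrative sketch of a handler that could follow here (the
    # handler name and start URL are placeholders): with `use_session_pool=True`,
    # the proxy selected for a session is reused for that session's requests,
    # so a handler can log which proxy a given session is using, e.g.:
    #
    #     @crawler.router.default_handler
    #     async def handler(context: PlaywrightCrawlingContext) -> None:
    #         session_id = context.session.id if context.session else None
    #         context.log.info(f'Session {session_id} uses {context.proxy_info}')
    #
    #     await crawler.run(['https://crawlee.dev/'])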
21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/rl_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.request_loaders import RequestList 4 | 5 | 6 | async def main() -> None: 7 | # Create a static request list with a name 8 | # and a set of initial requests. 9 | request_list = RequestList( 10 | name='my-request-list', 11 | requests=[ 12 | 'https://apify.com/', 13 | 'https://crawlee.dev/', 14 | 'https://crawlee.dev/python/', 15 | ], 16 | ) 17 | 18 | # Fetch and process requests from the list. 19 | while request := await request_list.fetch_next_request(): 20 | # Do something with it... 21 | 22 | # And mark it as handled. 23 | await request_list.mark_request_as_handled(request) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/tandem_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | from crawlee.request_loaders import RequestList 5 | 6 | 7 | async def main() -> None: 8 | # Create a static request list. 9 | request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) 10 | 11 | # Convert the request list to a request manager using the to_tandem method. 12 | # It is a tandem with the default request queue. 13 | request_manager = await request_list.to_tandem() 14 | 15 | # Create a crawler and pass the request manager to it. 16 | crawler = ParselCrawler(request_manager=request_manager) 17 | 18 | @crawler.router.default_handler 19 | async def handler(context: ParselCrawlingContext) -> None: 20 | # New links will be enqueued directly to the queue. 21 | await context.enqueue_links() 22 | 23 | await crawler.run() 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/tandem_example_explicit.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | from crawlee.request_loaders import RequestList, RequestManagerTandem 5 | from crawlee.storages import RequestQueue 6 | 7 | 8 | async def main() -> None: 9 | # Create a static request list. 10 | request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) 11 | 12 | # Open the default request queue. 13 | request_queue = await RequestQueue.open() 14 | 15 | # And combine them together into a single request manager. 16 | request_manager = RequestManagerTandem(request_list, request_queue) 17 | 18 | # Create a crawler and pass the request manager to it. 19 | crawler = ParselCrawler(request_manager=request_manager) 20 | 21 | @crawler.router.default_handler 22 | async def handler(context: ParselCrawlingContext) -> None: 23 | # New links will be enqueued directly to the queue.
24 | await context.enqueue_links() 25 | 26 | await crawler.run() 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/guides/code_examples/running_in_web_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/docs/guides/code_examples/running_in_web_server/__init__.py -------------------------------------------------------------------------------- /docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import ConcurrencySettings 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | 6 | 7 | async def main() -> None: 8 | concurrency_settings = ConcurrencySettings( 9 | # Set the maximum number of concurrent requests the crawler can run to 100. 10 | max_concurrency=100, 11 | # Limit the total number of requests to 10 per minute to avoid overwhelming 12 | # the target website. 13 | max_tasks_per_minute=10, 14 | ) 15 | 16 | crawler = BeautifulSoupCrawler( 17 | # Apply the defined concurrency settings to the crawler. 18 | concurrency_settings=concurrency_settings, 19 | ) 20 | 21 | # ... 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import ConcurrencySettings 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | 6 | 7 | async def main() -> None: 8 | concurrency_settings = ConcurrencySettings( 9 | # Start with 8 concurrent tasks, as long as resources are available. 10 | desired_concurrency=8, 11 | # Maintain a minimum of 5 concurrent tasks to ensure steady crawling. 12 | min_concurrency=5, 13 | # Limit the maximum number of concurrent tasks to 10 to prevent 14 | # overloading the system. 15 | max_concurrency=10, 16 | ) 17 | 18 | crawler = BeautifulSoupCrawler( 19 | # Use the configured concurrency settings for the crawler. 20 | concurrency_settings=concurrency_settings, 21 | ) 22 | 23 | # ... 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/session_management/sm_standalone.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.sessions import SessionPool 4 | 5 | 6 | async def main() -> None: 7 | # Override the default Session pool configuration. 8 | async with SessionPool( 9 | max_pool_size=100, 10 | create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]}, 11 | ) as session_pool: 12 | session = await session_pool.get_session() 13 | 14 | # Increase the error_score. 15 | session.mark_bad() 16 | 17 | # Throw away the session. 18 | session.retire() 19 | 20 | # Lower the error_score and mark the session good. 
21 | session.mark_good() 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/cleaning_do_not_purge_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.configuration import Configuration 4 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | # Set the purge_on_start field to False to avoid purging the storage on start. 9 | # highlight-next-line 10 | configuration = Configuration(purge_on_start=False) 11 | 12 | # Pass the configuration to the crawler. 13 | crawler = HttpCrawler(configuration=configuration) 14 | 15 | @crawler.router.default_handler 16 | async def request_handler(context: HttpCrawlingContext) -> None: 17 | context.log.info(f'Processing {context.request.url} ...') 18 | 19 | await crawler.run(['https://crawlee.dev/']) 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.run(main()) 24 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler 4 | from crawlee.storage_clients import MemoryStorageClient 5 | 6 | 7 | async def main() -> None: 8 | storage_client = MemoryStorageClient.from_config() 9 | 10 | # Call the purge_on_start method to explicitly purge the storage. 11 | # highlight-next-line 12 | await storage_client.purge_on_start() 13 | 14 | # Pass the storage client to the crawler. 15 | crawler = HttpCrawler(storage_client=storage_client) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import Dataset 4 | 5 | 6 | async def main() -> None: 7 | # Open the dataset, if it does not exist, it will be created. 8 | # Leave name empty to use the default dataset. 9 | dataset = await Dataset.open() 10 | 11 | # Push a single row of data. 12 | await dataset.push_data({'foo': 'bar'}) 13 | 14 | # Push multiple rows of data (anything JSON-serializable can be pushed). 15 | await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]) 16 | 17 | # Fetch all data from the dataset. 18 | data = await dataset.get_data() 19 | # Do something with it... 20 | 21 | # Remove the dataset. 22 | await dataset.drop() 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new crawler (it can be any subclass of BasicCrawler). 8 | crawler = BeautifulSoupCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 
11 | @crawler.router.default_handler 12 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Extract data from the page. 16 | data = { 17 | 'url': context.request.url, 18 | 'title': context.soup.title.string if context.soup.title else None, 19 | } 20 | 21 | # Push the extracted data to the (default) dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial URLs. 25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | # Export the dataset to a file. 28 | await crawler.export_data(path='dataset.csv') 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.storages import Dataset 5 | 6 | 7 | async def main() -> None: 8 | # Open the dataset, if it does not exist, it will be created. 9 | # Leave name empty to use the default dataset. 10 | dataset = await Dataset.open() 11 | 12 | # Create a new crawler (it can be any subclass of BasicCrawler). 13 | crawler = BeautifulSoupCrawler() 14 | 15 | # Define the default request handler, which will be called for every request. 16 | @crawler.router.default_handler 17 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 18 | context.log.info(f'Processing {context.request.url} ...') 19 | 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': context.soup.title.string if context.soup.title else None, 24 | } 25 | 26 | # Push the extracted data to the dataset. 27 | await dataset.push_data(data) 28 | 29 | # Run the crawler with the initial URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the dataset to the key-value store. 
33 | await dataset.export_to(key='dataset', content_type='csv') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/helper_add_requests_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url} ...') 12 | # highlight-next-line 13 | await context.add_requests(['https://apify.com/']) 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/helper_enqueue_links_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url} ...') 12 | # highlight-next-line 13 | await context.enqueue_links() 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import KeyValueStore 4 | 5 | 6 | async def main() -> None: 7 | # Open the key-value store, if it does not exist, it will be created. 8 | # Leave name empty to use the default KVS. 9 | kvs = await KeyValueStore.open() 10 | 11 | # Set a value associated with 'some-key'. 12 | await kvs.set_value(key='some-key', value={'foo': 'bar'}) 13 | 14 | # Get the value associated with 'some-key'. 15 | value = await kvs.get_value('some-key') 16 | # Do something with it... 17 | 18 | # Delete the value associated with 'some-key' by setting it to None. 19 | await kvs.set_value(key='some-key', value=None) 20 | 21 | # Remove the key-value store. 22 | await kvs.drop() 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new Playwright crawler. 8 | crawler = PlaywrightCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 11 | @crawler.router.default_handler 12 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Capture the screenshot of the page using Playwright's API.
16 | screenshot = await context.page.screenshot() 17 | name = context.request.url.split('/')[-1] 18 | 19 | # Get the key-value store from the context. # If it does not exist, 20 | # it will be created. Leave name empty to use the default KVS. 21 | kvs = await context.get_key_value_store() 22 | 23 | # Store the screenshot in the key-value store. 24 | await kvs.set_value( 25 | key=f'screenshot-{name}', 26 | value=screenshot, 27 | content_type='image/png', 28 | ) 29 | 30 | # Run the crawler with the initial URLs. 31 | await crawler.run(['https://crawlee.dev']) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.storages import KeyValueStore 5 | 6 | 7 | async def main() -> None: 8 | # Open the key-value store, if it does not exist, it will be created. 9 | # Leave name empty to use the default KVS. 10 | kvs = await KeyValueStore.open() 11 | 12 | # Create a new Playwright crawler. 13 | crawler = PlaywrightCrawler() 14 | 15 | # Define the default request handler, which will be called for every request. 16 | @crawler.router.default_handler 17 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 18 | context.log.info(f'Processing {context.request.url} ...') 19 | 20 | # Capture the screenshot of the page using Playwright's API. 21 | screenshot = await context.page.screenshot() 22 | name = context.request.url.split('/')[-1] 23 | 24 | # Store the screenshot in the key-value store. 25 | await kvs.set_value( 26 | key=f'screenshot-{name}', 27 | value=screenshot, 28 | content_type='image/png', 29 | ) 30 | 31 | # Run the crawler with the initial URLs. 32 | await crawler.run(['https://crawlee.dev']) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import RequestQueue 4 | 5 | 6 | async def main() -> None: 7 | # Open the request queue, if it does not exist, it will be created. 8 | # Leave name empty to use the default request queue. 9 | request_queue = await RequestQueue.open(name='my-request-queue') 10 | 11 | # Add a single request. 12 | await request_queue.add_request('https://apify.com/') 13 | 14 | # Add multiple requests as a batch. 15 | await request_queue.add_requests_batched( 16 | ['https://crawlee.dev/', 'https://crawlee.dev/python/'] 17 | ) 18 | 19 | # Fetch and process requests from the queue. 20 | while request := await request_queue.fetch_next_request(): 21 | # Do something with it... 22 | 23 | # And mark it as handled. 24 | await request_queue.mark_request_as_handled(request) 25 | 26 | # Remove the request queue. 
27 | await request_queue.drop() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is 8 | # the default request manager; it will be opened and fully managed if not specified. 9 | crawler = HttpCrawler() 10 | 11 | # Define the default request handler, which will be called for every request. 12 | @crawler.router.default_handler 13 | async def request_handler(context: HttpCrawlingContext) -> None: 14 | context.log.info(f'Processing {context.request.url} ...') 15 | 16 | # Use context's add_requests method helper to add new requests from the handler. 17 | await context.add_requests(['https://crawlee.dev/python/']) 18 | 19 | # Use crawler's add_requests method helper to add new requests. 20 | await crawler.add_requests(['https://apify.com/']) 21 | 22 | # Run the crawler. You can optionally pass the list of initial requests. 23 | await crawler.run(['https://crawlee.dev/']) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 4 | from crawlee.storages import RequestQueue 5 | 6 | 7 | async def main() -> None: 8 | # Open the request queue, if it does not exist, it will be created. 9 | # Leave name empty to use the default request queue. 10 | request_queue = await RequestQueue.open(name='my-request-queue') 11 | 12 | # Interact with the request queue directly, e.g. add a batch of requests. 13 | await request_queue.add_requests_batched( 14 | ['https://apify.com/', 'https://crawlee.dev/'] 15 | ) 16 | 17 | # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request 18 | # queue as the request manager to it. It will be managed by the crawler. 19 | crawler = HttpCrawler(request_manager=request_queue) 20 | 21 | # Define the default request handler, which will be called for every request. 22 | @crawler.router.default_handler 23 | async def request_handler(context: HttpCrawlingContext) -> None: 24 | context.log.info(f'Processing {context.request.url} ...') 25 | 26 | # And execute the crawler. 27 | await crawler.run() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # Add import of crawler and crawling context. 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | from crawlee.storages import RequestQueue 6 | 7 | 8 | async def main() -> None: 9 | # First you create the request queue instance. 10 | rq = await RequestQueue.open() 11 | 12 | # And then you add one or more requests to it.
13 | await rq.add_request('https://crawlee.dev') 14 | 15 | crawler = BeautifulSoupCrawler(request_manager=rq) 16 | 17 | # Define a request handler and attach it to the crawler using the decorator. 18 | @crawler.router.default_handler 19 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Extract text with BeautifulSoup. 21 | # See BeautifulSoup documentation for API docs. 22 | url = context.request.url 23 | title = context.soup.title.string if context.soup.title else '' 24 | context.log.info(f'The title of {url} is: {title}.') 25 | 26 | await crawler.run() 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_bs_better.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # You don't need to import RequestQueue anymore. 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | url = context.request.url 13 | title = context.soup.title.string if context.soup.title else '' 14 | context.log.info(f'The title of {url} is: {title}.') 15 | 16 | # Start the crawler with the provided URLs. 17 | await crawler.run(['https://crawlee.dev/']) 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_request_queue.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import RequestQueue 4 | 5 | 6 | async def main() -> None: 7 | # First you create the request queue instance. 8 | rq = await RequestQueue.open() 9 | 10 | # And then you add one or more requests to it. 11 | await rq.add_request('https://crawlee.dev') 12 | 13 | 14 | if __name__ == '__main__': 15 | asyncio.run(main()) 16 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_enqueue_strategy.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url}.') 12 | 13 | # See the `EnqueueStrategy` type alias for more strategy options. 14 | # highlight-next-line 15 | await context.enqueue_links( 16 | # highlight-next-line 17 | strategy='same-domain', 18 | # highlight-next-line 19 | ) 20 | 21 | await crawler.run(['https://crawlee.dev/']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_finding_new_links.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Let's limit our crawls to make our tests shorter and safer. 
8 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | url = context.request.url 13 | title = context.soup.title.string if context.soup.title else '' 14 | context.log.info(f'The title of {url} is: {title}.') 15 | 16 | # The enqueue_links function is available as one of the fields of the context. 17 | # It is also context aware, so it does not require any parameters. 18 | await context.enqueue_links() 19 | 20 | await crawler.run(['https://crawlee.dev/']) 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_globs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url}.') 13 | 14 | # Enqueue links that match the 'include' glob pattern and 15 | # do not match the 'exclude' glob pattern. 16 | # highlight-next-line 17 | await context.enqueue_links( 18 | # highlight-next-line 19 | include=[Glob('https://someplace.com/**/cats')], 20 | # highlight-next-line 21 | exclude=[Glob('https://**/archive/**')], 22 | # highlight-next-line 23 | ) 24 | 25 | await crawler.run(['https://crawlee.dev/']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_original_code.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | url = context.request.url 12 | title = context.soup.title.string if context.soup.title else '' 13 | context.log.info(f'The title of {url} is: {title}.') 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/04_sanity_check.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript. 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | # Wait for the collection cards to render on the page. This ensures that 13 | # the elements we want to interact with are present in the DOM. 
14 | await context.page.wait_for_selector('.collection-block-item') 15 | 16 | # Execute a function within the browser context to target the collection 17 | # card elements and extract their text content, trimming any leading or 18 | # trailing whitespace. 19 | category_texts = await context.page.eval_on_selector_all( 20 | '.collection-block-item', 21 | '(els) => els.map(el => el.textContent.trim())', 22 | ) 23 | 24 | # Log the extracted texts. 25 | for i, text in enumerate(category_texts): 26 | context.log.info(f'CATEGORY_{i + 1}: {text}') 27 | 28 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/05_crawling_listing.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url}') 12 | 13 | # Wait for the category cards to render on the page. This ensures that 14 | # the elements we want to interact with are present in the DOM. 15 | await context.page.wait_for_selector('.collection-block-item') 16 | 17 | # Enqueue links found within elements that match the specified selector. 18 | # These links will be added to the crawling queue with the label CATEGORY. 19 | await context.enqueue_links( 20 | selector='.collection-block-item', 21 | label='CATEGORY', 22 | ) 23 | 24 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/07_first_code.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.storages import Dataset 5 | 6 | # ... 7 | 8 | 9 | async def main() -> None: 10 | crawler = PlaywrightCrawler() 11 | dataset = await Dataset.open() 12 | 13 | # ... 14 | 15 | @crawler.router.default_handler 16 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 17 | ... 18 | # ... 19 | 20 | 21 | if __name__ == '__main__': 22 | asyncio.run(main()) 23 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/08_main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | 5 | from .routes import router 6 | 7 | 8 | async def main() -> None: 9 | crawler = PlaywrightCrawler( 10 | # Let's limit our crawls to make our tests shorter and safer. 11 | max_requests_per_crawl=10, 12 | # Provide our router instance to the crawler. 
13 | request_handler=router, 14 | ) 15 | 16 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/09_apify_sdk.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # highlight-next-line 4 | from apify import Actor 5 | 6 | from crawlee.crawlers import PlaywrightCrawler 7 | 8 | from .routes import router 9 | 10 | 11 | async def main() -> None: 12 | # highlight-next-line 13 | async with Actor: 14 | crawler = PlaywrightCrawler( 15 | # Let's limit our crawls to make our tests shorter and safer. 16 | max_requests_per_crawl=10, 17 | # Provide our router instance to the crawler. 18 | request_handler=router, 19 | ) 20 | 21 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/docs/introduction/code_examples/__init__.py -------------------------------------------------------------------------------- /docs/introduction/code_examples/routes.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | -------------------------------------------------------------------------------- /docs/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Line length different from the rest of the code to make sure that the example code visualised on the generated 2 | # documentation webpages is shown without a horizontal scrollbar, to make it more readable. 3 | 4 | [tool.ruff] 5 | # Inherit all from project top configuration file. 6 | extend = "../pyproject.toml" 7 | 8 | # Override just line length 9 | line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need a scrollbar. 10 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/beautifulsoup_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # BeautifulSoupCrawler crawls the web using HTTP requests 8 | # and parses HTML using the BeautifulSoup library. 9 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': context.soup.title.string if context.soup.title else None, 20 | } 21 | # Store the extracted data.
22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/parsel_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # ParselCrawler crawls the web using HTTP requests 8 | # and parses HTML using the Parsel library. 9 | crawler = ParselCrawler(max_requests_per_crawl=10) 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: ParselCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': context.selector.xpath('//title/text()').get(), 20 | } 21 | # Store the extracted data. 22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/playwright_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # PlaywrightCrawler crawls the web using a headless browser 8 | # controlled by the Playwright library. 9 | crawler = PlaywrightCrawler() 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': await context.page.title(), 20 | } 21 | # Store the extracted data. 22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/playwright_crawler_headful_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler( 8 | # Run with a visible browser window. 9 | # highlight-next-line 10 | headless=False, 11 | # Switch to the Firefox browser. 12 | browser_type='firefox', 13 | ) 14 | 15 | # ... 
16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":semanticCommitTypeAll(chore)"], 3 | "pinVersions": false, 4 | "separateMajorMinor": false, 5 | "dependencyDashboard": false, 6 | "semanticCommits": "enabled", 7 | "lockFileMaintenance": { 8 | "enabled": true, 9 | "automerge": true, 10 | "automergeType": "branch" 11 | }, 12 | "packageRules": [ 13 | { 14 | "matchPaths": ["pyproject.toml"], 15 | "matchDepTypes": ["devDependencies"], 16 | "matchUpdateTypes": ["major", "minor"], 17 | "groupName": "major/minor dev dependencies", 18 | "groupSlug": "dev-dependencies", 19 | "automerge": true, 20 | "automergeType": "branch" 21 | } 22 | ], 23 | "schedule": ["before 7am every weekday"], 24 | "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"] 25 | } 26 | -------------------------------------------------------------------------------- /src/crawlee/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import metadata 2 | 3 | from ._request import Request, RequestOptions 4 | from ._service_locator import service_locator 5 | from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason 6 | from ._utils.globs import Glob 7 | 8 | __version__ = metadata.version('crawlee') 9 | 10 | __all__ = [ 11 | 'ConcurrencySettings', 12 | 'EnqueueStrategy', 13 | 'Glob', 14 | 'HttpHeaders', 15 | 'Request', 16 | 'RequestOptions', 17 | 'RequestTransformAction', 18 | 'SkippedReason', 19 | 'service_locator', 20 | ] 21 | -------------------------------------------------------------------------------- /src/crawlee/_autoscaling/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoscaled_pool import AutoscaledPool 2 | from .snapshotter import Snapshotter 3 | from .system_status import SystemStatus 4 | 5 | __all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus'] 6 | -------------------------------------------------------------------------------- /src/crawlee/_autoscaling/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/_autoscaling/py.typed -------------------------------------------------------------------------------- /src/crawlee/_consts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | METADATA_FILENAME = '__metadata__.json' 4 | -------------------------------------------------------------------------------- /src/crawlee/_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/_utils/__init__.py -------------------------------------------------------------------------------- /src/crawlee/_utils/blocked.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts 4 | 5 | CLOUDFLARE_RETRY_CSS_SELECTORS = [ 6 | '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]', 7 | 
] 8 | 9 | RETRY_CSS_SELECTORS = [ 10 | *CLOUDFLARE_RETRY_CSS_SELECTORS, 11 | 'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]', 12 | 'iframe[src*="_Incapsula_Resource"]', 13 | ] 14 | """ 15 | CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked. 16 | """ 17 | 18 | ROTATE_PROXY_ERRORS = [ 19 | 'ECONNRESET', 20 | 'ECONNREFUSED', 21 | 'ERR_PROXY_CONNECTION_FAILED', 22 | 'ERR_TUNNEL_CONNECTION_FAILED', 23 | 'Proxy responded with', 24 | ] 25 | """ 26 | Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning. 27 | """ 28 | -------------------------------------------------------------------------------- /src/crawlee/_utils/crypto.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import secrets 4 | from hashlib import sha256 5 | 6 | 7 | def compute_short_hash(data: bytes, *, length: int = 8) -> str: 8 | """Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it. 9 | 10 | Args: 11 | data: The binary data to be hashed. 12 | length: The length of the hash to be returned. 13 | 14 | Returns: 15 | A substring (prefix) of the hexadecimal hash of the data. 16 | """ 17 | hash_object = sha256(data) 18 | return hash_object.hexdigest()[:length] 19 | 20 | 21 | def crypto_random_object_id(length: int = 17) -> str: 22 | """Generate a random object ID.""" 23 | chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789' 24 | return ''.join(secrets.choice(chars) for _ in range(length)) 25 | -------------------------------------------------------------------------------- /src/crawlee/_utils/docs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Callable, Literal 4 | 5 | GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Event payloads', 'Errors', 'Functions'] 6 | 7 | 8 | def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001 9 | """Mark a symbol for rendering and grouping in documentation. 10 | 11 | This decorator is used solely for documentation purposes and does not modify the behavior 12 | of the decorated callable. 13 | 14 | Args: 15 | group_name: The documentation group to which the symbol belongs. 16 | 17 | Returns: 18 | The original callable without modification. 19 | """ 20 | 21 | def wrapper(func: Callable) -> Callable: 22 | return func 23 | 24 | return wrapper 25 | -------------------------------------------------------------------------------- /src/crawlee/_utils/html_to_text.py: -------------------------------------------------------------------------------- 1 | # This file contains shared constants used by different implementations of html_to_text function. 
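# Besides the tag sets below, it also defines a few helper regular expressions used for whitespace handling.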
2 | from __future__ import annotations 3 | 4 | import re 5 | 6 | # Tags based on Javascript implementation of htmlToText from: 7 | # https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11 8 | # Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9 9 | SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'} 10 | BLOCK_TAGS = { 11 | 'p', 12 | 'h1', 13 | 'h2', 14 | 'h3', 15 | 'h4', 16 | 'h5', 17 | 'h6', 18 | 'ol', 19 | 'ul', 20 | 'li', 21 | 'pre', 22 | 'address', 23 | 'blockquote', 24 | 'dl', 25 | 'div', 26 | 'fieldset', 27 | 'form', 28 | 'table', 29 | 'tr', 30 | 'select', 31 | 'option', 32 | } 33 | 34 | _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$') 35 | _EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$') 36 | _ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+') 37 | -------------------------------------------------------------------------------- /src/crawlee/_utils/measure_time.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | from contextlib import contextmanager 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | if TYPE_CHECKING: 9 | from collections.abc import Iterator 10 | 11 | 12 | @dataclass 13 | class TimerResult: 14 | wall: float | None = None 15 | cpu: float | None = None 16 | 17 | 18 | @contextmanager 19 | def measure_time() -> Iterator[TimerResult]: 20 | """Measure the execution time (wall-clock and CPU) between the start and end of the with-block.""" 21 | result = TimerResult() 22 | before_wall = time.monotonic() 23 | before_cpu = time.thread_time() 24 | 25 | try: 26 | yield result 27 | finally: 28 | after_wall = time.monotonic() 29 | after_cpu = time.thread_time() 30 | result.wall = after_wall - before_wall 31 | result.cpu = after_cpu - before_cpu 32 | -------------------------------------------------------------------------------- /src/crawlee/_utils/urls.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydantic import AnyHttpUrl, TypeAdapter 4 | from yarl import URL 5 | 6 | 7 | def is_url_absolute(url: str) -> bool: 8 | """Check if a URL is absolute.""" 9 | url_parsed = URL(url) 10 | 11 | # We don't use .absolute because in yarl.URL, it is always True for links that start with '//' 12 | return bool(url_parsed.scheme) and bool(url_parsed.raw_authority) 13 | 14 | 15 | def convert_to_absolute_url(base_url: str, relative_url: str) -> str: 16 | """Convert a relative URL to an absolute URL using a base URL.""" 17 | return str(URL(base_url).join(URL(relative_url))) 18 | 19 | 20 | _http_url_adapter = TypeAdapter(AnyHttpUrl) 21 | 22 | 23 | def validate_http_url(value: str | None) -> str | None: 24 | """Validate the given HTTP URL. 25 | 26 | Raises: 27 | pydantic.ValidationError: If the URL is not valid. 
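    Example:
        >>> validate_http_url('https://crawlee.dev')
        'https://crawlee.dev'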
28 | """ 29 | if value is not None: 30 | _http_url_adapter.validate_python(value) 31 | 32 | return value 33 | -------------------------------------------------------------------------------- /src/crawlee/_utils/web.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def is_status_code_client_error(value: int) -> bool: 5 | """Return `True` for 4xx status codes, `False` otherwise.""" 6 | return 400 <= value <= 499 # noqa: PLR2004 7 | 8 | 9 | def is_status_code_server_error(value: int) -> bool: 10 | """Return `True` for 5xx status codes, `False` otherwise.""" 11 | return value >= 500 # noqa: PLR2004 12 | -------------------------------------------------------------------------------- /src/crawlee/browsers/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402, TID252 2 | 3 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 4 | from crawlee._utils.try_import import try_import as _try_import 5 | 6 | _install_import_hook(__name__) 7 | 8 | # Due to patch_browserforge 9 | from .._browserforge_workaround import patch_browserforge 10 | 11 | patch_browserforge() 12 | 13 | # The following imports are wrapped in try_import to handle optional dependencies, 14 | # ensuring the module can still function even if these dependencies are missing. 15 | with _try_import(__name__, 'BrowserPool'): 16 | from ._browser_pool import BrowserPool 17 | with _try_import(__name__, 'PlaywrightBrowserController'): 18 | from ._playwright_browser_controller import PlaywrightBrowserController 19 | with _try_import(__name__, 'PlaywrightBrowserPlugin'): 20 | from ._playwright_browser_plugin import PlaywrightBrowserPlugin 21 | 22 | __all__ = [ 23 | 'BrowserPool', 24 | 'PlaywrightBrowserController', 25 | 'PlaywrightBrowserPlugin', 26 | ] 27 | -------------------------------------------------------------------------------- /src/crawlee/browsers/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from typing import TYPE_CHECKING, Literal 5 | 6 | if TYPE_CHECKING: 7 | from playwright.async_api import Page 8 | 9 | BrowserType = Literal['chromium', 'firefox', 'webkit'] 10 | 11 | 12 | @dataclass 13 | class CrawleePage: 14 | """Represents a page object within a browser, with additional metadata for tracking and management.""" 15 | 16 | id: str 17 | browser_type: BrowserType 18 | page: Page 19 | -------------------------------------------------------------------------------- /src/crawlee/browsers/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/browsers/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_abstract_http/__init__.py: -------------------------------------------------------------------------------- 1 | from ._abstract_http_crawler import AbstractHttpCrawler 2 | from ._abstract_http_parser import AbstractHttpParser 3 | from ._http_crawling_context import ParsedHttpCrawlingContext 4 | 5 | __all__ = [ 6 | 'AbstractHttpCrawler', 7 | 'AbstractHttpParser', 8 | 'ParsedHttpCrawlingContext', 9 | ] 10 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_abstract_http/py.typed: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_abstract_http/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_adaptive_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | # These imports have only mandatory dependencies, so they are imported directly. 5 | from ._adaptive_playwright_crawling_context import ( 6 | AdaptivePlaywrightCrawlingContext, 7 | AdaptivePlaywrightPreNavCrawlingContext, 8 | ) 9 | 10 | _install_import_hook(__name__) 11 | 12 | # The following imports are wrapped in try_import to handle optional dependencies, 13 | # ensuring the module can still function even if these dependencies are missing. 14 | with _try_import(__name__, 'RenderingTypePredictor'): 15 | from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor 16 | with _try_import(__name__, 'AdaptivePlaywrightCrawler'): 17 | from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler 18 | 19 | __all__ = [ 20 | 'AdaptivePlaywrightCrawler', 21 | 'AdaptivePlaywrightCrawlingContext', 22 | 'AdaptivePlaywrightPreNavCrawlingContext', 23 | 'RenderingType', 24 | 'RenderingTypePrediction', 25 | 'RenderingTypePredictor', 26 | ] 27 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Annotated 4 | 5 | from pydantic import ConfigDict, Field 6 | 7 | from crawlee._utils.docs import docs_group 8 | from crawlee.statistics import StatisticsState 9 | 10 | 11 | @docs_group('Data structures') 12 | class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): 13 | """Statistics data about a crawler run with additional information related to adaptive crawling.""" 14 | 15 | model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants') 16 | 17 | http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 18 | """Number representing how many times static HTTP-based crawling was used.""" 19 | 20 | browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 21 | """Number representing how many times browser-based crawling was used.""" 22 | 23 | rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 24 | """Number representing how many times the predictor gave an incorrect prediction.""" 25 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_basic/__init__.py: -------------------------------------------------------------------------------- 1 | from ._basic_crawler import BasicCrawler, BasicCrawlerOptions 2 | from ._basic_crawling_context import BasicCrawlingContext 3 | from ._context_pipeline import ContextPipeline 4 | 5 | __all__ = [ 6 | 'BasicCrawler', 7 | 'BasicCrawlerOptions', 8 | 'BasicCrawlingContext', 9 | 'ContextPipeline', 10 | ] 11 | --------------------------------------------------------------------------------
/src/crawlee/crawlers/_basic/_basic_crawling_context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # Do just the re-export because of the circular imports. 4 | from crawlee._types import BasicCrawlingContext # noqa: F401 5 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_basic/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_basic/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'BeautifulSoupCrawler'): 9 | from ._beautifulsoup_crawler import BeautifulSoupCrawler 10 | with _try_import(__name__, 'BeautifulSoupCrawlingContext'): 11 | from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext 12 | with _try_import(__name__, 'BeautifulSoupParserType'): 13 | from ._beautifulsoup_parser import BeautifulSoupParserType 14 | 15 | __all__ = [ 16 | 'BeautifulSoupCrawler', 17 | 'BeautifulSoupCrawlingContext', 18 | 'BeautifulSoupParserType', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, fields 2 | 3 | from bs4 import BeautifulSoup 4 | from typing_extensions import Self 5 | 6 | from crawlee._utils.docs import docs_group 7 | from crawlee.crawlers import ParsedHttpCrawlingContext 8 | 9 | from ._utils import html_to_text 10 | 11 | 12 | @dataclass(frozen=True) 13 | @docs_group('Data structures') 14 | class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): 15 | """The crawling context used by the `BeautifulSoupCrawler`. 16 | 17 | It provides access to key objects as well as utility functions for handling crawling tasks. 
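    A typical request handler reads the parsed page through the `soup` property (for example
    `context.soup.title.string`) and stores results with `await context.push_data(...)`.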
18 | """ 19 | 20 | @property 21 | def soup(self) -> BeautifulSoup: 22 | """Convenience alias.""" 23 | return self.parsed_content 24 | 25 | @classmethod 26 | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: 27 | """Initialize a new instance from an existing `ParsedHttpCrawlingContext`.""" 28 | return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) 29 | 30 | def html_to_text(self) -> str: 31 | """Convert the parsed HTML content to newline-separated plain text without tags.""" 32 | return html_to_text(self.parsed_content) 33 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_beautifulsoup/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_http/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext 2 | from crawlee.http_clients import HttpCrawlingResult 3 | 4 | from ._http_crawler import HttpCrawler 5 | 6 | __all__ = [ 7 | 'HttpCrawler', 8 | 'HttpCrawlingContext', 9 | 'HttpCrawlingResult', 10 | ] 11 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_parsel/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'ParselCrawler'): 9 | from ._parsel_crawler import ParselCrawler 10 | with _try_import(__name__, 'ParselCrawlingContext'): 11 | from ._parsel_crawling_context import ParselCrawlingContext 12 | 13 | __all__ = [ 14 | 'ParselCrawler', 15 | 'ParselCrawlingContext', 16 | ] 17 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_parsel/_parsel_crawling_context.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, fields 2 | 3 | from parsel import Selector 4 | from typing_extensions import Self 5 | 6 | from crawlee._utils.docs import docs_group 7 | from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext 8 | 9 | from ._utils import html_to_text 10 | 11 | 12 | @dataclass(frozen=True) 13 | @docs_group('Data structures') 14 | class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]): 15 | """The crawling context used by the `ParselCrawler`. 16 | 17 | It provides access to key objects as well as utility functions for handling crawling tasks. 
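    A typical request handler queries the page through the `selector` property (for example
    `context.selector.xpath('//title/text()').get()`) and stores results with `await context.push_data(...)`.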
18 | """ 19 | 20 | @property 21 | def selector(self) -> Selector: 22 | """Convenience alias.""" 23 | return self.parsed_content 24 | 25 | @classmethod 26 | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self: 27 | """Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`.""" 28 | return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) 29 | 30 | def html_to_text(self) -> str: 31 | """Convert the parsed HTML content to newline-separated plain text without tags.""" 32 | return html_to_text(self.parsed_content) 33 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'PlaywrightCrawler'): 9 | from ._playwright_crawler import PlaywrightCrawler 10 | with _try_import(__name__, 'PlaywrightCrawlingContext'): 11 | from ._playwright_crawling_context import PlaywrightCrawlingContext 12 | with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'): 13 | from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext 14 | 15 | __all__ = [ 16 | 'PlaywrightCrawler', 17 | 'PlaywrightCrawlingContext', 18 | 'PlaywrightPreNavCrawlingContext', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass(frozen=True) 7 | class BlockedInfo: 8 | """Information about whether the crawling is blocked. 
If reason is empty, then it means it is not blocked.""" 9 | 10 | reason: str 11 | 12 | def __bool__(self) -> bool: 13 | """No reason means no blocking.""" 14 | return bool(self.reason) 15 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/py.typed -------------------------------------------------------------------------------- /src/crawlee/events/__init__.py: -------------------------------------------------------------------------------- 1 | from ._event_manager import EventManager 2 | from ._local_event_manager import LocalEventManager 3 | from ._types import ( 4 | Event, 5 | EventAbortingData, 6 | EventData, 7 | EventExitData, 8 | EventListener, 9 | EventMigratingData, 10 | EventPersistStateData, 11 | EventSystemInfoData, 12 | ) 13 | 14 | __all__ = [ 15 | 'Event', 16 | 'EventAbortingData', 17 | 'EventData', 18 | 'EventExitData', 19 | 'EventListener', 20 | 'EventManager', 21 | 'EventMigratingData', 22 | 'EventPersistStateData', 23 | 'EventSystemInfoData', 24 | 'LocalEventManager', 25 | ] 26 | -------------------------------------------------------------------------------- /src/crawlee/events/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/events/py.typed -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402, TID252 2 | 3 | # Due to patch_browserforge 4 | from .._browserforge_workaround import patch_browserforge 5 | 6 | patch_browserforge() 7 | 8 | from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator 9 | from ._fingerprint_generator import FingerprintGenerator 10 | from ._header_generator import HeaderGenerator 11 | from ._types import HeaderGeneratorOptions, ScreenOptions 12 | 13 | __all__ = [ 14 | 'DefaultFingerprintGenerator', 15 | 'FingerprintGenerator', 16 | 'HeaderGenerator', 17 | 'HeaderGeneratorOptions', 18 | 'ScreenOptions', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/_consts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # ruff: noqa: E501 4 | 5 | COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9' 6 | 7 | BROWSER_TYPE_HEADER_KEYWORD = { 8 | 'chromium': {'Chrome', 'CriOS'}, 9 | 'firefox': {'Firefox', 'FxiOS'}, 10 | 'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'}, 11 | 'webkit': {'Safari'}, 12 | } 13 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/_fingerprint_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | from crawlee._utils.docs import docs_group 7 | 8 | if TYPE_CHECKING: 9 | from browserforge.fingerprints import Fingerprint 10 | 11 | 12 | @docs_group('Abstract classes') 13 | class FingerprintGenerator(ABC): 14 | """A class for creating browser fingerprints that mimic 
browser fingerprints of real users.""" 15 | 16 | @abstractmethod 17 | def generate(self) -> Fingerprint: 18 | """Generate browser fingerprints. 19 | 20 | This is experimental feature. 21 | Return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely 22 | it will change to custom `Fingerprint` class defined in this repo later. 23 | """ 24 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/fingerprint_suite/py.typed -------------------------------------------------------------------------------- /src/crawlee/http_clients/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | # These imports have only mandatory dependencies, so they are imported directly. 5 | from ._base import HttpClient, HttpCrawlingResult, HttpResponse 6 | from ._httpx import HttpxHttpClient 7 | 8 | _install_import_hook(__name__) 9 | 10 | # The following imports are wrapped in try_import to handle optional dependencies, 11 | # ensuring the module can still function even if these dependencies are missing. 12 | with _try_import(__name__, 'CurlImpersonateHttpClient'): 13 | from ._curl_impersonate import CurlImpersonateHttpClient 14 | 15 | 16 | __all__ = [ 17 | 'CurlImpersonateHttpClient', 18 | 'HttpClient', 19 | 'HttpCrawlingResult', 20 | 'HttpResponse', 21 | 'HttpxHttpClient', 22 | ] 23 | -------------------------------------------------------------------------------- /src/crawlee/project_template/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "crawlee-python-project", 3 | "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", 4 | "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"], 5 | "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", 6 | "http_client": ["httpx", "curl-impersonate"], 7 | "package_manager": ["poetry", "pip", "uv", "manual"], 8 | "enable_apify_integration": false, 9 | "start_url": "https://crawlee.dev", 10 | "_jinja2_env_vars": { 11 | "line_statement_prefix": "# %" 12 | }, 13 | "_extensions": ["jinja2.ext.do"] 14 | } 15 | -------------------------------------------------------------------------------- /src/crawlee/project_template/hooks/pre_gen_project.py: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.package_manager in ['poetry', 'uv'] 2 | import subprocess 3 | import shutil 4 | import re 5 | import sys 6 | 7 | manager = "{{cookiecutter.package_manager}}" 8 | manager_text = manager.title() 9 | # % if cookiecutter.package_manager == 'poetry' 10 | version_regex = r'Poetry \(version 2\..*\)' 11 | r_version = '2.x' 12 | # % elif cookiecutter.package_manager == 'uv' 13 | version_regex = r'uv (0\..*)' 14 | r_version = '0.x' 15 | # % endif 16 | 17 | # Check if package manager is available in PATH 18 | if not shutil.which(manager): 19 | sys.stderr.write(f'\nError: You selected {manager_text} as your package manager, but it is not installed. 
Please install it and try again.\n') 20 | sys.exit(1) 21 | 22 | # Check if the package manager is executable 23 | try: 24 | version = subprocess.check_output([manager, '--version']).decode().strip() 25 | except OSError: 26 | sys.stderr.write(f'\nError: Your selected package manager {manager_text} was found but failed to execute.\n') 27 | sys.exit(1) 28 | 29 | # Check if the version matches the required regex 30 | if not re.match(version_regex, version): 31 | sys.stderr.write(f'\nError: Your selected package manager {manager_text} requires version {r_version}, but {version} is installed.\n') 32 | sys.exit(1) 33 | # % endif 34 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = BeautifulSoupCrawler( 9 | request_handler=router, 10 | max_requests_per_crawl=10, 11 | {{ self.http_client_instantiation() }}) 12 | # % endblock 13 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_parsel.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import ParselCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = ParselCrawler( 9 | request_handler=router, 10 | max_requests_per_crawl=10, 11 | {{ self.http_client_instantiation() }}) 12 | # % endblock 13 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_playwright.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import PlaywrightCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = PlaywrightCrawler( 9 | request_handler=router, 10 | headless=True, 11 | max_requests_per_crawl=10, 12 | {{ self.http_client_instantiation() }}) 13 | # % endblock 14 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import BeautifulSoupCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[BeautifulSoupCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = context.soup.find('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': title.text if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_camoufox.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 
| """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_parsel.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import ParselCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[ParselCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: ParselCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = context.selector.xpath('//title/text()').get() 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': title, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_playwright.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_playwright_camoufox.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 2 | # % set extras = ['playwright'] 3 | # % else 4 | # % set extras = [cookiecutter.crawler_type] 5 | # % endif 6 | # % if cookiecutter.http_client == 'curl-impersonate' 7 | # % do 
extras.append('curl-impersonate') 8 | # % endif 9 | 10 | [project] 11 | name = "{{cookiecutter.project_name}}" 12 | version = "0.0.1" 13 | description = "" 14 | authors = [ 15 | {name = "Your Name",email = "you@example.com"} 16 | ] 17 | readme = "README.md" 18 | requires-python = ">=3.9,<4.0" 19 | dependencies = [ 20 | "crawlee[{{ extras|join(',') }}]", 21 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 22 | "camoufox[geoip]~=0.4.5", 23 | # % endif 24 | # % if cookiecutter.enable_apify_integration 25 | "apify", 26 | # % endif 27 | ] 28 | 29 | # % if cookiecutter.package_manager == 'poetry' 30 | [tool.poetry] 31 | package-mode = false 32 | 33 | [build-system] 34 | requires = ["poetry-core>=2.0.0,<3.0.0"] 35 | build-backend = "poetry.core.masonry.api" 36 | # % endif 37 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 2 | camoufox[geoip]~=0.4.5 3 | # % set extras = ['playwright'] 4 | # % else 5 | # % set extras = [cookiecutter.crawler_type] 6 | # % endif 7 | # % if cookiecutter.enable_apify_integration 8 | apify 9 | # % endif 10 | # % if cookiecutter.http_client == 'curl-impersonate' 11 | # % do extras.append('curl-impersonate') 12 | # % endif 13 | crawlee[{{ extras | join(',') }}] 14 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | # % if cookiecutter.http_client == 'curl-impersonate' 3 | import platform 4 | # % if 'playwright' in cookiecutter.crawler_type 5 | import warnings 6 | # % endif 7 | # % endif 8 | {{ '' }} 9 | from .main import main 10 | 11 | if __name__ == '__main__': 12 | # % if cookiecutter.http_client == 'curl-impersonate' 13 | if platform.system() == 'Windows': 14 | # This mitigates a warning raised by curl-cffi. 15 | # % if 'playwright' in cookiecutter.crawler_type 16 | warnings.warn( 17 | message=('curl-cffi suggests using WindowsSelectorEventLoopPolicy, but this conflicts with Playwright. 
' 18 | 'Ignore the curl-cffi warning.'), 19 | category=UserWarning, 20 | stacklevel=2, 21 | ) 22 | # % else 23 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 24 | # % endif 25 | # % endif 26 | {{ '' }} 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py: -------------------------------------------------------------------------------- 1 | # % include 'main_%s.py' % cookiecutter.__crawler_type 2 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py: -------------------------------------------------------------------------------- 1 | # % include 'routes_%s.py' % cookiecutter.__crawler_type 2 | -------------------------------------------------------------------------------- /src/crawlee/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/py.typed -------------------------------------------------------------------------------- /src/crawlee/request_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from ._request_list import RequestList 2 | from ._request_loader import RequestLoader 3 | from ._request_manager import RequestManager 4 | from ._request_manager_tandem import RequestManagerTandem 5 | 6 | __all__ = [ 7 | 'RequestList', 8 | 'RequestLoader', 9 | 'RequestManager', 10 | 'RequestManagerTandem', 11 | ] 12 | -------------------------------------------------------------------------------- /src/crawlee/sessions/__init__.py: -------------------------------------------------------------------------------- 1 | from ._cookies import CookieParam, SessionCookies 2 | from ._session import Session 3 | from ._session_pool import SessionPool 4 | 5 | __all__ = ['CookieParam', 'Session', 'SessionCookies', 'SessionPool'] 6 | -------------------------------------------------------------------------------- /src/crawlee/sessions/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/sessions/py.typed -------------------------------------------------------------------------------- /src/crawlee/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: A005 2 | 3 | from ._models import FinalStatistics, StatisticsState 4 | from ._statistics import Statistics 5 | 6 | __all__ = ['FinalStatistics', 'Statistics', 'StatisticsState'] 7 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/__init__.py: -------------------------------------------------------------------------------- 1 | from ._base import StorageClient 2 | from ._memory import MemoryStorageClient 3 | 4 | __all__ = ['MemoryStorageClient', 'StorageClient'] 5 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset_client import DatasetClient 2 | from ._dataset_collection_client import DatasetCollectionClient 3 | 
from ._key_value_store_client import KeyValueStoreClient 4 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 5 | from ._request_queue_client import RequestQueueClient 6 | from ._request_queue_collection_client import RequestQueueCollectionClient 7 | from ._storage_client import StorageClient 8 | from ._types import ResourceClient, ResourceCollectionClient 9 | 10 | __all__ = [ 11 | 'DatasetClient', 12 | 'DatasetCollectionClient', 13 | 'KeyValueStoreClient', 14 | 'KeyValueStoreCollectionClient', 15 | 'RequestQueueClient', 16 | 'RequestQueueCollectionClient', 17 | 'ResourceClient', 18 | 'ResourceCollectionClient', 19 | 'StorageClient', 20 | ] 21 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Union 4 | 5 | from ._dataset_client import DatasetClient 6 | from ._dataset_collection_client import DatasetCollectionClient 7 | from ._key_value_store_client import KeyValueStoreClient 8 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 9 | from ._request_queue_client import RequestQueueClient 10 | from ._request_queue_collection_client import RequestQueueCollectionClient 11 | 12 | ResourceClient = Union[ 13 | DatasetClient, 14 | KeyValueStoreClient, 15 | RequestQueueClient, 16 | ] 17 | 18 | ResourceCollectionClient = Union[ 19 | DatasetCollectionClient, 20 | KeyValueStoreCollectionClient, 21 | RequestQueueCollectionClient, 22 | ] 23 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/_base/py.typed -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_memory/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset_client import DatasetClient 2 | from ._dataset_collection_client import DatasetCollectionClient 3 | from ._key_value_store_client import KeyValueStoreClient 4 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 5 | from ._memory_storage_client import MemoryStorageClient 6 | from ._request_queue_client import RequestQueueClient 7 | from ._request_queue_collection_client import RequestQueueCollectionClient 8 | 9 | __all__ = [ 10 | 'DatasetClient', 11 | 'DatasetCollectionClient', 12 | 'KeyValueStoreClient', 13 | 'KeyValueStoreCollectionClient', 14 | 'MemoryStorageClient', 15 | 'RequestQueueClient', 16 | 'RequestQueueCollectionClient', 17 | ] 18 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_memory/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/_memory/py.typed -------------------------------------------------------------------------------- /src/crawlee/storage_clients/py.typed: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/py.typed -------------------------------------------------------------------------------- /src/crawlee/storages/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset import Dataset 2 | from ._key_value_store import KeyValueStore 3 | from ._request_queue import RequestQueue 4 | 5 | __all__ = [ 6 | 'Dataset', 7 | 'KeyValueStore', 8 | 'RequestQueue', 9 | ] 10 | -------------------------------------------------------------------------------- /src/crawlee/storages/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storages/py.typed -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/__init__.py -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/unit/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests 2 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/_statistics/test_periodic_logging.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import logging 5 | from datetime import timedelta 6 | from typing import TYPE_CHECKING 7 | 8 | from crawlee.statistics import Statistics 9 | 10 | if TYPE_CHECKING: 11 | import pytest 12 | 13 | 14 | async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None: 15 | caplog.set_level(logging.INFO) 16 | 17 | log_message = 'Periodic statistics XYZ' 18 | statistics = Statistics.with_default_state(log_interval=timedelta(milliseconds=50), log_message=log_message) 19 | 20 | async with statistics: 21 | await asyncio.sleep(0.1) 22 | 23 | matching_records = [rec for rec in caplog.records if rec.message.startswith(log_message)] 24 | assert len(matching_records) >= 1 25 | -------------------------------------------------------------------------------- /tests/unit/_statistics/test_persistence.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee.statistics import Statistics 4 | 5 | 6 | async def test_basic_persistence() -> None: 7 | key = 'statistics_foo' 8 | 9 | async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: 10 | statistics.state.requests_failed = 42 11 | 12 | async with Statistics.with_default_state(persistence_enabled=True, 
persist_state_key=key) as statistics: 13 | pass 14 | 15 | assert statistics.state.requests_failed == 42 16 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_console.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.console import make_table 4 | 5 | 6 | def test_empty_input() -> None: 7 | assert make_table([]) == '' 8 | 9 | 10 | def test_empty_row() -> None: 11 | assert make_table([()]) == '' 12 | 13 | 14 | def test_single_column() -> None: 15 | result = make_table([('test',)]) 16 | lines = result.split('\n') 17 | assert len(lines) == 3 18 | assert lines[1] == '│ test │' 19 | 20 | 21 | def test_two_columns() -> None: 22 | data = [('Name', 'Age'), ('Alice', '30'), ('Bob', '25')] 23 | result = make_table(data) 24 | lines = result.split('\n') 25 | # fmt: off 26 | assert lines == ['┌───────┬─────┐', 27 | '│ Name │ Age │', 28 | '│ Alice │ 30 │', 29 | '│ Bob │ 25 │', 30 | '└───────┴─────┘'] 31 | # fmt: on 32 | 33 | 34 | def test_long_content_truncation() -> None: 35 | data = [('Short', 'VeryVeryVeryLongContent')] 36 | result = make_table(data, width=25) 37 | lines = result.split('\n') 38 | # fmt: off 39 | assert lines == ['┌───────────┬───────────┐', 40 | '│ Short │ VeryVe... │', 41 | '└───────────┴───────────┘'] 42 | # fmt: on 43 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_globs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.globs import Glob 4 | 5 | 6 | def test_asterisk() -> None: 7 | glob = Glob('foo/*') 8 | assert glob.regexp.match('bar/') is None 9 | assert glob.regexp.match('foo/bar') is not None 10 | assert glob.regexp.match('foo/bar/baz') is None 11 | 12 | 13 | def test_double_asteritsk() -> None: 14 | glob = Glob('foo/**') 15 | assert glob.regexp.match('bar/') is None 16 | assert glob.regexp.match('foo/bar') is not None 17 | assert glob.regexp.match('foo/bar/baz') is not None 18 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_measure_time.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import time 5 | 6 | from crawlee._utils.measure_time import measure_time 7 | 8 | 9 | def test_measure_time_wall_sync() -> None: 10 | with measure_time() as elapsed: 11 | time.sleep(0.1) 12 | 13 | assert elapsed.cpu is not None 14 | assert elapsed.wall is not None 15 | assert elapsed.wall >= 0.09 16 | 17 | 18 | def test_measure_time_cpu_sync() -> None: 19 | with measure_time() as elapsed: 20 | start = time.time() 21 | acc = 0 22 | 23 | while time.time() - start < 0.1: 24 | acc += 1 25 | acc *= acc 26 | 27 | assert elapsed.cpu is not None 28 | assert elapsed.wall is not None 29 | assert elapsed.cpu >= 0.05 30 | 31 | 32 | async def test_measure_time_wall_async() -> None: 33 | with measure_time() as elapsed: 34 | await asyncio.sleep(0.1) 35 | 36 | assert elapsed.cpu is not None 37 | assert elapsed.wall is not None 38 | assert elapsed.wall >= 0.09 39 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.byte_size import 
ByteSize 4 | from crawlee._utils.system import get_cpu_info, get_memory_info 5 | 6 | 7 | def test_get_memory_info_returns_valid_values() -> None: 8 | memory_info = get_memory_info() 9 | 10 | assert ByteSize(0) < memory_info.total_size < ByteSize.from_tb(1) 11 | assert memory_info.current_size < memory_info.total_size 12 | 13 | 14 | def test_get_cpu_info_returns_valid_values() -> None: 15 | cpu_info = get_cpu_info() 16 | assert 0 <= cpu_info.used_ratio <= 1 17 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_timedelata_ms.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import timedelta 4 | from typing import Any 5 | 6 | import pytest 7 | from pydantic import BaseModel 8 | 9 | from crawlee._utils.models import timedelta_ms 10 | 11 | 12 | class _ModelWithTimedeltaMs(BaseModel): 13 | time_delta: timedelta_ms | None = None 14 | 15 | 16 | @pytest.mark.parametrize( 17 | ('time_delta_input', 'expected_time_delta', 'expected_model_dump_value'), 18 | [ 19 | (1.0, timedelta(milliseconds=1), 1), 20 | (1, timedelta(milliseconds=1), 1), 21 | ('1', timedelta(milliseconds=1), 1), 22 | (timedelta(milliseconds=1), timedelta(milliseconds=1), 1), 23 | (3.01, timedelta(microseconds=3010), 3), 24 | (3.5, timedelta(microseconds=3500), 4), 25 | (3.99, timedelta(microseconds=3990), 4), 26 | (None, None, None), 27 | (float('inf'), timedelta(days=999999999, seconds=3600 * 24 - 1, microseconds=999999), float('inf')), 28 | ], 29 | ) 30 | def test_model_with_timedelta_ms_input_types( 31 | time_delta_input: float | timedelta | Any | None, expected_time_delta: timedelta, expected_model_dump_value: int 32 | ) -> None: 33 | model = _ModelWithTimedeltaMs(time_delta=time_delta_input) 34 | assert model.time_delta == expected_time_delta 35 | assert model.model_dump() == {'time_delta': expected_model_dump_value} 36 | -------------------------------------------------------------------------------- /tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py -------------------------------------------------------------------------------- /tests/unit/events/test_local_event_manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from datetime import timedelta 5 | from functools import update_wrapper 6 | from typing import Any 7 | from unittest.mock import AsyncMock 8 | 9 | import pytest 10 | 11 | from crawlee.events import LocalEventManager 12 | from crawlee.events._types import Event, EventSystemInfoData 13 | 14 | 15 | @pytest.fixture 16 | def listener() -> AsyncMock: 17 | async def async_listener(payload: Any) -> None: 18 | pass 19 | 20 | al = AsyncMock() 21 | update_wrapper(al, async_listener) 22 | return al 23 | 24 | 25 | async def test_emit_system_info_event(listener: AsyncMock) -> None: 26 | async with LocalEventManager(system_info_interval=timedelta(milliseconds=50)) as event_manager: 27 | event_manager.on(event=Event.SYSTEM_INFO, listener=listener) 28 | await asyncio.sleep(0.2) 29 | 30 | assert listener.call_count >= 1 31 | assert isinstance(listener.call_args[0][0], 
EventSystemInfoData) 32 | -------------------------------------------------------------------------------- /tests/unit/server_endpoints.py: -------------------------------------------------------------------------------- 1 | # Test server response content for testing 2 | 3 | HELLO_WORLD = b"""\ 4 | <html><head> 5 | <title>Hello, world! 6 | 7 | 8 | """ 9 | 10 | START_ENQUEUE = b"""\ 11 | 12 | Hello 13 | 14 | 15 | Link 1 16 | Link 2 17 | """ 18 | 19 | SECONDARY_INDEX = b"""\ 20 | 21 | Hello 22 | 23 | 24 | Link 3 25 | Link 4 26 | """ 27 | 28 | INCAPSULA = b"""\ 29 | 30 | Hello 31 | 32 | 33 | 35 | """ 36 | 37 | GENERIC_RESPONSE = b"""\ 38 | 39 | Hello 40 | 41 | 42 | Insightful content 43 | """ 44 | 45 | 46 | ROBOTS_TXT = b"""\ 47 | User-agent: * 48 | Disallow: *deny_all/ 49 | Disallow: /page_ 50 | crawl-delay: 10 51 | 52 | User-agent: Googlebot 53 | Disallow: *deny_googlebot/ 54 | crawl-delay: 1 55 | 56 | user-agent: Mozilla 57 | crawl-delay: 2 58 | 59 | sitemap: http://not-exists.com/sitemap_1.xml 60 | sitemap: http://not-exists.com/sitemap_2.xml""" 61 | -------------------------------------------------------------------------------- /website/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "@apify/eslint-config-ts", 4 | "plugin:react/recommended", 5 | "plugin:react-hooks/recommended" 6 | ], 7 | "parserOptions": { 8 | "project": "./tsconfig.eslint.json", 9 | "ecmaFeatures": { 10 | "jsx": true 11 | }, 12 | "ecmaVersion": 2020 13 | }, 14 | "env": { 15 | "browser": true 16 | }, 17 | "settings": { 18 | "react": { 19 | "version": "detect" 20 | } 21 | }, 22 | "rules": { 23 | "quote-props": ["error", "consistent"], 24 | "no-void": 0 25 | }, 26 | "root": true 27 | } 28 | -------------------------------------------------------------------------------- /website/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | enableGlobalCache: true 3 | -------------------------------------------------------------------------------- /website/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /website/build_api_reference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate import shortcuts from the modules 4 | python generate_module_shortcuts.py 5 | -------------------------------------------------------------------------------- /website/patches/@docusaurus+core+3.4.0.patch: -------------------------------------------------------------------------------- 1 | diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 2 | index 903f8dc..b6b60bf 100644 3 | --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 4 | +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 5 | @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { 6 | window.scrollTo(0, 0); 7 | } 8 | else { 9 | - const id = decodeURIComponent(hash.substring(1)); 10 | - const element = document.getElementById(id); 11 | - element?.scrollIntoView(); 12 | + setTimeout(() => { 13 | + const id = decodeURIComponent(hash.substring(1)); 14 | + const element = 
document.getElementById(id); 15 | + element?.scrollIntoView(); 16 | + }, 100); 17 | } 18 | } 19 | function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { 20 | -------------------------------------------------------------------------------- /website/patches/@docusaurus+core+3.5.2.patch: -------------------------------------------------------------------------------- 1 | diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 2 | index 903f8dc..b6b60bf 100644 3 | --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 4 | +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 5 | @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { 6 | window.scrollTo(0, 0); 7 | } 8 | else { 9 | - const id = decodeURIComponent(hash.substring(1)); 10 | - const element = document.getElementById(id); 11 | - element?.scrollIntoView(); 12 | + setTimeout(() => { 13 | + const id = decodeURIComponent(hash.substring(1)); 14 | + const element = document.getElementById(id); 15 | + element?.scrollIntoView(); 16 | + }, 100); 17 | } 18 | } 19 | function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { 20 | -------------------------------------------------------------------------------- /website/roa-loader/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "roa-loader", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "keywords": [], 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "loader-utils": "^3.2.1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /website/src/components/ApiLink.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Link from '@docusaurus/Link'; 3 | // eslint-disable-next-line import/no-extraneous-dependencies 4 | import { useDocsVersion } from '@docusaurus/theme-common/internal'; 5 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 6 | 7 | // const pkg = require('../../../packages/crawlee/package.json'); 8 | // 9 | // const [v1, v2] = pkg.version.split('.'); 10 | // const stable = [v1, v2].join('.'); 11 | 12 | const ApiLink = ({ to, children }) => { 13 | return ( 14 | {children} 15 | ); 16 | 17 | // const version = useDocsVersion(); 18 | // const { siteConfig } = useDocusaurusContext(); 19 | // 20 | // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) { 21 | // if (siteConfig.presets[0][1].docs.disableVersioning) { 22 | // return ( 23 | // {children} 24 | // ); 25 | // } 26 | // 27 | // return ( 28 | // {children} 29 | // ); 30 | }; 31 | 32 | export default ApiLink; 33 | -------------------------------------------------------------------------------- /website/src/components/Button.jsx: -------------------------------------------------------------------------------- 1 | import Link from '@docusaurus/Link'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | import styles from './Button.module.css'; 6 | import CrawleeSvg from '../../static/img/crawlee-logo-monocolor.svg'; 7 | 8 | export default function Button({ children, to, withIcon, type = 'primary', className, isBig }) { 9 | return ( 10 | 11 | 18 
| {withIcon && } 19 | {children} 20 | 21 | 22 | ); 23 | } 24 | -------------------------------------------------------------------------------- /website/src/components/CopyButton.module.css: -------------------------------------------------------------------------------- 1 | .copyButton { 2 | all: unset; 3 | display: inline-flex; 4 | align-items: center; 5 | justify-content: center; 6 | box-sizing: border-box; 7 | cursor: pointer; 8 | fill: var(--color-icon); 9 | 10 | svg { 11 | flex-shrink: 0; 12 | } 13 | } 14 | 15 | .copyButtonDefault { 16 | width: 28px; 17 | height: 28px; 18 | background-color: var(--color-background-muted); 19 | border: 1px solid var(--color-border); 20 | border-radius: 6px; 21 | transition: background-color 0.12s ease-out; 22 | 23 | &:hover { 24 | background-color: var(--color-hover); 25 | } 26 | 27 | svg { 28 | padding: 1px; 29 | } 30 | } 31 | 32 | .copyButtonCompact { 33 | svg { 34 | width: 16px; 35 | height: 16px; 36 | } 37 | } -------------------------------------------------------------------------------- /website/src/components/Gradients.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | export default function Gradients() { 4 | return ( 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | ); 20 | } 21 | -------------------------------------------------------------------------------- /website/src/components/Highlights.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | width: 100%; 5 | font-size: 18px; 6 | line-height: 32px; 7 | color: #41465d; 8 | } 9 | 10 | html[data-theme="dark"] .features { 11 | color: #b3b8d2; 12 | } 13 | 14 | .feature svg { 15 | height: 60px; 16 | width: 60px; 17 | } 18 | 19 | .features svg path:nth-child(1) { 20 | fill: url(#gradient-1) !important; 21 | } 22 | 23 | .features svg path:nth-child(n + 1) { 24 | fill: url(#gradient-2) !important; 25 | } 26 | 27 | html[data-theme="dark"] .featureIcon { 28 | background: #272c3d; 29 | } 30 | 31 | .featureIcon { 32 | display: flex; 33 | justify-content: center; 34 | align-items: center; 35 | margin-bottom: 24px; 36 | border-radius: 8px; 37 | background-color: #f2f3fb; 38 | width: 48px; 39 | height: 48px; 40 | } 41 | 42 | .features h3 { 43 | font-weight: 700; 44 | font-size: 18px; 45 | line-height: 32px; 46 | } 47 | -------------------------------------------------------------------------------- /website/src/components/Homepage/HomepageCliExample.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | import CopyButton from '../CopyButton'; 4 | import styles from './HomepageCliExample.module.css'; 5 | 6 | const cliCommand = `pipx run 'crawlee[cli]' create my-crawler`; 7 | 8 | export default function CliExample() { 9 | return ( 10 |
11 |
12 | Or start with a template from our CLI 13 |
14 | 15 |
16 |                     $
17 |                     {cliCommand}
18 |                     
19 |                 
20 |
21 |
22 | Built with 🤍 by Apify. Forever free and open-source. 23 |
24 |
25 | ); 26 | } 27 | -------------------------------------------------------------------------------- /website/src/components/Homepage/HomepageHeroSection.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | import styles from './HomepageHeroSection.module.css'; 4 | import homepageStyles from '../../pages/index.module.css'; 5 | 6 | export default function HomepageHeroSection() { 7 | return ( 8 |
9 |

10 | Build reliable web scrapers. Fast. 11 |

12 |
16 |

17 | Crawlee is a web scraping library for JavaScript and Python. It 18 | handles blocking, crawling, proxies, and browsers for you. 19 |

20 |
24 |
28 |
29 |
30 | ); 31 | } 32 | -------------------------------------------------------------------------------- /website/src/components/Homepage/LanguageSwitch.module.css: -------------------------------------------------------------------------------- 1 | .languageSwitch { 2 | z-index: 1; 3 | display: inline-flex; 4 | position: relative; 5 | background-color: var(--color-background-subtle); 6 | border-radius: 6px; 7 | padding: 4px; 8 | } 9 | 10 | .switchOption { 11 | position: relative; 12 | z-index: 1; 13 | padding: 6px 16px; 14 | font-size: 14px; 15 | font-weight: 500; 16 | color: var(--color-text-muted); 17 | background: none; 18 | border: none; 19 | cursor: pointer; 20 | transition: color 0.3s ease; 21 | } 22 | 23 | .switchOption:hover { 24 | color: var(--color-text); 25 | } 26 | 27 | .switchOption.active { 28 | color: var(--color-text); 29 | } 30 | 31 | .switchBackground { 32 | position: absolute; 33 | top: 4px; 34 | bottom: 4px; 35 | left: 0; 36 | border-radius: 6px; 37 | background-color: var(--color-background); 38 | transition: 39 | transform 0.3s ease, 40 | width 0.3s ease; 41 | } 42 | -------------------------------------------------------------------------------- /website/src/components/Homepage/RiverSection.jsx: -------------------------------------------------------------------------------- 1 | import Link from '@docusaurus/Link'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | import styles from './RiverSection.module.css'; 6 | 7 | export default function RiverSection({ title, description, content, reversed, to }) { 8 | return ( 9 |
10 |
11 |
12 |

{title}

13 |

{description}

14 | 15 | Learn more 16 | 17 |
18 |
{content}
19 |
20 |
21 | ); 22 | } 23 | -------------------------------------------------------------------------------- /website/src/components/RunnableCodeBlock.module.css: -------------------------------------------------------------------------------- 1 | .button { 2 | display: inline-block; 3 | padding: 3px 10px; 4 | position: absolute; 5 | top: calc(var(--ifm-pre-padding) / 2); 6 | right: 9px; 7 | z-index: 1; 8 | font-size: 16px; 9 | line-height: 28px; 10 | background: var(--prism-background-color); 11 | color: var(--prism-color); 12 | border: 1px solid var(--ifm-color-emphasis-300); 13 | border-radius: var(--ifm-global-radius); 14 | opacity: 0.7; 15 | font-weight: 600; 16 | width: 155px; 17 | } 18 | 19 | @media screen and (max-width: 768px) { 20 | .button { 21 | display: none; 22 | } 23 | } 24 | 25 | .button svg { 26 | height: 20px; 27 | position: absolute; 28 | top: 7.5px; 29 | right: 0; 30 | } 31 | 32 | .button:hover { 33 | opacity: 1; 34 | color: var(--prism-color); 35 | } 36 | 37 | .container { 38 | position: relative; 39 | } 40 | -------------------------------------------------------------------------------- /website/src/theme/ColorModeToggle/light-mode-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/theme/ColorModeToggle/styles.module.css: -------------------------------------------------------------------------------- 1 | .toggleButton { 2 | padding: 4px; 3 | display: flex; 4 | gap: 4px; 5 | align-items: center; 6 | transition: all var(--ifm-transition-fast); 7 | position: relative; 8 | border-radius: 150px; 9 | background-color: var(--color-background-subtle); 10 | } 11 | 12 | .toggleButton span { 13 | width: 44px; 14 | height: 36px; 15 | border-radius: 50%; 16 | background: #fff; 17 | position: absolute; 18 | transition: all var(--ifm-transition-fast); 19 | left: 0; 20 | margin: 4px; 21 | 22 | border-radius: 150px; 23 | background-color: var(--color-background); 24 | 25 | /* Light/L1 */ 26 | box-shadow: 27 | 0px 0.5px 1.5px 0px rgba(63, 71, 93, 0.15), 28 | 0.4px 0.8px 1px -1.2px rgba(63, 71, 93, 0.14), 29 | 1px 2px 2.5px -2.5px rgba(63, 71, 93, 0.13); 30 | } 31 | 32 | .toggleButton svg { 33 | z-index: 1; 34 | margin: 8px 12px; 35 | width: 20px; 36 | height: 20px; 37 | path { 38 | stroke: var(--color-icon); 39 | } 40 | } 41 | 42 | [data-theme='dark'] .toggleButton span { 43 | left: 48px; 44 | } 45 | 46 | .toggleButtonDisabled { 47 | cursor: not-allowed; 48 | } 49 | -------------------------------------------------------------------------------- /website/src/theme/DocItem/Layout/styles.module.css: -------------------------------------------------------------------------------- 1 | .docItemContainer { 2 | margin-bottom: 50px; 3 | } 4 | 5 | .docItemContainer header + *, 6 | .docItemContainer article > *:first-child { 7 | margin-top: 0; 8 | } 9 | 10 | @media (min-width: 997px) { 11 | .docItemCol { 12 | max-width: 75% !important; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /website/src/theme/Footer/LinkItem/index.js: -------------------------------------------------------------------------------- 1 | import isInternalUrl from '@docusaurus/isInternalUrl'; 2 | import Link from '@docusaurus/Link'; 3 | import useBaseUrl from '@docusaurus/useBaseUrl'; 4 | import clsx from 'clsx'; 5 | import React from 'react'; 6 | 7 | import styles from './index.module.css'; 8 | 9 | export default function 
FooterLinkItem({ item }) { 10 | const ExternalLinkIcon = require('../../../../static/img/external-link.svg').default; 11 | 12 | const { to, href, label, prependBaseUrlToHref, className, ...props } = item; 13 | const toUrl = useBaseUrl(to); 14 | const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true }); 15 | 16 | return ( 17 | 27 | {label} 28 | {href && !isInternalUrl(href) && } 29 | 30 | ); 31 | } 32 | -------------------------------------------------------------------------------- /website/src/theme/Footer/LinkItem/index.module.css: -------------------------------------------------------------------------------- 1 | .footerLink { 2 | color: var(--color-text); 3 | cursor: pointer; 4 | font-size: 14px; 5 | line-height: 20px; 6 | &:hover { 7 | color: var(--color-text-subtle); 8 | path { 9 | fill: var(--color-text-subtle); 10 | } 11 | } 12 | } 13 | 14 | .externalLinkIcon { 15 | margin-left: 5px; 16 | path { 17 | fill: var(--color-text); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /website/src/theme/MDXComponents/A.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable react/prop-types */ 2 | import Link from '@docusaurus/Link'; 3 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 4 | import React from 'react'; 5 | 6 | export default function MDXA(props) { 7 | const { siteConfig } = useDocusaurusContext(); 8 | if (props.href?.startsWith(siteConfig.url)) { 9 | const { href, ...rest } = props; 10 | rest.to = props.href.replace(siteConfig.url + siteConfig.baseUrl, '/'); 11 | props = rest; 12 | } 13 | 14 | return ; 15 | } 16 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/Content/styles.module.css: -------------------------------------------------------------------------------- 1 | .navbarItems { 2 | display: flex; 3 | align-items: center; 4 | margin-inline: auto; 5 | gap: 16px; 6 | } 7 | 8 | .navbarItems__leftMargin { 9 | margin-left: 40px; 10 | } 11 | 12 | .getStartedButton { 13 | color: var(--color-text-on-primary); 14 | background: var(--color-black-action); 15 | border-radius: 8px; 16 | font-size: 16px; 17 | font-weight: 500; 18 | line-height: 24px; 19 | padding: 8px 16px !important; 20 | border: none; 21 | transition: background-color 0.2s; 22 | 23 | &:hover { 24 | color: var(--color-text-on-primary); 25 | background-color: var(--color-primary-action-hover); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/Header/index.module.css: -------------------------------------------------------------------------------- 1 | .getStartedButton { 2 | color: var(--color-text-on-primary); 3 | background: var(--color-black-action); 4 | border-radius: 8px; 5 | font-size: 16px; 6 | font-weight: 500; 7 | line-height: 24px; 8 | padding: 8px 16px !important; 9 | border: none; 10 | &:hover { 11 | color: var(--color-text-on-primary); 12 | } 13 | text-align: center; 14 | } 15 | 16 | .navbarHeader { 17 | display: flex; 18 | width: 100%; 19 | align-items: center; 20 | justify-content: space-between; 21 | padding: 16px; 22 | 23 | @media (min-width: 768px) { 24 | padding: 20px 40px; 25 | } 26 | @media (min-width: 1024px) { 27 | padding: 20px 64px; 28 | } 29 | } 30 | 31 | .navbarButtonsWrapper { 32 | display: flex; 33 | gap: 16px; 34 | margin-left: auto; 35 | } 36 | 37 | .navbarButtonsWrapperDesktop { 38 | display: flex; 39 | @media 
(max-width: 767px) { 40 | display: none; 41 | } 42 | } 43 | .navbarButtonsWrapperMobile { 44 | border-top: 1px solid var(--color-separator); 45 | display: none; 46 | @media (max-width: 767px) { 47 | display: flex; 48 | } 49 | width: 100%; 50 | margin: 0; 51 | flex-direction: column; 52 | gap: 16px; 53 | button { 54 | width: 100%; 55 | } 56 | padding: 16px 24px; 57 | } 58 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/Layout/index.js: -------------------------------------------------------------------------------- 1 | import { useNavbarSecondaryMenu } from '@docusaurus/theme-common/internal'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | export default function NavbarMobileSidebarLayout({ 6 | header, 7 | primaryMenu, 8 | secondaryMenu, 9 | }) { 10 | const { shown: secondaryMenuShown } = useNavbarSecondaryMenu(); 11 | return ( 12 |
13 | {header} 14 |
18 |
{primaryMenu}
19 |
{secondaryMenu}
20 |
21 |
22 | ); 23 | } 24 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js: -------------------------------------------------------------------------------- 1 | import { useThemeConfig } from '@docusaurus/theme-common'; 2 | import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal'; 3 | import NavbarItem from '@theme/NavbarItem'; 4 | import React from 'react'; 5 | 6 | function useNavbarItems() { 7 | return useThemeConfig().navbar.items; 8 | } 9 | // The primary menu displays the navbar items 10 | export default function NavbarMobilePrimaryMenu() { 11 | const mobileSidebar = useNavbarMobileSidebar(); 12 | const items = useNavbarItems(); 13 | 14 | return ( 15 |
    16 | {items.map((item, i) => ( 17 | mobileSidebar.toggle()} 21 | key={i} 22 | /> 23 | ))} 24 |
25 | ); 26 | } 27 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/index.js: -------------------------------------------------------------------------------- 1 | import { 2 | useLockBodyScroll, 3 | useNavbarMobileSidebar, 4 | useWindowSize, 5 | } from '@docusaurus/theme-common/internal'; 6 | import NavbarMobileSidebarHeader from '@theme/Navbar/MobileSidebar/Header'; 7 | import NavbarMobileSidebarLayout from '@theme/Navbar/MobileSidebar/Layout'; 8 | import NavbarMobileSidebarPrimaryMenu from '@theme/Navbar/MobileSidebar/PrimaryMenu'; 9 | import NavbarMobileSidebarSecondaryMenu from '@theme/Navbar/MobileSidebar/SecondaryMenu'; 10 | import React from 'react'; 11 | 12 | export default function NavbarMobileSidebar() { 13 | const mobileSidebar = useNavbarMobileSidebar(); 14 | const windowSize = useWindowSize({ 15 | desktopBreakpoint: 1200, 16 | }); 17 | 18 | useLockBodyScroll(mobileSidebar.shown); 19 | const shouldRender = !mobileSidebar.disabled && windowSize === 'mobile'; 20 | if (!shouldRender) { 21 | return null; 22 | } 23 | return ( 24 | } 26 | primaryMenu={} 27 | secondaryMenu={} 28 | /> 29 | ); 30 | } 31 | -------------------------------------------------------------------------------- /website/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/.nojekyll -------------------------------------------------------------------------------- /website/static/font/lota.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/font/lota.woff -------------------------------------------------------------------------------- /website/static/font/lota.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/font/lota.woff2 -------------------------------------------------------------------------------- /website/static/img/API.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/API.png -------------------------------------------------------------------------------- /website/static/img/apify_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/apify_og_SDK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/apify_og_SDK.png -------------------------------------------------------------------------------- /website/static/img/arrow_right.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 8 | -------------------------------------------------------------------------------- /website/static/img/auto-scaling-dark.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/auto-scaling-dark.webp -------------------------------------------------------------------------------- /website/static/img/auto-scaling-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/auto-scaling-light.webp -------------------------------------------------------------------------------- /website/static/img/check.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/chrome-scrape-dark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/chrome-scrape-dark.gif -------------------------------------------------------------------------------- /website/static/img/chrome-scrape-light.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/chrome-scrape-light.gif -------------------------------------------------------------------------------- /website/static/img/cloud_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/static/img/community-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /website/static/img/community-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /website/static/img/crawlee-logo-monocolor.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /website/static/img/crawlee-python-og.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/crawlee-python-og.png -------------------------------------------------------------------------------- /website/static/img/defaults-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /website/static/img/defaults-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /website/static/img/discord-brand-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- 
/website/static/img/discord-brand.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/external-link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favicon.ico -------------------------------------------------------------------------------- /website/static/img/favorite-tools-dark.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favorite-tools-dark.webp -------------------------------------------------------------------------------- /website/static/img/favorite-tools-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favorite-tools-light.webp -------------------------------------------------------------------------------- /website/static/img/features/automate-everything.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 9 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/static/img/features/node-requests.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /website/static/img/features/storage.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /website/static/img/features/works-everywhere.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/00.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/01.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/02.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/03.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/03.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/current-price.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/current-price.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/scraping-practice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/scraping-practice.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/select-an-element.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/select-an-element.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/selected-element.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/selected-element.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/sku.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/sku.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/title.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/title.jpg -------------------------------------------------------------------------------- /website/static/img/hearth copy.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/hearth.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/javascript_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/static/img/js_file.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /website/static/img/logo-blur.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/logo-blur.png -------------------------------------------------------------------------------- /website/static/img/menu-arrows.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/img/oss_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/oss_logo.png -------------------------------------------------------------------------------- /website/static/img/puppeteer-live-view-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/puppeteer-live-view-dashboard.png -------------------------------------------------------------------------------- /website/static/img/puppeteer-live-view-detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/puppeteer-live-view-detail.png -------------------------------------------------------------------------------- /website/static/img/resuming-paused-crawl/00.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/resuming-paused-crawl/00.webp -------------------------------------------------------------------------------- /website/static/img/resuming-paused-crawl/01.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/resuming-paused-crawl/01.webp -------------------------------------------------------------------------------- /website/static/img/robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/robot.png -------------------------------------------------------------------------------- /website/static/img/routing-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /website/static/img/routing-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /website/static/img/scraping-utils-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/scraping-utils-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/smart-proxy-dark.webp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/smart-proxy-dark.webp -------------------------------------------------------------------------------- /website/static/img/smart-proxy-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/smart-proxy-light.webp -------------------------------------------------------------------------------- /website/static/img/source_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/source_code.png -------------------------------------------------------------------------------- /website/static/img/system.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /website/static/img/workflow.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /website/static/img/zero-setup-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/img/zero-setup-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Sitemap: https://crawlee.dev/python/sitemap.xml 3 | -------------------------------------------------------------------------------- /website/tools/docs-prettier.config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @type {import('prettier').Options} 3 | */ 4 | module.exports = { 5 | parser: 'markdown', 6 | arrowParens: 'avoid', 7 | trailingComma: 'all', 8 | singleQuote: true, 9 | tabWidth: 4, 10 | printWidth: 150, 11 | proseWrap: 'always', 12 | }; 13 | -------------------------------------------------------------------------------- /website/tools/utils/externalLink.js: -------------------------------------------------------------------------------- 1 | const { parse } = require('url'); 2 | 3 | const visit = import('unist-util-visit').then((m) => m.visit); 4 | 5 | const internalUrls = ['crawlee.dev']; 6 | 7 | /** 8 | * @param {import('url').UrlWithStringQuery} href 9 | */ 10 | function isInternal(href) { 11 | return internalUrls.some( 12 | (internalUrl) => href.host === internalUrl 13 | || (!href.protocol && !href.host && (href.pathname || href.hash)), 14 | ); 15 | } 16 | 17 | /** 18 | * @type {import('unified').Plugin} 19 | */ 20 | exports.externalLinkProcessor = () => { 21 | return async (tree) => { 22 | (await visit)(tree, 'element', (node) => { 23 | if ( 24 | node.tagName === 'a' 25 | && node.properties 26 | && typeof node.properties.href === 'string' 27 | ) { 28 | const href = 
parse(node.properties.href); 29 | 30 | if (!isInternal(href)) { 31 | node.properties.target = '_blank'; 32 | node.properties.rel = 'noopener'; 33 | } else { 34 | node.properties.target = null; 35 | node.properties.rel = null; 36 | } 37 | } 38 | }); 39 | }; 40 | }; 41 | -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-dark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-dark.gif -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-dark.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-dark.mp4 -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-light.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-light.gif -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-light.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-light.mp4 -------------------------------------------------------------------------------- /website/tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "jsx": "preserve" 5 | }, 6 | "include": [ 7 | "src/**/*.js", 8 | "src/**/*.ts", 9 | "src/**/*.jsx", 10 | "src/**/*.tsx" 11 | ] 12 | } 13 | --------------------------------------------------------------------------------
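
Note on website/tools/utils/externalLink.js (listed above): externalLinkProcessor is a rehype-style plugin that walks the rendered HTML tree and adds target="_blank" / rel="noopener" to off-site anchors. The Docusaurus config that registers it is not part of this listing, so the snippet below is only a minimal sketch assuming the standard rehypePlugins option of the docs preset; the file path and export name come from the listing, everything else is illustrative.

    // docusaurus.config.js (illustrative sketch only; the real config is not included in this listing)
    const { externalLinkProcessor } = require('./tools/utils/externalLink');

    module.exports = {
        presets: [
            [
                '@docusaurus/preset-classic',
                {
                    docs: {
                        // Rehype plugins run over the generated HTML (hast) tree,
                        // which is where the 'element' nodes visited by the plugin live.
                        rehypePlugins: [externalLinkProcessor],
                    },
                },
            ],
        ],
    };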