├── .editorconfig ├── .github ├── CODEOWNERS ├── pull_request_template.md └── workflows │ ├── build_and_deploy_docs.yaml │ ├── check_pr_title.yaml │ ├── pre_release.yaml │ ├── release.yaml │ ├── run_code_checks.yaml │ ├── templates_e2e_tests.yaml │ └── update_new_issue.yaml ├── .gitignore ├── .markdownlint.yaml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── docs ├── deployment │ ├── apify_platform.mdx │ ├── code_examples │ │ ├── apify │ │ │ ├── crawler_as_actor_example.py │ │ │ ├── get_public_url.py │ │ │ ├── log_with_config_example.py │ │ │ ├── proxy_advanced_example.py │ │ │ └── proxy_example.py │ │ └── google │ │ │ ├── cloud_run_example.py │ │ │ └── google_example.py │ ├── google_cloud.mdx │ └── google_cloud_run.mdx ├── examples │ ├── add_data_to_dataset.mdx │ ├── beautifulsoup_crawler.mdx │ ├── capture_screenshot_using_playwright.mdx │ ├── capturing_page_snapshots_with_error_snapshotter.mdx │ ├── code_examples │ │ ├── adaptive_playwright_crawler.py │ │ ├── add_data_to_dataset_bs.py │ │ ├── add_data_to_dataset_dataset.py │ │ ├── add_data_to_dataset_pw.py │ │ ├── beautifulsoup_crawler.py │ │ ├── beautifulsoup_crawler_keep_alive.py │ │ ├── beautifulsoup_crawler_stop.py │ │ ├── capture_screenshot_using_playwright.py │ │ ├── configure_json_logging.py │ │ ├── crawl_all_links_on_website_bs.py │ │ ├── crawl_all_links_on_website_pw.py │ │ ├── crawl_multiple_urls_bs.py │ │ ├── crawl_multiple_urls_pw.py │ │ ├── crawl_specific_links_on_website_bs.py │ │ ├── crawl_specific_links_on_website_pw.py │ │ ├── crawl_website_with_relative_links_all_links.py │ │ ├── crawl_website_with_relative_links_same_domain.py │ │ ├── crawl_website_with_relative_links_same_hostname.py │ │ ├── crawl_website_with_relative_links_same_origin.py │ │ ├── export_entire_dataset_to_file_csv.py │ │ ├── export_entire_dataset_to_file_json.py │ │ ├── extract_and_add_specific_links_on_website_bs.py │ │ ├── extract_and_add_specific_links_on_website_pw.py │ │ ├── fill_and_submit_web_form_crawler.py │ │ ├── fill_and_submit_web_form_request.py │ │ ├── parsel_crawler.py │ │ ├── parsel_crawler_with_error_snapshotter.py │ │ ├── playwright_block_requests.py │ │ ├── playwright_crawler.py │ │ ├── playwright_crawler_with_camoufox.py │ │ ├── playwright_crawler_with_error_snapshotter.py │ │ ├── playwright_crawler_with_fingerprint_generator.py │ │ ├── respect_robots_on_skipped_request.py │ │ ├── respect_robots_txt_file.py │ │ └── resuming_paused_crawl.py │ ├── crawl_all_links_on_website.mdx │ ├── crawl_multiple_urls.mdx │ ├── crawl_specific_links_on_website.mdx │ ├── crawl_website_with_relative_links.mdx │ ├── crawler_keep_alive.mdx │ ├── crawler_stop.mdx │ ├── export_entire_dataset_to_file.mdx │ ├── fill_and_submit_web_form.mdx │ ├── json_logging.mdx │ ├── parsel_crawler.mdx │ ├── playwright_crawler.mdx │ ├── playwright_crawler_adaptive.mdx │ ├── playwright_crawler_with_block_requests.mdx │ ├── playwright_crawler_with_camoufox.mdx │ ├── playwright_crawler_with_fingerprint_generator.mdx │ ├── respect_robots_txt_file.mdx │ └── resuming_paused_crawl.mdx ├── guides │ ├── avoid_blocking.mdx │ ├── code_examples │ │ ├── avoid_blocking │ │ │ ├── default_fingerprint_generator_with_args.py │ │ │ └── playwright_with_fingerprint_generator.py │ │ ├── error_handling │ │ │ ├── change_handle_error_status.py │ │ │ ├── disable_retry.py │ │ │ └── handle_proxy_error.py │ │ ├── http_clients │ │ │ ├── curl_impersonate_example.py │ │ │ └── httpx_example.py │ │ ├── login_crawler │ │ │ ├── http_login.py │ │ │ 
└── playwright_login.py │ │ ├── playwright_crawler │ │ │ ├── browser_configuration_example.py │ │ │ ├── multiple_launch_example.py │ │ │ ├── plugin_browser_configuration_example.py │ │ │ └── pre_navigation_hook_example.py │ │ ├── playwright_crawler_adaptive │ │ │ ├── handler.py │ │ │ ├── init_beautifulsoup.py │ │ │ ├── init_parsel.py │ │ │ ├── init_prediction.py │ │ │ └── pre_nav_hooks.py │ │ ├── proxy_management │ │ │ ├── inspecting_bs_example.py │ │ │ ├── inspecting_pw_example.py │ │ │ ├── integration_bs_example.py │ │ │ ├── integration_pw_example.py │ │ │ ├── quick_start_example.py │ │ │ ├── session_bs_example.py │ │ │ ├── session_pw_example.py │ │ │ ├── tiers_bs_example.py │ │ │ └── tiers_pw_example.py │ │ ├── request_loaders │ │ │ ├── rl_basic_example.py │ │ │ ├── tandem_example.py │ │ │ └── tandem_example_explicit.py │ │ ├── running_in_web_server │ │ │ ├── __init__.py │ │ │ ├── crawler.py │ │ │ └── server.py │ │ ├── scaling_crawlers │ │ │ ├── max_tasks_per_minute_example.py │ │ │ └── min_and_max_concurrency_example.py │ │ ├── session_management │ │ │ ├── multi_sessions_http.py │ │ │ ├── one_session_http.py │ │ │ ├── sm_basic.py │ │ │ ├── sm_beautifulsoup.py │ │ │ ├── sm_http.py │ │ │ ├── sm_parsel.py │ │ │ ├── sm_playwright.py │ │ │ └── sm_standalone.py │ │ └── storages │ │ │ ├── cleaning_do_not_purge_example.py │ │ │ ├── cleaning_purge_explicitly_example.py │ │ │ ├── dataset_basic_example.py │ │ │ ├── dataset_with_crawler_example.py │ │ │ ├── dataset_with_crawler_explicit_example.py │ │ │ ├── helper_add_requests_example.py │ │ │ ├── helper_enqueue_links_example.py │ │ │ ├── kvs_basic_example.py │ │ │ ├── kvs_with_crawler_example.py │ │ │ ├── kvs_with_crawler_explicit_example.py │ │ │ ├── rq_basic_example.py │ │ │ ├── rq_with_crawler_example.py │ │ │ └── rq_with_crawler_explicit_example.py │ ├── crawler_login.mdx │ ├── error_handling.mdx │ ├── http_clients.mdx │ ├── http_crawlers.mdx │ ├── playwright_crawler.mdx │ ├── playwright_crawler_adaptive.mdx │ ├── proxy_management.mdx │ ├── request_loaders.mdx │ ├── running_in_web_server.mdx │ ├── scaling_crawlers.mdx │ ├── session_management.mdx │ └── storages.mdx ├── introduction │ ├── 01_setting_up.mdx │ ├── 02_first_crawler.mdx │ ├── 03_adding_more_urls.mdx │ ├── 04_real_world_project.mdx │ ├── 05_crawling.mdx │ ├── 06_scraping.mdx │ ├── 07_saving_data.mdx │ ├── 08_refactoring.mdx │ ├── 09_running_in_cloud.mdx │ ├── code_examples │ │ ├── 02_bs.py │ │ ├── 02_bs_better.py │ │ ├── 02_request_queue.py │ │ ├── 03_enqueue_strategy.py │ │ ├── 03_finding_new_links.py │ │ ├── 03_globs.py │ │ ├── 03_original_code.py │ │ ├── 03_transform_request.py │ │ ├── 04_sanity_check.py │ │ ├── 05_crawling_detail.py │ │ ├── 05_crawling_listing.py │ │ ├── 06_scraping.py │ │ ├── 07_final_code.py │ │ ├── 07_first_code.py │ │ ├── 08_main.py │ │ ├── 08_routes.py │ │ ├── 09_apify_sdk.py │ │ ├── __init__.py │ │ └── routes.py │ └── index.mdx ├── pyproject.toml ├── quick-start │ ├── code_examples │ │ ├── beautifulsoup_crawler_example.py │ │ ├── parsel_crawler_example.py │ │ ├── playwright_crawler_example.py │ │ └── playwright_crawler_headful_example.py │ └── index.mdx └── upgrading │ └── upgrading_to_v0x.md ├── pyproject.toml ├── renovate.json ├── src └── crawlee │ ├── __init__.py │ ├── _autoscaling │ ├── __init__.py │ ├── _types.py │ ├── autoscaled_pool.py │ ├── py.typed │ ├── snapshotter.py │ └── system_status.py │ ├── _browserforge_workaround.py │ ├── _cli.py │ ├── _consts.py │ ├── _log_config.py │ ├── _request.py │ ├── _service_locator.py │ ├── _types.py │ ├── 
_utils │ ├── __init__.py │ ├── blocked.py │ ├── byte_size.py │ ├── console.py │ ├── context.py │ ├── crypto.py │ ├── data_processing.py │ ├── docs.py │ ├── file.py │ ├── globs.py │ ├── html_to_text.py │ ├── measure_time.py │ ├── models.py │ ├── recoverable_state.py │ ├── recurring_task.py │ ├── requests.py │ ├── robots.py │ ├── system.py │ ├── try_import.py │ ├── urls.py │ ├── wait.py │ └── web.py │ ├── browsers │ ├── __init__.py │ ├── _browser_controller.py │ ├── _browser_plugin.py │ ├── _browser_pool.py │ ├── _playwright_browser.py │ ├── _playwright_browser_controller.py │ ├── _playwright_browser_plugin.py │ ├── _types.py │ └── py.typed │ ├── configuration.py │ ├── crawlers │ ├── __init__.py │ ├── _abstract_http │ │ ├── __init__.py │ │ ├── _abstract_http_crawler.py │ │ ├── _abstract_http_parser.py │ │ ├── _http_crawling_context.py │ │ └── py.typed │ ├── _adaptive_playwright │ │ ├── __init__.py │ │ ├── _adaptive_playwright_crawler.py │ │ ├── _adaptive_playwright_crawler_statistics.py │ │ ├── _adaptive_playwright_crawling_context.py │ │ ├── _rendering_type_predictor.py │ │ └── _result_comparator.py │ ├── _basic │ │ ├── __init__.py │ │ ├── _basic_crawler.py │ │ ├── _basic_crawling_context.py │ │ ├── _context_pipeline.py │ │ ├── _logging_utils.py │ │ └── py.typed │ ├── _beautifulsoup │ │ ├── __init__.py │ │ ├── _beautifulsoup_crawler.py │ │ ├── _beautifulsoup_crawling_context.py │ │ ├── _beautifulsoup_parser.py │ │ ├── _utils.py │ │ └── py.typed │ ├── _http │ │ ├── __init__.py │ │ ├── _http_crawler.py │ │ └── _http_parser.py │ ├── _parsel │ │ ├── __init__.py │ │ ├── _parsel_crawler.py │ │ ├── _parsel_crawling_context.py │ │ ├── _parsel_parser.py │ │ └── _utils.py │ ├── _playwright │ │ ├── __init__.py │ │ ├── _playwright_crawler.py │ │ ├── _playwright_crawling_context.py │ │ ├── _playwright_http_client.py │ │ ├── _playwright_pre_nav_crawling_context.py │ │ ├── _types.py │ │ └── _utils.py │ ├── _types.py │ └── py.typed │ ├── errors.py │ ├── events │ ├── __init__.py │ ├── _event_manager.py │ ├── _local_event_manager.py │ ├── _types.py │ └── py.typed │ ├── fingerprint_suite │ ├── __init__.py │ ├── _browserforge_adapter.py │ ├── _consts.py │ ├── _fingerprint_generator.py │ ├── _header_generator.py │ ├── _types.py │ └── py.typed │ ├── http_clients │ ├── __init__.py │ ├── _base.py │ ├── _curl_impersonate.py │ └── _httpx.py │ ├── project_template │ ├── cookiecutter.json │ ├── hooks │ │ ├── post_gen_project.py │ │ └── pre_gen_project.py │ ├── templates │ │ ├── main.py │ │ ├── main_beautifulsoup.py │ │ ├── main_parsel.py │ │ ├── main_playwright.py │ │ ├── main_playwright_camoufox.py │ │ ├── routes_beautifulsoup.py │ │ ├── routes_camoufox.py │ │ ├── routes_parsel.py │ │ ├── routes_playwright.py │ │ └── routes_playwright_camoufox.py │ └── {{cookiecutter.project_name}} │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ └── {{cookiecutter.__package_name}} │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── main.py │ │ └── routes.py │ ├── proxy_configuration.py │ ├── py.typed │ ├── request_loaders │ ├── __init__.py │ ├── _request_list.py │ ├── _request_loader.py │ ├── _request_manager.py │ └── _request_manager_tandem.py │ ├── router.py │ ├── sessions │ ├── __init__.py │ ├── _cookies.py │ ├── _models.py │ ├── _session.py │ ├── _session_pool.py │ └── py.typed │ ├── statistics │ ├── __init__.py │ ├── _error_snapshotter.py │ ├── _error_tracker.py │ ├── _models.py │ └── _statistics.py │ ├── storage_clients │ ├── __init__.py │ ├── _base │ │ ├── __init__.py 
│ │ ├── _dataset_client.py │ │ ├── _dataset_collection_client.py │ │ ├── _key_value_store_client.py │ │ ├── _key_value_store_collection_client.py │ │ ├── _request_queue_client.py │ │ ├── _request_queue_collection_client.py │ │ ├── _storage_client.py │ │ ├── _types.py │ │ └── py.typed │ ├── _memory │ │ ├── __init__.py │ │ ├── _creation_management.py │ │ ├── _dataset_client.py │ │ ├── _dataset_collection_client.py │ │ ├── _key_value_store_client.py │ │ ├── _key_value_store_collection_client.py │ │ ├── _memory_storage_client.py │ │ ├── _request_queue_client.py │ │ ├── _request_queue_collection_client.py │ │ └── py.typed │ ├── models.py │ └── py.typed │ └── storages │ ├── __init__.py │ ├── _base.py │ ├── _creation_management.py │ ├── _dataset.py │ ├── _key_value_store.py │ ├── _request_queue.py │ └── py.typed ├── tests ├── __init__.py ├── e2e │ ├── __init__.py │ ├── conftest.py │ └── project_template │ │ ├── test_static_crawlers_templates.py │ │ └── utils.py └── unit │ ├── README.md │ ├── __init__.py │ ├── _autoscaling │ ├── test_autoscaled_pool.py │ ├── test_snapshotter.py │ └── test_system_status.py │ ├── _statistics │ ├── test_error_tracker.py │ ├── test_periodic_logging.py │ └── test_persistence.py │ ├── _utils │ ├── test_byte_size.py │ ├── test_console.py │ ├── test_crypto.py │ ├── test_data_processing.py │ ├── test_file.py │ ├── test_globs.py │ ├── test_html_to_text.py │ ├── test_measure_time.py │ ├── test_recurring_task.py │ ├── test_requests.py │ ├── test_robots.py │ ├── test_system.py │ ├── test_timedelata_ms.py │ └── test_urls.py │ ├── browsers │ ├── test_browser_pool.py │ ├── test_playwright_browser.py │ ├── test_playwright_browser_controller.py │ └── test_playwright_browser_plugin.py │ ├── conftest.py │ ├── crawlers │ ├── _adaptive_playwright │ │ ├── test_adaptive_playwright_crawler.py │ │ ├── test_adaptive_playwright_crawler_statistics.py │ │ ├── test_adaptive_playwright_crawling_context.py │ │ └── test_predictor.py │ ├── _basic │ │ ├── test_basic_crawler.py │ │ └── test_context_pipeline.py │ ├── _beautifulsoup │ │ └── test_beautifulsoup_crawler.py │ ├── _http │ │ └── test_http_crawler.py │ ├── _parsel │ │ └── test_parsel_crawler.py │ └── _playwright │ │ └── test_playwright_crawler.py │ ├── events │ ├── test_event_manager.py │ └── test_local_event_manager.py │ ├── fingerprint_suite │ ├── test_adapters.py │ └── test_header_generator.py │ ├── http_clients │ ├── test_curl_impersonate.py │ └── test_httpx.py │ ├── proxy_configuration │ ├── test_new_proxy_info.py │ └── test_tiers.py │ ├── request_loaders │ └── test_request_list.py │ ├── server.py │ ├── server_endpoints.py │ ├── sessions │ ├── test_cookies.py │ ├── test_models.py │ ├── test_session.py │ └── test_session_pool.py │ ├── storage_clients │ └── _memory │ │ ├── test_creation_management.py │ │ ├── test_dataset_client.py │ │ ├── test_dataset_collection_client.py │ │ ├── test_key_value_store_client.py │ │ ├── test_key_value_store_collection_client.py │ │ ├── test_memory_storage_client.py │ │ ├── test_memory_storage_e2e.py │ │ ├── test_request_queue_client.py │ │ └── test_request_queue_collection_client.py │ ├── storages │ ├── test_dataset.py │ ├── test_key_value_store.py │ ├── test_request_manager_tandem.py │ └── test_request_queue.py │ ├── test_cli.py │ ├── test_configuration.py │ ├── test_log_config.py │ ├── test_router.py │ └── test_service_locator.py ├── uv.lock └── website ├── .eslintrc.json ├── .yarnrc.yml ├── babel.config.js ├── build_api_reference.sh ├── docusaurus.config.js ├── generate_module_shortcuts.py ├── 
package.json ├── patches ├── @docusaurus+core+3.4.0.patch └── @docusaurus+core+3.5.2.patch ├── roa-loader ├── index.js └── package.json ├── sidebars.js ├── src ├── components │ ├── ApiLink.jsx │ ├── Button.jsx │ ├── Button.module.css │ ├── CopyButton.jsx │ ├── CopyButton.module.css │ ├── Gradients.jsx │ ├── Highlights.jsx │ ├── Highlights.module.css │ ├── Homepage │ │ ├── HomepageCliExample.jsx │ │ ├── HomepageCliExample.module.css │ │ ├── HomepageCtaSection.jsx │ │ ├── HomepageCtaSection.module.css │ │ ├── HomepageHeroSection.jsx │ │ ├── HomepageHeroSection.module.css │ │ ├── LanguageInfoWidget.jsx │ │ ├── LanguageInfoWidget.module.css │ │ ├── LanguageSwitch.jsx │ │ ├── LanguageSwitch.module.css │ │ ├── RiverSection.jsx │ │ ├── RiverSection.module.css │ │ ├── ThreeCardsWithIcon.jsx │ │ ├── ThreeCardsWithIcon.module.css │ │ ├── animated-crawlee-logo-dark.svg │ │ └── animated-crawlee-logo-light.svg │ ├── RunnableCodeBlock.jsx │ └── RunnableCodeBlock.module.css ├── css │ └── custom.css ├── pages │ ├── home_page_example.py │ ├── index.js │ └── index.module.css └── theme │ ├── ColorModeToggle │ ├── dark-mode-icon.svg │ ├── index.js │ ├── light-mode-icon.svg │ └── styles.module.css │ ├── DocItem │ └── Layout │ │ ├── index.js │ │ └── styles.module.css │ ├── Footer │ ├── LinkItem │ │ ├── index.js │ │ └── index.module.css │ ├── index.js │ └── index.module.css │ ├── MDXComponents │ └── A.js │ ├── Navbar │ ├── Content │ │ ├── index.js │ │ └── styles.module.css │ ├── Logo │ │ ├── index.js │ │ └── index.module.css │ └── MobileSidebar │ │ ├── Header │ │ ├── index.js │ │ └── index.module.css │ │ ├── Layout │ │ └── index.js │ │ ├── PrimaryMenu │ │ └── index.js │ │ └── index.js │ └── NavbarItem │ └── ComponentTypes.js ├── static ├── .nojekyll ├── font │ ├── lota.woff │ └── lota.woff2 ├── img │ ├── API.png │ ├── apify_logo.svg │ ├── apify_og_SDK.png │ ├── apify_sdk.svg │ ├── apify_sdk_white.svg │ ├── arrow_right.svg │ ├── auto-scaling-dark.webp │ ├── auto-scaling-light.webp │ ├── check.svg │ ├── chrome-scrape-dark.gif │ ├── chrome-scrape-light.gif │ ├── cloud_icon.svg │ ├── community-dark-icon.svg │ ├── community-light-icon.svg │ ├── crawlee-dark-new.svg │ ├── crawlee-dark.svg │ ├── crawlee-javascript-dark.svg │ ├── crawlee-javascript-light.svg │ ├── crawlee-light-new.svg │ ├── crawlee-light.svg │ ├── crawlee-logo-monocolor.svg │ ├── crawlee-logo.svg │ ├── crawlee-python-dark.svg │ ├── crawlee-python-light.svg │ ├── crawlee-python-og.png │ ├── defaults-dark-icon.svg │ ├── defaults-light-icon.svg │ ├── discord-brand-dark.svg │ ├── discord-brand.svg │ ├── docusaurus.svg │ ├── external-link.svg │ ├── favicon.ico │ ├── favorite-tools-dark.webp │ ├── favorite-tools-light.webp │ ├── features │ │ ├── auto-scaling.svg │ │ ├── automate-everything.svg │ │ ├── fingerprints.svg │ │ ├── node-requests.svg │ │ ├── runs-on-py.svg │ │ ├── storage.svg │ │ └── works-everywhere.svg │ ├── fill-and-submit-web-form │ │ ├── 00.jpg │ │ ├── 01.jpg │ │ ├── 02.jpg │ │ └── 03.jpg │ ├── getting-started │ │ ├── current-price.jpg │ │ ├── scraping-practice.jpg │ │ ├── select-an-element.jpg │ │ ├── selected-element.jpg │ │ ├── sku.jpg │ │ └── title.jpg │ ├── github-brand-dark.svg │ ├── github-brand.svg │ ├── hearth copy.svg │ ├── hearth.svg │ ├── javascript_logo.svg │ ├── js_file.svg │ ├── logo-big.svg │ ├── logo-blur.png │ ├── logo-blur.svg │ ├── logo-zoom.svg │ ├── menu-arrows.svg │ ├── oss_logo.png │ ├── puppeteer-live-view-dashboard.png │ ├── puppeteer-live-view-detail.png │ ├── queue-dark-icon.svg │ ├── queue-light-icon.svg │ ├── 
resuming-paused-crawl │ │ ├── 00.webp │ │ └── 01.webp │ ├── robot.png │ ├── routing-dark-icon.svg │ ├── routing-light-icon.svg │ ├── scraping-utils-dark-icon.svg │ ├── scraping-utils-light-icon.svg │ ├── smart-proxy-dark.webp │ ├── smart-proxy-light.webp │ ├── source_code.png │ ├── system.svg │ ├── triangles_dark.svg │ ├── triangles_light.svg │ ├── workflow.svg │ ├── zero-setup-dark-icon.svg │ └── zero-setup-light-icon.svg ├── js │ └── custom.js └── robots.txt ├── tools ├── docs-prettier.config.js ├── utils │ └── externalLink.js └── website_gif │ ├── chrome-scrape-dark.gif │ ├── chrome-scrape-dark.mp4 │ ├── chrome-scrape-light.gif │ ├── chrome-scrape-light.mp4 │ └── website_gif.mjs ├── tsconfig.eslint.json └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | end_of_line = lf 10 | 11 | [Makefile] 12 | indent_style = tab 13 | 14 | [{*.yaml, *.yml}] 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Documentation codeowner 2 | 3 | /docs/*.md @TC-MO 4 | /docs/*.mdx @TC-MO 5 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | - TODO 6 | 7 | ### Issues 8 | 9 | 10 | 11 | - Closes: #TODO 12 | 13 | ### Testing 14 | 15 | 16 | 17 | - TODO 18 | 19 | ### Checklist 20 | 21 | - [ ] CI passed 22 | -------------------------------------------------------------------------------- /.github/workflows/check_pr_title.yaml: -------------------------------------------------------------------------------- 1 | name: Check PR title 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, edited, synchronize] 6 | 7 | jobs: 8 | check_pr_title: 9 | name: Check PR title 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: amannn/action-semantic-pull-request@v5.5.3 13 | env: 14 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | -------------------------------------------------------------------------------- /.github/workflows/run_code_checks.yaml: -------------------------------------------------------------------------------- 1 | name: Run code checks 2 | 3 | on: 4 | # Trigger code checks on opening a new pull request. 5 | # Secrets are only made available to the integration tests job, with a manual approval 6 | # step required for PRs from forks. This prevents their potential exposure. 7 | pull_request: 8 | 9 | # Pushing to the master branch triggers code checks 10 | push: 11 | branches: 12 | - master 13 | tags-ignore: 14 | - "**" # Ignore all tags to prevent duplicate checks when tags are pushed. 
15 | 16 | # It should also be possible to trigger checks manually 17 | workflow_dispatch: 18 | 19 | jobs: 20 | lint_check: 21 | name: Lint check 22 | uses: apify/workflows/.github/workflows/python_lint_check.yaml@main 23 | 24 | type_check: 25 | name: Type check 26 | uses: apify/workflows/.github/workflows/python_type_check.yaml@main 27 | 28 | unit_tests: 29 | name: Unit tests 30 | uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main 31 | secrets: 32 | httpbin_url: ${{ secrets.APIFY_HTTPBIN_TOKEN && format('https://httpbin.apify.actor?token={0}', secrets.APIFY_HTTPBIN_TOKEN) || 'https://httpbin.org'}} 33 | 34 | docs_check: 35 | name: Docs check 36 | uses: apify/workflows/.github/workflows/python_docs_check.yaml@main 37 | -------------------------------------------------------------------------------- /.github/workflows/update_new_issue.yaml: -------------------------------------------------------------------------------- 1 | name: Update new issue 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | 8 | jobs: 9 | label_issues: 10 | name: Label issues 11 | runs-on: ubuntu-latest 12 | permissions: 13 | issues: write 14 | 15 | steps: 16 | # Add the "t-tooling" label to all new issues 17 | - uses: actions/github-script@v7 18 | with: 19 | script: | 20 | github.rest.issues.addLabels({ 21 | issue_number: context.issue.number, 22 | owner: context.repo.owner, 23 | repo: context.repo.repo, 24 | labels: ["t-tooling"] 25 | }) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache 2 | __pycache__ 3 | .mypy_cache 4 | .pytest_cache 5 | .ruff_cache 6 | 7 | # Virtual envs 8 | .venv 9 | .direnv 10 | .envrc 11 | .python-version 12 | 13 | # Other Python tools 14 | .ropeproject 15 | 16 | # Mise 17 | mise.toml 18 | .mise.toml 19 | 20 | # Egg and build artifacts 21 | *.egg-info/ 22 | *.egg 23 | dist/ 24 | build/ 25 | 26 | # Coverage reports 27 | .coverage* 28 | htmlcov 29 | 30 | # IDE, editors 31 | .vscode 32 | .idea 33 | .DS_Store 34 | .nvim.lua 35 | Session.vim 36 | 37 | # Docs 38 | docs/changelog.md 39 | 40 | # Website build artifacts, node dependencies 41 | website/build 42 | website/node_modules 43 | website/.yarn 44 | website/.docusaurus 45 | website/api-typedoc-generated.json 46 | website/apify-shared-docspec-dump.jsonl 47 | website/docspec-dump.jsonl 48 | website/module_shortcuts.json 49 | website/typedoc-types* 50 | # npm lockfile (we use yarn) 51 | website/package-lock.json 52 | 53 | # Default directory for memory storage 54 | storage/ 55 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | default: true 2 | line-length: 3 | line_length: 120 4 | MD007: 5 | indent: 4 6 | MD004: 7 | style: dash 8 | no-inline-html: false 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: lint-check 5 | name: Lint check 6 | entry: make lint 7 | language: system 8 | pass_filenames: false 9 | 10 | - id: type-check 11 | name: Type check 12 | entry: make type-check 13 | language: system 14 | pass_filenames: false 15 | -------------------------------------------------------------------------------- 
/docs/deployment/code_examples/apify/crawler_as_actor_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 6 | 7 | 8 | async def main() -> None: 9 | # Wrap the crawler code in an Actor context manager. 10 | async with Actor: 11 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | data = { 17 | 'url': context.request.url, 18 | 'title': context.soup.title.string if context.soup.title else None, 19 | } 20 | await context.push_data(data) 21 | await context.enqueue_links() 22 | 23 | await crawler.run(['https://crawlee.dev']) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/get_public_url.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | store = await Actor.open_key_value_store() 9 | await store.set_value('your-file', {'foo': 'bar'}) 10 | url = store.get_public_url('your-file') 11 | Actor.log.info(f'KVS public URL: {url}') 12 | # https://api.apify.com/v2/key-value-stores//records/your-file 13 | 14 | 15 | if __name__ == '__main__': 16 | asyncio.run(main()) 17 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/log_with_config_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor, Configuration 4 | 5 | 6 | async def main() -> None: 7 | # Create a new configuration with your API key. You can find it at 8 | # https://console.apify.com/settings/integrations. It can be provided either 9 | # as a parameter "token" or as an environment variable "APIFY_TOKEN". 10 | config = Configuration( 11 | token='apify_api_YOUR_TOKEN', 12 | ) 13 | 14 | async with Actor(config): 15 | Actor.log.info('Hello from Apify platform!') 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/proxy_advanced_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | proxy_configuration = await Actor.create_proxy_configuration( 9 | password='apify_proxy_YOUR_PASSWORD', 10 | # Specify the proxy group to use. 11 | groups=['RESIDENTIAL'], 12 | # Set the country code for the proxy. 13 | country_code='US', 14 | ) 15 | 16 | # ... 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/deployment/code_examples/apify/proxy_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from apify import Actor 4 | 5 | 6 | async def main() -> None: 7 | async with Actor: 8 | # Create a new Apify Proxy configuration. 
The password can be found at 9 | # https://console.apify.com/proxy/http-settings and should be provided either 10 | # as a parameter "password" or as an environment variable "APIFY_PROXY_PASSWORD". 11 | proxy_configuration = await Actor.create_proxy_configuration( 12 | password='apify_proxy_YOUR_PASSWORD', 13 | ) 14 | 15 | if not proxy_configuration: 16 | Actor.log.warning('Failed to create proxy configuration.') 17 | return 18 | 19 | proxy_url = await proxy_configuration.new_url() 20 | Actor.log.info(f'Proxy URL: {proxy_url}') 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/examples/beautifulsoup_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: beautifulsoup-crawler 3 | title: BeautifulSoup crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py'; 10 | 11 | This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `
<h1>`, `<h2>` and `<h3>
` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/capture_screenshot_using_playwright.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: capture-screenshots-using-playwright 3 | title: Capture screenshots using Playwright 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py'; 10 | 11 | This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store. 12 | 13 | The `PlaywrightCrawler` is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method. 14 | 15 | The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page. 16 | 17 | 18 | {CaptureScreenshotExample} 19 | 20 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Extract data from the page. 15 | data = { 16 | 'url': context.request.url, 17 | 'title': context.soup.title.string if context.soup.title else None, 18 | 'html': str(context.soup)[:1000], 19 | } 20 | 21 | # Push the extracted data to the default dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run( 26 | [ 27 | 'https://crawlee.dev', 28 | 'https://apify.com', 29 | 'https://example.com', 30 | ] 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_dataset.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import Dataset 4 | 5 | 6 | async def main() -> None: 7 | # Open dataset manually using asynchronous constructor open(). 8 | dataset = await Dataset.open() 9 | 10 | # Interact with dataset directly. 
11 | await dataset.push_data({'key': 'value'}) 12 | 13 | 14 | if __name__ == '__main__': 15 | asyncio.run(main()) 16 | -------------------------------------------------------------------------------- /docs/examples/code_examples/add_data_to_dataset_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Extract data from the page. 15 | data = { 16 | 'url': context.request.url, 17 | 'title': await context.page.title(), 18 | 'html': str(await context.page.content())[:1000], 19 | } 20 | 21 | # Push the extracted data to the default dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run( 26 | [ 27 | 'https://crawlee.dev', 28 | 'https://apify.com', 29 | 'https://example.com', 30 | ] 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_all_links_on_website_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. 18 | await context.enqueue_links() 19 | 20 | # Run the crawler with the initial list of requests. 21 | await crawler.run(['https://crawlee.dev']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_all_links_on_website_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. 18 | await context.enqueue_links() 19 | 20 | # Run the crawler with the initial list of requests. 
21 | await crawler.run(['https://crawlee.dev']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_multiple_urls_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Run the crawler with the initial list of requests. 15 | await crawler.run( 16 | [ 17 | 'https://crawlee.dev', 18 | 'https://apify.com', 19 | 'https://example.com', 20 | ] 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_multiple_urls_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | # Define the default request handler, which will be called for every request. 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url} ...') 13 | 14 | # Run the crawler with the initial list of requests. 15 | await crawler.run( 16 | [ 17 | 'https://crawlee.dev', 18 | 'https://apify.com', 19 | 'https://example.com', 20 | ] 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_specific_links_on_website_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler( 9 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 10 | max_requests_per_crawl=10, 11 | ) 12 | 13 | # Define the default request handler, which will be called for every request. 14 | @crawler.router.default_handler 15 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # Enqueue all the documentation links found on the page, except for the examples. 19 | await context.enqueue_links( 20 | include=[Glob('https://crawlee.dev/docs/**')], 21 | exclude=[Glob('https://crawlee.dev/docs/examples')], 22 | ) 23 | 24 | # Run the crawler with the initial list of requests. 
25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_specific_links_on_website_pw.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler( 9 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 10 | max_requests_per_crawl=10, 11 | ) 12 | 13 | # Define the default request handler, which will be called for every request. 14 | @crawler.router.default_handler 15 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # Enqueue all the documentation links found on the page, except for the examples. 19 | await context.enqueue_links( 20 | include=[Glob('https://crawlee.dev/docs/**')], 21 | exclude=[Glob('https://crawlee.dev/docs/examples')], 22 | ) 23 | 24 | # Run the crawler with the initial list of requests. 25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_all_links.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Enqueue all links found on the page. Any URLs found will be matched by 18 | # this strategy, even if they go off the site you are currently crawling. 19 | await context.enqueue_links(strategy='all') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same domain will enqueue all links found that 18 | # are on the same hostname as request.loaded_url or request.url. 
19 | await context.enqueue_links(strategy='same-domain') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same hostname will enqueue all links found that are on 18 | # the same hostname (including subdomains) as request.loaded_url or request.url. 19 | await context.enqueue_links(strategy='same-hostname') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Setting the strategy to same origin will enqueue all links found that are on 18 | # the same origin as request.loaded_url or request.url. 19 | await context.enqueue_links(strategy='same-origin') 20 | 21 | # Run the crawler with the initial list of requests. 22 | await crawler.run(['https://crawlee.dev']) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/examples/code_examples/export_entire_dataset_to_file_csv.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Extract data from the page. 
18 | data = { 19 | 'url': context.request.url, 20 | 'title': context.soup.title.string if context.soup.title else None, 21 | } 22 | 23 | # Enqueue all links found on the page. 24 | await context.enqueue_links() 25 | 26 | # Push the extracted data to the default dataset. 27 | await context.push_data(data) 28 | 29 | # Run the crawler with the initial list of URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the entire dataset to a CSV file. 33 | await crawler.export_data_csv(path='results.csv') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/examples/code_examples/export_entire_dataset_to_file_json.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler( 8 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 9 | max_requests_per_crawl=10, 10 | ) 11 | 12 | # Define the default request handler, which will be called for every request. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Extract data from the page. 18 | data = { 19 | 'url': context.request.url, 20 | 'title': context.soup.title.string if context.soup.title else None, 21 | } 22 | 23 | # Enqueue all links found on the page. 24 | await context.enqueue_links() 25 | 26 | # Push the extracted data to the default dataset. 27 | await context.push_data(data) 28 | 29 | # Run the crawler with the initial list of URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the entire dataset to a JSON file. 33 | await crawler.export_data_json(path='results.json') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/examples/code_examples/fill_and_submit_web_form_request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from urllib.parse import urlencode 3 | 4 | from crawlee import Request 5 | 6 | 7 | async def main() -> None: 8 | # Prepare a POST request to the form endpoint. 
9 | request = Request.from_url( 10 | url='https://httpbin.org/post', 11 | method='POST', 12 | headers={'content-type': 'application/x-www-form-urlencoded'}, 13 | payload=urlencode( 14 | { 15 | 'custname': 'John Doe', 16 | 'custtel': '1234567890', 17 | 'custemail': 'johndoe@example.com', 18 | 'size': 'large', 19 | 'topping': ['bacon', 'cheese', 'mushroom'], 20 | 'delivery': '13:00', 21 | 'comments': 'Please ring the doorbell upon arrival.', 22 | } 23 | ).encode(), 24 | ) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from random import choice 3 | 4 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 5 | from crawlee.statistics import Statistics 6 | 7 | 8 | async def main() -> None: 9 | crawler = ParselCrawler( 10 | statistics=Statistics.with_default_state(save_error_snapshots=True) 11 | ) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: ParselCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Simulate various errors to demonstrate `ErrorSnapshotter` 17 | # saving only the first occurrence of unique error. 18 | await context.enqueue_links() 19 | random_number = choice(range(10)) 20 | if random_number == 1: 21 | raise KeyError('Some KeyError') 22 | if random_number == 2: 23 | raise ValueError('Some ValueError') 24 | if random_number == 3: 25 | raise RuntimeError('Some RuntimeError') 26 | 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/examples/code_examples/playwright_block_requests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | PlaywrightCrawler, 5 | PlaywrightCrawlingContext, 6 | PlaywrightPreNavCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | crawler = PlaywrightCrawler( 12 | # Limit the crawl to max requests. Remove or increase it for crawling all links. 13 | max_requests_per_crawl=10, 14 | ) 15 | 16 | # Define the default request handler, which will be called for every request. 17 | @crawler.router.default_handler 18 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 19 | context.log.info(f'Processing {context.request.url} ...') 20 | 21 | await context.enqueue_links() 22 | 23 | # Define the hook, which will be called before every request. 24 | @crawler.pre_navigation_hook 25 | async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None: 26 | context.log.info(f'Navigating to {context.request.url} ...') 27 | 28 | # Block all requests to URLs that include `adsbygoogle.js` and also all defaults. 29 | await context.block_requests(extra_url_patterns=['adsbygoogle.js']) 30 | 31 | # Run the crawler with the initial list of URLs. 
32 | await crawler.run(['https://crawlee.dev/']) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from random import choice 3 | 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | from crawlee.statistics import Statistics 6 | 7 | 8 | async def main() -> None: 9 | crawler = PlaywrightCrawler( 10 | statistics=Statistics.with_default_state(save_error_snapshots=True) 11 | ) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Simulate various errors to demonstrate `ErrorSnapshotter` 17 | # saving only the first occurrence of unique error. 18 | await context.enqueue_links() 19 | random_number = choice(range(10)) 20 | if random_number == 1: 21 | raise KeyError('Some KeyError') 22 | if random_number == 2: 23 | raise ValueError('Some ValueError') 24 | if random_number == 3: 25 | raise RuntimeError('Some RuntimeError') 26 | 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/examples/code_examples/respect_robots_on_skipped_request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import SkippedReason 4 | from crawlee.crawlers import ( 5 | BeautifulSoupCrawler, 6 | BeautifulSoupCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | # Initialize the crawler with robots.txt compliance enabled 12 | crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) 13 | 14 | @crawler.router.default_handler 15 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 16 | context.log.info(f'Processing {context.request.url} ...') 17 | 18 | # highlight-start 19 | # This handler is called when a request is skipped 20 | @crawler.on_skipped_request 21 | async def skipped_request_handler(url: str, reason: SkippedReason) -> None: 22 | # Check if the request was skipped due to robots.txt rules 23 | if reason == 'robots_txt': 24 | crawler.log.info(f'Skipped {url} due to robots.txt rules.') 25 | 26 | # highlight-end 27 | 28 | # Start the crawler with the specified URLs 29 | # The login URL will be skipped and handled by the skipped_request_handler 30 | await crawler.run( 31 | ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] 32 | ) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/examples/code_examples/respect_robots_txt_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | BeautifulSoupCrawler, 5 | BeautifulSoupCrawlingContext, 6 | ) 7 | 8 | 9 | async def main() -> None: 10 | # Initialize the crawler with robots.txt compliance enabled 11 | crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | # Start 
the crawler with the specified URLs 18 | # The crawler will check the robots.txt file before making requests 19 | # In this example, 'https://news.ycombinator.com/login' will be skipped 20 | # because it's disallowed in the site's robots.txt file 21 | await crawler.run( 22 | ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] 23 | ) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/examples/crawl_multiple_urls.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawl-multiple-urls 3 | title: Crawl multiple URLs 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import Tabs from '@theme/Tabs'; 8 | import TabItem from '@theme/TabItem'; 9 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 10 | 11 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py'; 12 | import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py'; 13 | 14 | This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently. 15 | 16 | 17 | 18 | 19 | {BeautifulSoupExample} 20 | 21 | 22 | 23 | 24 | {PlaywrightExample} 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/examples/crawler_keep_alive.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawler-keep-alive 3 | title: Keep a Crawler alive waiting for more requests 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py'; 10 | 11 | This example demonstrates how to keep crawler alive even when there are no requests at the moment by using `keep_alive=True` argument of `BasicCrawler.__init__`. This is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. To stop the crawler that was started with `keep_alive=True` you can call `crawler.stop()`. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/crawler_stop.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: crawler-stop 3 | title: Stopping a Crawler with stop method 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py'; 10 | 11 | This example demonstrates how to use `stop` method of `BasicCrawler` to stop crawler once the crawler finds what it is looking for. This method is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being concurrently processed are going to get finished. 
As shown in the sketch above, the `stop` method also accepts an optional `reason` argument, a string that is included in the logs and can improve their readability, especially if you have multiple different conditions for triggering `stop`. 12 | 13 | 14 | {BeautifulSoupExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/parsel_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: parsel-crawler 3 | title: Parsel crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py'; 10 | 11 | This example shows how to use `ParselCrawler` to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using the [Parsel](https://pypi.org/project/parsel/) library, which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract the page title, URL, and email addresses found on each page. The default handler will scrape data from the current page and enqueue all links found on it for continuous scraping. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before the request is sent. 12 | 13 | 14 | {ParselCrawlerExample} 15 | 16 | -------------------------------------------------------------------------------- /docs/examples/playwright_crawler.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | id: playwright-crawler 3 | title: Playwright crawler 4 | --- 5 | 6 | import ApiLink from '@site/src/components/ApiLink'; 7 | import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; 8 | 9 | import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py'; 10 | 11 | This example demonstrates how to use `PlaywrightCrawler` to recursively scrape the Hacker News website using headless Chromium and Playwright. 12 | 13 | The `PlaywrightCrawler` manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content. 14 | 15 | A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation.
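For example, a hook registered with the `pre_navigation_hook` decorator can adjust the page before navigation. The snippet below is a minimal sketch based on the pre-navigation hook example from the guides; the hook name and the specific timeout and viewport values are arbitrary.

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None:
    # Set a timeout for all navigation methods on the page.
    context.page.set_default_navigation_timeout(60_000)

    # Set the viewport size before navigating to the target URL.
    await context.page.set_viewport_size({'width': 1280, 'height': 1024})
```

The runnable example below shows how such a hook fits into a complete crawler.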
16 | 17 | 18 | {PlaywrightCrawlerExample} 19 | 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.fingerprint_suite import ( 4 | DefaultFingerprintGenerator, 5 | HeaderGeneratorOptions, 6 | ScreenOptions, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | fingerprint_generator = DefaultFingerprintGenerator( 12 | header_options=HeaderGeneratorOptions(browsers=['chromium']), 13 | screen_options=ScreenOptions(min_width=400), 14 | ) 15 | 16 | # ... 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Fingerprint generator is used by default. 8 | crawler = PlaywrightCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 11 | @crawler.router.default_handler 12 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Find a link to the next page and enqueue it if it exists. 16 | await context.enqueue_links(selector='.morelink') 17 | 18 | # Run the crawler with the initial list of URLs. 19 | await crawler.run(['https://news.ycombinator.com/']) 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.run(main()) 24 | -------------------------------------------------------------------------------- /docs/guides/code_examples/error_handling/disable_retry.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext 4 | from crawlee.errors import HttpStatusCodeError, SessionError 5 | 6 | 7 | async def main() -> None: 8 | crawler = HttpCrawler(max_request_retries=5) 9 | 10 | # Create a parsing error for demonstration 11 | @crawler.router.default_handler 12 | async def default_handler(context: HttpCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | raise ValueError('Simulated parsing error') 15 | 16 | # This handler runs before any retry attempts 17 | @crawler.error_handler 18 | async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None: 19 | context.log.error(f'Failed request {context.request.url}') 20 | # Only allow retries for network-related errors 21 | if not isinstance(error, (SessionError, HttpStatusCodeError)): 22 | context.log.error('Non-network error detected') 23 | # Stop further retry attempts for this `Request` 24 | context.request.no_retry = True 25 | 26 | await crawler.run(['https://crawlee.dev/']) 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin 4 | from 
crawlee.crawlers import PlaywrightCrawler 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler( 9 | browser_pool=BrowserPool( 10 | plugins=[ 11 | PlaywrightBrowserPlugin( 12 | browser_type='chromium', 13 | browser_launch_options={ 14 | 'headless': False, 15 | 'channel': 'msedge', 16 | 'slow_mo': 200, 17 | }, 18 | browser_new_context_options={ 19 | 'color_scheme': 'dark', 20 | 'extra_http_headers': { 21 | 'Custom-Header': 'my-header', 22 | 'Accept-Language': 'en', 23 | }, 24 | 'user_agent': 'My-User-Agent', 25 | }, 26 | ) 27 | ] 28 | ) 29 | ) 30 | 31 | # ... 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ( 4 | PlaywrightCrawler, 5 | PlaywrightCrawlingContext, 6 | PlaywrightPreNavCrawlingContext, 7 | ) 8 | 9 | 10 | async def main() -> None: 11 | crawler = PlaywrightCrawler(max_requests_per_crawl=10) 12 | 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | 17 | await context.enqueue_links() 18 | 19 | @crawler.pre_navigation_hook 20 | async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: 21 | context.log.info(f'Navigating to {context.request.url} ...') 22 | 23 | # will set a timeout for all navigation methods 24 | context.page.set_default_navigation_timeout(600_000) 25 | 26 | # will set the page size before you go to the target URL 27 | await context.page.set_viewport_size({'width': 1280, 'height': 1024}) 28 | 29 | # Run the crawler with the initial list of URLs. 
30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | 33 | if __name__ == '__main__': 34 | asyncio.run(main()) 35 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/handler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | 4 | from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: 12 | # Locate element h2 within 5 seconds 13 | h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) 14 | # Do stuff with element found by the selector 15 | context.log.info(h2) 16 | 17 | await crawler.run(['https://crawlee.dev/']) 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import AdaptivePlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( 8 | # Arguments relevant only for PlaywrightCrawler 9 | playwright_crawler_specific_kwargs={ 10 | 'headless': False, 11 | 'browser_type': 'chromium', 12 | }, 13 | # Common arguments relevant to all crawlers 14 | max_crawl_depth=5, 15 | ) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import AdaptivePlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( 8 | # Arguments relevant only for PlaywrightCrawler 9 | playwright_crawler_specific_kwargs={ 10 | 'headless': False, 11 | 'browser_type': 'chromium', 12 | }, 13 | # Common arguments relevant to all crawlers 14 | max_crawl_depth=5, 15 | ) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/inspecting_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Log the proxy used for the current request. 
21 | context.log.info(f'Proxy for the current request: {context.proxy_info}') 22 | 23 | # Run the crawler with the initial list of requests. 24 | await crawler.run(['https://crawlee.dev/']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/inspecting_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 20 | # Log the proxy used for the current request. 21 | context.log.info(f'Proxy for the current request: {context.proxy_info}') 22 | 23 | # Run the crawler with the initial list of requests. 24 | await crawler.run(['https://crawlee.dev/']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/integration_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': context.soup.title.string if context.soup.title else None, 24 | } 25 | context.log.info(f'Extracted data: {data}') 26 | 27 | # Run the crawler with the initial list of requests. 28 | await crawler.run(['https://crawlee.dev/']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/integration_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 
9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) 16 | 17 | # Define the default request handler, which will be called for every request. 18 | @crawler.router.default_handler 19 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': await context.page.title(), 24 | } 25 | context.log.info(f'Extracted data: {data}') 26 | 27 | # Run the crawler with the initial list of requests. 28 | await crawler.run(['https://crawlee.dev/']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/quick_start_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.proxy_configuration import ProxyConfiguration 4 | 5 | 6 | async def main() -> None: 7 | proxy_configuration = ProxyConfiguration( 8 | proxy_urls=[ 9 | 'http://proxy-1.com/', 10 | 'http://proxy-2.com/', 11 | ] 12 | ) 13 | 14 | # The proxy URLs are rotated in a round-robin. 15 | proxy_url_1 = await proxy_configuration.new_url() # http://proxy-1.com/ 16 | proxy_url_2 = await proxy_configuration.new_url() # http://proxy-2.com/ 17 | proxy_url_3 = await proxy_configuration.new_url() # http://proxy-1.com/ 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/session_bs_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = BeautifulSoupCrawler( 16 | proxy_configuration=proxy_configuration, 17 | use_session_pool=True, 18 | ) 19 | 20 | # ... 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/guides/code_examples/proxy_management/session_pw_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | from crawlee.proxy_configuration import ProxyConfiguration 5 | 6 | 7 | async def main() -> None: 8 | # Create a ProxyConfiguration object and pass it to the crawler. 9 | proxy_configuration = ProxyConfiguration( 10 | proxy_urls=[ 11 | 'http://proxy-1.com/', 12 | 'http://proxy-2.com/', 13 | ] 14 | ) 15 | crawler = PlaywrightCrawler( 16 | proxy_configuration=proxy_configuration, 17 | use_session_pool=True, 18 | ) 19 | 20 | # ... 
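    # A minimal illustrative sketch of a handler that could follow here (the
    # handler name and start URL are placeholders): with `use_session_pool=True`,
    # the proxy selected for a session is reused for that session's requests,
    # so a handler can log which proxy a given session is using, e.g.:
    #
    #     @crawler.router.default_handler
    #     async def handler(context: PlaywrightCrawlingContext) -> None:
    #         session_id = context.session.id if context.session else None
    #         context.log.info(f'Session {session_id} uses {context.proxy_info}')
    #
    #     await crawler.run(['https://crawlee.dev/'])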
21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/rl_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.request_loaders import RequestList 4 | 5 | 6 | async def main() -> None: 7 | # Create a static request list with a name 8 | # and a set of initial requests. 9 | request_list = RequestList( 10 | name='my-request-list', 11 | requests=[ 12 | 'https://apify.com/', 13 | 'https://crawlee.dev/', 14 | 'https://crawlee.dev/python/', 15 | ], 16 | ) 17 | 18 | # Fetch and process requests from the list. 19 | while request := await request_list.fetch_next_request(): 20 | # Do something with it... 21 | 22 | # And mark it as handled. 23 | await request_list.mark_request_as_handled(request) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/tandem_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | from crawlee.request_loaders import RequestList 5 | 6 | 7 | async def main() -> None: 8 | # Create a static request list. 9 | request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) 10 | 11 | # Convert the request list to a request manager using the to_tandem method. 12 | # It is a tandem with the default request queue. 13 | request_manager = await request_list.to_tandem() 14 | 15 | # Create a crawler and pass the request manager to it. 16 | crawler = ParselCrawler(request_manager=request_manager) 17 | 18 | @crawler.router.default_handler 19 | async def handler(context: ParselCrawlingContext) -> None: 20 | # New links will be enqueued directly to the queue. 21 | await context.enqueue_links() 22 | 23 | await crawler.run() 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/request_loaders/tandem_example_explicit.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | from crawlee.request_loaders import RequestList, RequestManagerTandem 5 | from crawlee.storages import RequestQueue 6 | 7 | 8 | async def main() -> None: 9 | # Create a static request list. 10 | request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) 11 | 12 | # Open the default request queue. 13 | request_queue = await RequestQueue.open() 14 | 15 | # And combine them together into a single request manager. 16 | request_manager = RequestManagerTandem(request_list, request_queue) 17 | 18 | # Create a crawler and pass the request manager to it. 19 | crawler = ParselCrawler(request_manager=request_manager) 20 | 21 | @crawler.router.default_handler 22 | async def handler(context: ParselCrawlingContext) -> None: 23 | # New links will be enqueued directly to the queue.
24 | await context.enqueue_links() 25 | 26 | await crawler.run() 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/guides/code_examples/running_in_web_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/docs/guides/code_examples/running_in_web_server/__init__.py -------------------------------------------------------------------------------- /docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import ConcurrencySettings 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | 6 | 7 | async def main() -> None: 8 | concurrency_settings = ConcurrencySettings( 9 | # Set the maximum number of concurrent requests the crawler can run to 100. 10 | max_concurrency=100, 11 | # Limit the total number of requests to 10 per minute to avoid overwhelming 12 | # the target website. 13 | max_tasks_per_minute=10, 14 | ) 15 | 16 | crawler = BeautifulSoupCrawler( 17 | # Apply the defined concurrency settings to the crawler. 18 | concurrency_settings=concurrency_settings, 19 | ) 20 | 21 | # ... 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import ConcurrencySettings 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | 6 | 7 | async def main() -> None: 8 | concurrency_settings = ConcurrencySettings( 9 | # Start with 8 concurrent tasks, as long as resources are available. 10 | desired_concurrency=8, 11 | # Maintain a minimum of 5 concurrent tasks to ensure steady crawling. 12 | min_concurrency=5, 13 | # Limit the maximum number of concurrent tasks to 10 to prevent 14 | # overloading the system. 15 | max_concurrency=10, 16 | ) 17 | 18 | crawler = BeautifulSoupCrawler( 19 | # Use the configured concurrency settings for the crawler. 20 | concurrency_settings=concurrency_settings, 21 | ) 22 | 23 | # ... 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/session_management/sm_standalone.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.sessions import SessionPool 4 | 5 | 6 | async def main() -> None: 7 | # Override the default Session pool configuration. 8 | async with SessionPool( 9 | max_pool_size=100, 10 | create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]}, 11 | ) as session_pool: 12 | session = await session_pool.get_session() 13 | 14 | # Increase the error_score. 15 | session.mark_bad() 16 | 17 | # Throw away the session. 18 | session.retire() 19 | 20 | # Lower the error_score and mark the session good. 
21 | session.mark_good() 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/cleaning_do_not_purge_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.configuration import Configuration 4 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | # Set the purge_on_start field to False to avoid purging the storage on start. 9 | # highlight-next-line 10 | configuration = Configuration(purge_on_start=False) 11 | 12 | # Pass the configuration to the crawler. 13 | crawler = HttpCrawler(configuration=configuration) 14 | 15 | @crawler.router.default_handler 16 | async def request_handler(context: HttpCrawlingContext) -> None: 17 | context.log.info(f'Processing {context.request.url} ...') 18 | 19 | await crawler.run(['https://crawlee.dev/']) 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.run(main()) 24 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler 4 | from crawlee.storage_clients import MemoryStorageClient 5 | 6 | 7 | async def main() -> None: 8 | storage_client = MemoryStorageClient.from_config() 9 | 10 | # Call the purge_on_start method to explicitly purge the storage. 11 | # highlight-next-line 12 | await storage_client.purge_on_start() 13 | 14 | # Pass the storage client to the crawler. 15 | crawler = HttpCrawler(storage_client=storage_client) 16 | 17 | # ... 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import Dataset 4 | 5 | 6 | async def main() -> None: 7 | # Open the dataset, if it does not exist, it will be created. 8 | # Leave name empty to use the default dataset. 9 | dataset = await Dataset.open() 10 | 11 | # Push a single row of data. 12 | await dataset.push_data({'foo': 'bar'}) 13 | 14 | # Push multiple rows of data (anything JSON-serializable can be pushed). 15 | await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]) 16 | 17 | # Fetch all data from the dataset. 18 | data = await dataset.get_data() 19 | # Do something with it... 20 | 21 | # Remove the dataset. 22 | await dataset.drop() 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new crawler (it can be any subclass of BasicCrawler). 8 | crawler = BeautifulSoupCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 
11 | @crawler.router.default_handler 12 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Extract data from the page. 16 | data = { 17 | 'url': context.request.url, 18 | 'title': context.soup.title.string if context.soup.title else None, 19 | } 20 | 21 | # Push the extracted data to the (default) dataset. 22 | await context.push_data(data) 23 | 24 | # Run the crawler with the initial URLs. 25 | await crawler.run(['https://crawlee.dev']) 26 | 27 | # Export the dataset to a file. 28 | await crawler.export_data(path='dataset.csv') 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | from crawlee.storages import Dataset 5 | 6 | 7 | async def main() -> None: 8 | # Open the dataset, if it does not exist, it will be created. 9 | # Leave name empty to use the default dataset. 10 | dataset = await Dataset.open() 11 | 12 | # Create a new crawler (it can be any subclass of BasicCrawler). 13 | crawler = BeautifulSoupCrawler() 14 | 15 | # Define the default request handler, which will be called for every request. 16 | @crawler.router.default_handler 17 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 18 | context.log.info(f'Processing {context.request.url} ...') 19 | 20 | # Extract data from the page. 21 | data = { 22 | 'url': context.request.url, 23 | 'title': context.soup.title.string if context.soup.title else None, 24 | } 25 | 26 | # Push the extracted data to the dataset. 27 | await dataset.push_data(data) 28 | 29 | # Run the crawler with the initial URLs. 30 | await crawler.run(['https://crawlee.dev']) 31 | 32 | # Export the dataset to the key-value store. 
33 | await dataset.export_to(key='dataset', content_type='csv') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/helper_add_requests_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url} ...') 12 | # highlight-next-line 13 | await context.add_requests(['https://apify.com/']) 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/helper_enqueue_links_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url} ...') 12 | # highlight-next-line 13 | await context.enqueue_links() 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import KeyValueStore 4 | 5 | 6 | async def main() -> None: 7 | # Open the key-value store, if it does not exist, it will be created. 8 | # Leave name empty to use the default KVS. 9 | kvs = await KeyValueStore.open() 10 | 11 | # Set a value associated with 'some-key'. 12 | await kvs.set_value(key='some-key', value={'foo': 'bar'}) 13 | 14 | # Get the value associated with 'some-key'. 15 | value = await kvs.get_value('some-key') 16 | # Do something with it... 17 | 18 | # Delete the value associated with 'some-key' by setting it to None. 19 | await kvs.set_value(key='some-key', value=None) 20 | 21 | # Remove the key-value store. 22 | await kvs.drop() 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new Playwright crawler. 8 | crawler = PlaywrightCrawler() 9 | 10 | # Define the default request handler, which will be called for every request. 11 | @crawler.router.default_handler 12 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 13 | context.log.info(f'Processing {context.request.url} ...') 14 | 15 | # Capture the screenshot of the page using Playwright's API.
16 | screenshot = await context.page.screenshot() 17 | name = context.request.url.split('/')[-1] 18 | 19 | # Get the key-value store from the context. # If it does not exist, 20 | # it will be created. Leave name empty to use the default KVS. 21 | kvs = await context.get_key_value_store() 22 | 23 | # Store the screenshot in the key-value store. 24 | await kvs.set_value( 25 | key=f'screenshot-{name}', 26 | value=screenshot, 27 | content_type='image/png', 28 | ) 29 | 30 | # Run the crawler with the initial URLs. 31 | await crawler.run(['https://crawlee.dev']) 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.storages import KeyValueStore 5 | 6 | 7 | async def main() -> None: 8 | # Open the key-value store, if it does not exist, it will be created. 9 | # Leave name empty to use the default KVS. 10 | kvs = await KeyValueStore.open() 11 | 12 | # Create a new Playwright crawler. 13 | crawler = PlaywrightCrawler() 14 | 15 | # Define the default request handler, which will be called for every request. 16 | @crawler.router.default_handler 17 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 18 | context.log.info(f'Processing {context.request.url} ...') 19 | 20 | # Capture the screenshot of the page using Playwright's API. 21 | screenshot = await context.page.screenshot() 22 | name = context.request.url.split('/')[-1] 23 | 24 | # Store the screenshot in the key-value store. 25 | await kvs.set_value( 26 | key=f'screenshot-{name}', 27 | value=screenshot, 28 | content_type='image/png', 29 | ) 30 | 31 | # Run the crawler with the initial URLs. 32 | await crawler.run(['https://crawlee.dev']) 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import RequestQueue 4 | 5 | 6 | async def main() -> None: 7 | # Open the request queue, if it does not exist, it will be created. 8 | # Leave name empty to use the default request queue. 9 | request_queue = await RequestQueue.open(name='my-request-queue') 10 | 11 | # Add a single request. 12 | await request_queue.add_request('https://apify.com/') 13 | 14 | # Add multiple requests as a batch. 15 | await request_queue.add_requests_batched( 16 | ['https://crawlee.dev/', 'https://crawlee.dev/python/'] 17 | ) 18 | 19 | # Fetch and process requests from the queue. 20 | while request := await request_queue.fetch_next_request(): 21 | # Do something with it... 22 | 23 | # And mark it as handled. 24 | await request_queue.mark_request_as_handled(request) 25 | 26 | # Remove the request queue. 
27 | await request_queue.drop() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_with_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is 8 | # the default request manager; it will be opened and fully managed if not specified. 9 | crawler = HttpCrawler() 10 | 11 | # Define the default request handler, which will be called for every request. 12 | @crawler.router.default_handler 13 | async def request_handler(context: HttpCrawlingContext) -> None: 14 | context.log.info(f'Processing {context.request.url} ...') 15 | 16 | # Use context's add_requests method helper to add new requests from the handler. 17 | await context.add_requests(['https://crawlee.dev/python/']) 18 | 19 | # Use crawler's add_requests method helper to add new requests. 20 | await crawler.add_requests(['https://apify.com/']) 21 | 22 | # Run the crawler. You can optionally pass the list of initial requests. 23 | await crawler.run(['https://crawlee.dev/']) 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext 4 | from crawlee.storages import RequestQueue 5 | 6 | 7 | async def main() -> None: 8 | # Open the request queue, if it does not exist, it will be created. 9 | # Leave name empty to use the default request queue. 10 | request_queue = await RequestQueue.open(name='my-request-queue') 11 | 12 | # Interact with the request queue directly, e.g. add a batch of requests. 13 | await request_queue.add_requests_batched( 14 | ['https://apify.com/', 'https://crawlee.dev/'] 15 | ) 16 | 17 | # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request 18 | # queue as the request manager to it. It will be managed by the crawler. 19 | crawler = HttpCrawler(request_manager=request_queue) 20 | 21 | # Define the default request handler, which will be called for every request. 22 | @crawler.router.default_handler 23 | async def request_handler(context: HttpCrawlingContext) -> None: 24 | context.log.info(f'Processing {context.request.url} ...') 25 | 26 | # And execute the crawler. 27 | await crawler.run() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_bs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # Add import of crawler and crawling context. 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | from crawlee.storages import RequestQueue 6 | 7 | 8 | async def main() -> None: 9 | # First you create the request queue instance. 10 | rq = await RequestQueue.open() 11 | 12 | # And then you add one or more requests to it.
13 | await rq.add_request('https://crawlee.dev') 14 | 15 | crawler = BeautifulSoupCrawler(request_manager=rq) 16 | 17 | # Define a request handler and attach it to the crawler using the decorator. 18 | @crawler.router.default_handler 19 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 20 | # Extract text with BeautifulSoup. 21 | # See BeautifulSoup documentation for API docs. 22 | url = context.request.url 23 | title = context.soup.title.string if context.soup.title else '' 24 | context.log.info(f'The title of {url} is: {title}.') 25 | 26 | await crawler.run() 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.run(main()) 31 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_bs_better.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # You don't need to import RequestQueue anymore. 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | url = context.request.url 13 | title = context.soup.title.string if context.soup.title else '' 14 | context.log.info(f'The title of {url} is: {title}.') 15 | 16 | # Start the crawler with the provided URLs. 17 | await crawler.run(['https://crawlee.dev/']) 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(main()) 22 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/02_request_queue.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.storages import RequestQueue 4 | 5 | 6 | async def main() -> None: 7 | # First you create the request queue instance. 8 | rq = await RequestQueue.open() 9 | 10 | # And then you add one or more requests to it. 11 | await rq.add_request('https://crawlee.dev') 12 | 13 | 14 | if __name__ == '__main__': 15 | asyncio.run(main()) 16 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_enqueue_strategy.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url}.') 12 | 13 | # See the `EnqueueStrategy` type alias for more strategy options. 14 | # highlight-next-line 15 | await context.enqueue_links( 16 | # highlight-next-line 17 | strategy='same-domain', 18 | # highlight-next-line 19 | ) 20 | 21 | await crawler.run(['https://crawlee.dev/']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_finding_new_links.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # Let's limit our crawls to make our tests shorter and safer. 
8 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | url = context.request.url 13 | title = context.soup.title.string if context.soup.title else '' 14 | context.log.info(f'The title of {url} is: {title}.') 15 | 16 | # The enqueue_links function is available as one of the fields of the context. 17 | # It is also context aware, so it does not require any parameters. 18 | await context.enqueue_links() 19 | 20 | await crawler.run(['https://crawlee.dev/']) 21 | 22 | 23 | if __name__ == '__main__': 24 | asyncio.run(main()) 25 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_globs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee import Glob 4 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 12 | context.log.info(f'Processing {context.request.url}.') 13 | 14 | # Enqueue links that match the 'include' glob pattern and 15 | # do not match the 'exclude' glob pattern. 16 | # highlight-next-line 17 | await context.enqueue_links( 18 | # highlight-next-line 19 | include=[Glob('https://someplace.com/**/cats')], 20 | # highlight-next-line 21 | exclude=[Glob('https://**/archive/**')], 22 | # highlight-next-line 23 | ) 24 | 25 | await crawler.run(['https://crawlee.dev/']) 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/03_original_code.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = BeautifulSoupCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 11 | url = context.request.url 12 | title = context.soup.title.string if context.soup.title else '' 13 | context.log.info(f'The title of {url} is: {title}.') 14 | 15 | await crawler.run(['https://crawlee.dev/']) 16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/04_sanity_check.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript. 4 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 5 | 6 | 7 | async def main() -> None: 8 | crawler = PlaywrightCrawler() 9 | 10 | @crawler.router.default_handler 11 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 12 | # Wait for the collection cards to render on the page. This ensures that 13 | # the elements we want to interact with are present in the DOM. 
14 | await context.page.wait_for_selector('.collection-block-item') 15 | 16 | # Execute a function within the browser context to target the collection 17 | # card elements and extract their text content, trimming any leading or 18 | # trailing whitespace. 19 | category_texts = await context.page.eval_on_selector_all( 20 | '.collection-block-item', 21 | '(els) => els.map(el => el.textContent.trim())', 22 | ) 23 | 24 | # Log the extracted texts. 25 | for i, text in enumerate(category_texts): 26 | context.log.info(f'CATEGORY_{i + 1}: {text}') 27 | 28 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/05_crawling_listing.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler() 8 | 9 | @crawler.router.default_handler 10 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 11 | context.log.info(f'Processing {context.request.url}') 12 | 13 | # Wait for the category cards to render on the page. This ensures that 14 | # the elements we want to interact with are present in the DOM. 15 | await context.page.wait_for_selector('.collection-block-item') 16 | 17 | # Enqueue links found within elements that match the specified selector. 18 | # These links will be added to the crawling queue with the label CATEGORY. 19 | await context.enqueue_links( 20 | selector='.collection-block-item', 21 | label='CATEGORY', 22 | ) 23 | 24 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.run(main()) 29 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/07_first_code.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | from crawlee.storages import Dataset 5 | 6 | # ... 7 | 8 | 9 | async def main() -> None: 10 | crawler = PlaywrightCrawler() 11 | dataset = await Dataset.open() 12 | 13 | # ... 14 | 15 | @crawler.router.default_handler 16 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 17 | ... 18 | # ... 19 | 20 | 21 | if __name__ == '__main__': 22 | asyncio.run(main()) 23 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/08_main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | 5 | from .routes import router 6 | 7 | 8 | async def main() -> None: 9 | crawler = PlaywrightCrawler( 10 | # Let's limit our crawls to make our tests shorter and safer. 11 | max_requests_per_crawl=10, 12 | # Provide our router instance to the crawler. 
13 | request_handler=router, 14 | ) 15 | 16 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 17 | 18 | 19 | if __name__ == '__main__': 20 | asyncio.run(main()) 21 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/09_apify_sdk.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | # highlight-next-line 4 | from apify import Actor 5 | 6 | from crawlee.crawlers import PlaywrightCrawler 7 | 8 | from .routes import router 9 | 10 | 11 | async def main() -> None: 12 | # highlight-next-line 13 | async with Actor: 14 | crawler = PlaywrightCrawler( 15 | # Let's limit our crawls to make our tests shorter and safer. 16 | max_requests_per_crawl=10, 17 | # Provide our router instance to the crawler. 18 | request_handler=router, 19 | ) 20 | 21 | await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /docs/introduction/code_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/docs/introduction/code_examples/__init__.py -------------------------------------------------------------------------------- /docs/introduction/code_examples/routes.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | -------------------------------------------------------------------------------- /docs/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Line length different from the rest of the code to make sure that the example code visualised on the generated 2 | # documentation webpages is shown without a horizontal scrollbar, to make it more readable. 3 | 4 | [tool.ruff] 5 | # Inherit all from project top configuration file. 6 | extend = "../pyproject.toml" 7 | 8 | # Override just line length 9 | line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need a scrollbar. 10 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/beautifulsoup_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # BeautifulSoupCrawler crawls the web using HTTP requests 8 | # and parses HTML using the BeautifulSoup library. 9 | crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': context.soup.title.string if context.soup.title else None, 20 | } 21 | # Store the extracted data.
22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/parsel_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import ParselCrawler, ParselCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # ParselCrawler crawls the web using HTTP requests 8 | # and parses HTML using the Parsel library. 9 | crawler = ParselCrawler(max_requests_per_crawl=10) 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: ParselCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': context.selector.xpath('//title/text()').get(), 20 | } 21 | # Store the extracted data. 22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/playwright_crawler_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext 4 | 5 | 6 | async def main() -> None: 7 | # PlaywrightCrawler crawls the web using a headless browser 8 | # controlled by the Playwright library. 9 | crawler = PlaywrightCrawler() 10 | 11 | # Define a request handler to process each crawled page 12 | # and attach it to the crawler using a decorator. 13 | @crawler.router.default_handler 14 | async def request_handler(context: PlaywrightCrawlingContext) -> None: 15 | context.log.info(f'Processing {context.request.url} ...') 16 | # Extract relevant data from the page context. 17 | data = { 18 | 'url': context.request.url, 19 | 'title': await context.page.title(), 20 | } 21 | # Store the extracted data. 22 | await context.push_data(data) 23 | # Extract links from the current page and add them to the crawling queue. 24 | await context.enqueue_links() 25 | 26 | # Add first URL to the queue and start the crawl. 27 | await crawler.run(['https://crawlee.dev']) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /docs/quick-start/code_examples/playwright_crawler_headful_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from crawlee.crawlers import PlaywrightCrawler 4 | 5 | 6 | async def main() -> None: 7 | crawler = PlaywrightCrawler( 8 | # Run with a visible browser window. 9 | # highlight-next-line 10 | headless=False, 11 | # Switch to the Firefox browser. 12 | browser_type='firefox', 13 | ) 14 | 15 | # ... 
16 | 17 | 18 | if __name__ == '__main__': 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":semanticCommitTypeAll(chore)"], 3 | "pinVersions": false, 4 | "separateMajorMinor": false, 5 | "dependencyDashboard": false, 6 | "semanticCommits": "enabled", 7 | "lockFileMaintenance": { 8 | "enabled": true, 9 | "automerge": true, 10 | "automergeType": "branch" 11 | }, 12 | "packageRules": [ 13 | { 14 | "matchPaths": ["pyproject.toml"], 15 | "matchDepTypes": ["devDependencies"], 16 | "matchUpdateTypes": ["major", "minor"], 17 | "groupName": "major/minor dev dependencies", 18 | "groupSlug": "dev-dependencies", 19 | "automerge": true, 20 | "automergeType": "branch" 21 | } 22 | ], 23 | "schedule": ["before 7am every weekday"], 24 | "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"] 25 | } 26 | -------------------------------------------------------------------------------- /src/crawlee/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import metadata 2 | 3 | from ._request import Request, RequestOptions 4 | from ._service_locator import service_locator 5 | from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason 6 | from ._utils.globs import Glob 7 | 8 | __version__ = metadata.version('crawlee') 9 | 10 | __all__ = [ 11 | 'ConcurrencySettings', 12 | 'EnqueueStrategy', 13 | 'Glob', 14 | 'HttpHeaders', 15 | 'Request', 16 | 'RequestOptions', 17 | 'RequestTransformAction', 18 | 'SkippedReason', 19 | 'service_locator', 20 | ] 21 | -------------------------------------------------------------------------------- /src/crawlee/_autoscaling/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoscaled_pool import AutoscaledPool 2 | from .snapshotter import Snapshotter 3 | from .system_status import SystemStatus 4 | 5 | __all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus'] 6 | -------------------------------------------------------------------------------- /src/crawlee/_autoscaling/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/_autoscaling/py.typed -------------------------------------------------------------------------------- /src/crawlee/_consts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | METADATA_FILENAME = '__metadata__.json' 4 | -------------------------------------------------------------------------------- /src/crawlee/_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/_utils/__init__.py -------------------------------------------------------------------------------- /src/crawlee/_utils/blocked.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts 4 | 5 | CLOUDFLARE_RETRY_CSS_SELECTORS = [ 6 | '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]', 7 | 
] 8 | 9 | RETRY_CSS_SELECTORS = [ 10 | *CLOUDFLARE_RETRY_CSS_SELECTORS, 11 | 'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]', 12 | 'iframe[src*="_Incapsula_Resource"]', 13 | ] 14 | """ 15 | CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked. 16 | """ 17 | 18 | ROTATE_PROXY_ERRORS = [ 19 | 'ECONNRESET', 20 | 'ECONNREFUSED', 21 | 'ERR_PROXY_CONNECTION_FAILED', 22 | 'ERR_TUNNEL_CONNECTION_FAILED', 23 | 'Proxy responded with', 24 | ] 25 | """ 26 | Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning. 27 | """ 28 | -------------------------------------------------------------------------------- /src/crawlee/_utils/crypto.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import secrets 4 | from hashlib import sha256 5 | 6 | 7 | def compute_short_hash(data: bytes, *, length: int = 8) -> str: 8 | """Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it. 9 | 10 | Args: 11 | data: The binary data to be hashed. 12 | length: The length of the hash to be returned. 13 | 14 | Returns: 15 | A substring (prefix) of the hexadecimal hash of the data. 16 | """ 17 | hash_object = sha256(data) 18 | return hash_object.hexdigest()[:length] 19 | 20 | 21 | def crypto_random_object_id(length: int = 17) -> str: 22 | """Generate a random object ID.""" 23 | chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789' 24 | return ''.join(secrets.choice(chars) for _ in range(length)) 25 | -------------------------------------------------------------------------------- /src/crawlee/_utils/docs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Callable, Literal 4 | 5 | GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Event payloads', 'Errors', 'Functions'] 6 | 7 | 8 | def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001 9 | """Mark a symbol for rendering and grouping in documentation. 10 | 11 | This decorator is used solely for documentation purposes and does not modify the behavior 12 | of the decorated callable. 13 | 14 | Args: 15 | group_name: The documentation group to which the symbol belongs. 16 | 17 | Returns: 18 | The original callable without modification. 19 | """ 20 | 21 | def wrapper(func: Callable) -> Callable: 22 | return func 23 | 24 | return wrapper 25 | -------------------------------------------------------------------------------- /src/crawlee/_utils/html_to_text.py: -------------------------------------------------------------------------------- 1 | # This file contains shared constants used by different implementations of html_to_text function. 
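# Besides the tag sets below, it also defines a few helper regular expressions used for whitespace handling.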
2 | from __future__ import annotations 3 | 4 | import re 5 | 6 | # Tags based on Javascript implementation of htmlToText from: 7 | # https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11 8 | # Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9 9 | SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'} 10 | BLOCK_TAGS = { 11 | 'p', 12 | 'h1', 13 | 'h2', 14 | 'h3', 15 | 'h4', 16 | 'h5', 17 | 'h6', 18 | 'ol', 19 | 'ul', 20 | 'li', 21 | 'pre', 22 | 'address', 23 | 'blockquote', 24 | 'dl', 25 | 'div', 26 | 'fieldset', 27 | 'form', 28 | 'table', 29 | 'tr', 30 | 'select', 31 | 'option', 32 | } 33 | 34 | _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$') 35 | _EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$') 36 | _ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+') 37 | -------------------------------------------------------------------------------- /src/crawlee/_utils/measure_time.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | from contextlib import contextmanager 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | if TYPE_CHECKING: 9 | from collections.abc import Iterator 10 | 11 | 12 | @dataclass 13 | class TimerResult: 14 | wall: float | None = None 15 | cpu: float | None = None 16 | 17 | 18 | @contextmanager 19 | def measure_time() -> Iterator[TimerResult]: 20 | """Measure the execution time (wall-clock and CPU) between the start and end of the with-block.""" 21 | result = TimerResult() 22 | before_wall = time.monotonic() 23 | before_cpu = time.thread_time() 24 | 25 | try: 26 | yield result 27 | finally: 28 | after_wall = time.monotonic() 29 | after_cpu = time.thread_time() 30 | result.wall = after_wall - before_wall 31 | result.cpu = after_cpu - before_cpu 32 | -------------------------------------------------------------------------------- /src/crawlee/_utils/urls.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydantic import AnyHttpUrl, TypeAdapter 4 | from yarl import URL 5 | 6 | 7 | def is_url_absolute(url: str) -> bool: 8 | """Check if a URL is absolute.""" 9 | url_parsed = URL(url) 10 | 11 | # We don't use .absolute because in yarl.URL, it is always True for links that start with '//' 12 | return bool(url_parsed.scheme) and bool(url_parsed.raw_authority) 13 | 14 | 15 | def convert_to_absolute_url(base_url: str, relative_url: str) -> str: 16 | """Convert a relative URL to an absolute URL using a base URL.""" 17 | return str(URL(base_url).join(URL(relative_url))) 18 | 19 | 20 | _http_url_adapter = TypeAdapter(AnyHttpUrl) 21 | 22 | 23 | def validate_http_url(value: str | None) -> str | None: 24 | """Validate the given HTTP URL. 25 | 26 | Raises: 27 | pydantic.ValidationError: If the URL is not valid. 
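    Example:
        >>> validate_http_url('https://crawlee.dev')
        'https://crawlee.dev'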
28 | """ 29 | if value is not None: 30 | _http_url_adapter.validate_python(value) 31 | 32 | return value 33 | -------------------------------------------------------------------------------- /src/crawlee/_utils/web.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def is_status_code_client_error(value: int) -> bool: 5 | """Return `True` for 4xx status codes, `False` otherwise.""" 6 | return 400 <= value <= 499 # noqa: PLR2004 7 | 8 | 9 | def is_status_code_server_error(value: int) -> bool: 10 | """Return `True` for 5xx status codes, `False` otherwise.""" 11 | return value >= 500 # noqa: PLR2004 12 | -------------------------------------------------------------------------------- /src/crawlee/browsers/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402, TID252 2 | 3 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 4 | from crawlee._utils.try_import import try_import as _try_import 5 | 6 | _install_import_hook(__name__) 7 | 8 | # Due to patch_browserforge 9 | from .._browserforge_workaround import patch_browserforge 10 | 11 | patch_browserforge() 12 | 13 | # The following imports are wrapped in try_import to handle optional dependencies, 14 | # ensuring the module can still function even if these dependencies are missing. 15 | with _try_import(__name__, 'BrowserPool'): 16 | from ._browser_pool import BrowserPool 17 | with _try_import(__name__, 'PlaywrightBrowserController'): 18 | from ._playwright_browser_controller import PlaywrightBrowserController 19 | with _try_import(__name__, 'PlaywrightBrowserPlugin'): 20 | from ._playwright_browser_plugin import PlaywrightBrowserPlugin 21 | 22 | __all__ = [ 23 | 'BrowserPool', 24 | 'PlaywrightBrowserController', 25 | 'PlaywrightBrowserPlugin', 26 | ] 27 | -------------------------------------------------------------------------------- /src/crawlee/browsers/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from typing import TYPE_CHECKING, Literal 5 | 6 | if TYPE_CHECKING: 7 | from playwright.async_api import Page 8 | 9 | BrowserType = Literal['chromium', 'firefox', 'webkit'] 10 | 11 | 12 | @dataclass 13 | class CrawleePage: 14 | """Represents a page object within a browser, with additional metadata for tracking and management.""" 15 | 16 | id: str 17 | browser_type: BrowserType 18 | page: Page 19 | -------------------------------------------------------------------------------- /src/crawlee/browsers/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/browsers/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_abstract_http/__init__.py: -------------------------------------------------------------------------------- 1 | from ._abstract_http_crawler import AbstractHttpCrawler 2 | from ._abstract_http_parser import AbstractHttpParser 3 | from ._http_crawling_context import ParsedHttpCrawlingContext 4 | 5 | __all__ = [ 6 | 'AbstractHttpCrawler', 7 | 'AbstractHttpParser', 8 | 'ParsedHttpCrawlingContext', 9 | ] 10 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_abstract_http/py.typed: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_abstract_http/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_adaptive_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | # These imports have only mandatory dependencies, so they are imported directly. 5 | from ._adaptive_playwright_crawling_context import ( 6 | AdaptivePlaywrightCrawlingContext, 7 | AdaptivePlaywrightPreNavCrawlingContext, 8 | ) 9 | 10 | _install_import_hook(__name__) 11 | 12 | # The following imports are wrapped in try_import to handle optional dependencies, 13 | # ensuring the module can still function even if these dependencies are missing. 14 | with _try_import(__name__, 'RenderingTypePredictor'): 15 | from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor 16 | with _try_import(__name__, 'AdaptivePlaywrightCrawler'): 17 | from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler 18 | 19 | __all__ = [ 20 | 'AdaptivePlaywrightCrawler', 21 | 'AdaptivePlaywrightCrawlingContext', 22 | 'AdaptivePlaywrightPreNavCrawlingContext', 23 | 'RenderingType', 24 | 'RenderingTypePrediction', 25 | 'RenderingTypePredictor', 26 | ] 27 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Annotated 4 | 5 | from pydantic import ConfigDict, Field 6 | 7 | from crawlee._utils.docs import docs_group 8 | from crawlee.statistics import StatisticsState 9 | 10 | 11 | @docs_group('Data structures') 12 | class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): 13 | """Statistics data about a crawler run with additional information related to adaptive crawling.""" 14 | 15 | model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants') 16 | 17 | http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 18 | """Number representing how many times static HTTP-based crawling was used.""" 19 | 20 | browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 21 | """Number representing how many times browser-based crawling was used.""" 22 | 23 | rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 24 | """Number representing how many times the predictor gave an incorrect prediction.""" 25 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_basic/__init__.py: -------------------------------------------------------------------------------- 1 | from ._basic_crawler import BasicCrawler, BasicCrawlerOptions 2 | from ._basic_crawling_context import BasicCrawlingContext 3 | from ._context_pipeline import ContextPipeline 4 | 5 | __all__ = [ 6 | 'BasicCrawler', 7 | 'BasicCrawlerOptions', 8 | 'BasicCrawlingContext', 9 | 'ContextPipeline', 10 | ] 11 | --------------------------------------------------------------------------------
/src/crawlee/crawlers/_basic/_basic_crawling_context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # Do just the re-export because of the circular imports. 4 | from crawlee._types import BasicCrawlingContext # noqa: F401 5 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_basic/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_basic/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'BeautifulSoupCrawler'): 9 | from ._beautifulsoup_crawler import BeautifulSoupCrawler 10 | with _try_import(__name__, 'BeautifulSoupCrawlingContext'): 11 | from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext 12 | with _try_import(__name__, 'BeautifulSoupParserType'): 13 | from ._beautifulsoup_parser import BeautifulSoupParserType 14 | 15 | __all__ = [ 16 | 'BeautifulSoupCrawler', 17 | 'BeautifulSoupCrawlingContext', 18 | 'BeautifulSoupParserType', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, fields 2 | 3 | from bs4 import BeautifulSoup 4 | from typing_extensions import Self 5 | 6 | from crawlee._utils.docs import docs_group 7 | from crawlee.crawlers import ParsedHttpCrawlingContext 8 | 9 | from ._utils import html_to_text 10 | 11 | 12 | @dataclass(frozen=True) 13 | @docs_group('Data structures') 14 | class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): 15 | """The crawling context used by the `BeautifulSoupCrawler`. 16 | 17 | It provides access to key objects as well as utility functions for handling crawling tasks. 
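    A typical request handler reads the parsed page through the `soup` property (for example
    `context.soup.title.string`) and stores results with `await context.push_data(...)`.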
18 | """ 19 | 20 | @property 21 | def soup(self) -> BeautifulSoup: 22 | """Convenience alias.""" 23 | return self.parsed_content 24 | 25 | @classmethod 26 | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: 27 | """Initialize a new instance from an existing `ParsedHttpCrawlingContext`.""" 28 | return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) 29 | 30 | def html_to_text(self) -> str: 31 | """Convert the parsed HTML content to newline-separated plain text without tags.""" 32 | return html_to_text(self.parsed_content) 33 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_beautifulsoup/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/_beautifulsoup/py.typed -------------------------------------------------------------------------------- /src/crawlee/crawlers/_http/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext 2 | from crawlee.http_clients import HttpCrawlingResult 3 | 4 | from ._http_crawler import HttpCrawler 5 | 6 | __all__ = [ 7 | 'HttpCrawler', 8 | 'HttpCrawlingContext', 9 | 'HttpCrawlingResult', 10 | ] 11 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_parsel/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'ParselCrawler'): 9 | from ._parsel_crawler import ParselCrawler 10 | with _try_import(__name__, 'ParselCrawlingContext'): 11 | from ._parsel_crawling_context import ParselCrawlingContext 12 | 13 | __all__ = [ 14 | 'ParselCrawler', 15 | 'ParselCrawlingContext', 16 | ] 17 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_parsel/_parsel_crawling_context.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, fields 2 | 3 | from parsel import Selector 4 | from typing_extensions import Self 5 | 6 | from crawlee._utils.docs import docs_group 7 | from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext 8 | 9 | from ._utils import html_to_text 10 | 11 | 12 | @dataclass(frozen=True) 13 | @docs_group('Data structures') 14 | class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]): 15 | """The crawling context used by the `ParselCrawler`. 16 | 17 | It provides access to key objects as well as utility functions for handling crawling tasks. 
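    A typical request handler queries the page through the `selector` property (for example
    `context.selector.xpath('//title/text()').get()`) and stores results with `await context.push_data(...)`.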
18 | """ 19 | 20 | @property 21 | def selector(self) -> Selector: 22 | """Convenience alias.""" 23 | return self.parsed_content 24 | 25 | @classmethod 26 | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self: 27 | """Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`.""" 28 | return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) 29 | 30 | def html_to_text(self) -> str: 31 | """Convert the parsed HTML content to newline-separated plain text without tags.""" 32 | return html_to_text(self.parsed_content) 33 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | _install_import_hook(__name__) 5 | 6 | # The following imports are wrapped in try_import to handle optional dependencies, 7 | # ensuring the module can still function even if these dependencies are missing. 8 | with _try_import(__name__, 'PlaywrightCrawler'): 9 | from ._playwright_crawler import PlaywrightCrawler 10 | with _try_import(__name__, 'PlaywrightCrawlingContext'): 11 | from ._playwright_crawling_context import PlaywrightCrawlingContext 12 | with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'): 13 | from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext 14 | 15 | __all__ = [ 16 | 'PlaywrightCrawler', 17 | 'PlaywrightCrawlingContext', 18 | 'PlaywrightPreNavCrawlingContext', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass(frozen=True) 7 | class BlockedInfo: 8 | """Information about whether the crawling is blocked. 
If reason is empty, then it means it is not blocked.""" 9 | 10 | reason: str 11 | 12 | def __bool__(self) -> bool: 13 | """No reason means no blocking.""" 14 | return bool(self.reason) 15 | -------------------------------------------------------------------------------- /src/crawlee/crawlers/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/crawlers/py.typed -------------------------------------------------------------------------------- /src/crawlee/events/__init__.py: -------------------------------------------------------------------------------- 1 | from ._event_manager import EventManager 2 | from ._local_event_manager import LocalEventManager 3 | from ._types import ( 4 | Event, 5 | EventAbortingData, 6 | EventData, 7 | EventExitData, 8 | EventListener, 9 | EventMigratingData, 10 | EventPersistStateData, 11 | EventSystemInfoData, 12 | ) 13 | 14 | __all__ = [ 15 | 'Event', 16 | 'EventAbortingData', 17 | 'EventData', 18 | 'EventExitData', 19 | 'EventListener', 20 | 'EventManager', 21 | 'EventMigratingData', 22 | 'EventPersistStateData', 23 | 'EventSystemInfoData', 24 | 'LocalEventManager', 25 | ] 26 | -------------------------------------------------------------------------------- /src/crawlee/events/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/events/py.typed -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402, TID252 2 | 3 | # Due to patch_browserforge 4 | from .._browserforge_workaround import patch_browserforge 5 | 6 | patch_browserforge() 7 | 8 | from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator 9 | from ._fingerprint_generator import FingerprintGenerator 10 | from ._header_generator import HeaderGenerator 11 | from ._types import HeaderGeneratorOptions, ScreenOptions 12 | 13 | __all__ = [ 14 | 'DefaultFingerprintGenerator', 15 | 'FingerprintGenerator', 16 | 'HeaderGenerator', 17 | 'HeaderGeneratorOptions', 18 | 'ScreenOptions', 19 | ] 20 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/_consts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # ruff: noqa: E501 4 | 5 | COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9' 6 | 7 | BROWSER_TYPE_HEADER_KEYWORD = { 8 | 'chromium': {'Chrome', 'CriOS'}, 9 | 'firefox': {'Firefox', 'FxiOS'}, 10 | 'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'}, 11 | 'webkit': {'Safari'}, 12 | } 13 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/_fingerprint_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | from crawlee._utils.docs import docs_group 7 | 8 | if TYPE_CHECKING: 9 | from browserforge.fingerprints import Fingerprint 10 | 11 | 12 | @docs_group('Abstract classes') 13 | class FingerprintGenerator(ABC): 14 | """A class for creating browser fingerprints that mimic 
browser fingerprints of real users.""" 15 | 16 | @abstractmethod 17 | def generate(self) -> Fingerprint: 18 | """Generate browser fingerprints. 19 | 20 | This is experimental feature. 21 | Return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely 22 | it will change to custom `Fingerprint` class defined in this repo later. 23 | """ 24 | -------------------------------------------------------------------------------- /src/crawlee/fingerprint_suite/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/fingerprint_suite/py.typed -------------------------------------------------------------------------------- /src/crawlee/http_clients/__init__.py: -------------------------------------------------------------------------------- 1 | from crawlee._utils.try_import import install_import_hook as _install_import_hook 2 | from crawlee._utils.try_import import try_import as _try_import 3 | 4 | # These imports have only mandatory dependencies, so they are imported directly. 5 | from ._base import HttpClient, HttpCrawlingResult, HttpResponse 6 | from ._httpx import HttpxHttpClient 7 | 8 | _install_import_hook(__name__) 9 | 10 | # The following imports are wrapped in try_import to handle optional dependencies, 11 | # ensuring the module can still function even if these dependencies are missing. 12 | with _try_import(__name__, 'CurlImpersonateHttpClient'): 13 | from ._curl_impersonate import CurlImpersonateHttpClient 14 | 15 | 16 | __all__ = [ 17 | 'CurlImpersonateHttpClient', 18 | 'HttpClient', 19 | 'HttpCrawlingResult', 20 | 'HttpResponse', 21 | 'HttpxHttpClient', 22 | ] 23 | -------------------------------------------------------------------------------- /src/crawlee/project_template/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "crawlee-python-project", 3 | "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", 4 | "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"], 5 | "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", 6 | "http_client": ["httpx", "curl-impersonate"], 7 | "package_manager": ["poetry", "pip", "uv", "manual"], 8 | "enable_apify_integration": false, 9 | "start_url": "https://crawlee.dev", 10 | "_jinja2_env_vars": { 11 | "line_statement_prefix": "# %" 12 | }, 13 | "_extensions": ["jinja2.ext.do"] 14 | } 15 | -------------------------------------------------------------------------------- /src/crawlee/project_template/hooks/pre_gen_project.py: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.package_manager in ['poetry', 'uv'] 2 | import subprocess 3 | import shutil 4 | import re 5 | import sys 6 | 7 | manager = "{{cookiecutter.package_manager}}" 8 | manager_text = manager.title() 9 | # % if cookiecutter.package_manager == 'poetry' 10 | version_regex = r'Poetry \(version 2\..*\)' 11 | r_version = '2.x' 12 | # % elif cookiecutter.package_manager == 'uv' 13 | version_regex = r'uv (0\..*)' 14 | r_version = '0.x' 15 | # % endif 16 | 17 | # Check if package manager is available in PATH 18 | if not shutil.which(manager): 19 | sys.stderr.write(f'\nError: You selected {manager_text} as your package manager, but it is not installed. 
Please install it and try again.\n') 20 | sys.exit(1) 21 | 22 | # Check if the package manager is executable 23 | try: 24 | version = subprocess.check_output([manager, '--version']).decode().strip() 25 | except OSError: 26 | sys.stderr.write(f'\nError: Your selected package manager {manager_text} was found but failed to execute.\n') 27 | sys.exit(1) 28 | 29 | # Check if the version matches the required regex 30 | if not re.match(version_regex, version): 31 | sys.stderr.write(f'\nError: Your selected package manager {manager_text} requires version {r_version}, but {version} is installed.\n') 32 | sys.exit(1) 33 | # % endif 34 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import BeautifulSoupCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = BeautifulSoupCrawler( 9 | request_handler=router, 10 | max_requests_per_crawl=10, 11 | {{ self.http_client_instantiation() }}) 12 | # % endblock 13 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_parsel.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import ParselCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = ParselCrawler( 9 | request_handler=router, 10 | max_requests_per_crawl=10, 11 | {{ self.http_client_instantiation() }}) 12 | # % endblock 13 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/main_playwright.py: -------------------------------------------------------------------------------- 1 | # % extends 'main.py' 2 | 3 | # % block import 4 | from crawlee.crawlers import PlaywrightCrawler 5 | # % endblock 6 | 7 | # % block instantiation 8 | crawler = PlaywrightCrawler( 9 | request_handler=router, 10 | headless=True, 11 | max_requests_per_crawl=10, 12 | {{ self.http_client_instantiation() }}) 13 | # % endblock 14 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import BeautifulSoupCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[BeautifulSoupCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: BeautifulSoupCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = context.soup.find('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': title.text if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_camoufox.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 
| """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_parsel.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import ParselCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[ParselCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: ParselCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = context.selector.xpath('//title/text()').get() 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': title, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_playwright.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/templates/routes_playwright_camoufox.py: -------------------------------------------------------------------------------- 1 | from crawlee.crawlers import PlaywrightCrawlingContext 2 | from crawlee.router import Router 3 | 4 | router = Router[PlaywrightCrawlingContext]() 5 | 6 | 7 | @router.default_handler 8 | async def default_handler(context: PlaywrightCrawlingContext) -> None: 9 | """Default request handler.""" 10 | context.log.info(f'Processing {context.request.url} ...') 11 | title = await context.page.query_selector('title') 12 | await context.push_data( 13 | { 14 | 'url': context.request.loaded_url, 15 | 'title': await title.inner_text() if title else None, 16 | } 17 | ) 18 | 19 | await context.enqueue_links() 20 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 2 | # % set extras = ['playwright'] 3 | # % else 4 | # % set extras = [cookiecutter.crawler_type] 5 | # % endif 6 | # % if cookiecutter.http_client == 'curl-impersonate' 7 | # % do 
extras.append('curl-impersonate') 8 | # % endif 9 | 10 | [project] 11 | name = "{{cookiecutter.project_name}}" 12 | version = "0.0.1" 13 | description = "" 14 | authors = [ 15 | {name = "Your Name",email = "you@example.com"} 16 | ] 17 | readme = "README.md" 18 | requires-python = ">=3.9,<4.0" 19 | dependencies = [ 20 | "crawlee[{{ extras|join(',') }}]", 21 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 22 | "camoufox[geoip]~=0.4.5", 23 | # % endif 24 | # % if cookiecutter.enable_apify_integration 25 | "apify", 26 | # % endif 27 | ] 28 | 29 | # % if cookiecutter.package_manager == 'poetry' 30 | [tool.poetry] 31 | package-mode = false 32 | 33 | [build-system] 34 | requires = ["poetry-core>=2.0.0,<3.0.0"] 35 | build-backend = "poetry.core.masonry.api" 36 | # % endif 37 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt: -------------------------------------------------------------------------------- 1 | # % if cookiecutter.crawler_type == 'playwright-camoufox' 2 | camoufox[geoip]~=0.4.5 3 | # % set extras = ['playwright'] 4 | # % else 5 | # % set extras = [cookiecutter.crawler_type] 6 | # % endif 7 | # % if cookiecutter.enable_apify_integration 8 | apify 9 | # % endif 10 | # % if cookiecutter.http_client == 'curl-impersonate' 11 | # % do extras.append('curl-impersonate') 12 | # % endif 13 | crawlee[{{ extras | join(',') }}] 14 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | # % if cookiecutter.http_client == 'curl-impersonate' 3 | import platform 4 | # % if 'playwright' in cookiecutter.crawler_type 5 | import warnings 6 | # % endif 7 | # % endif 8 | {{ '' }} 9 | from .main import main 10 | 11 | if __name__ == '__main__': 12 | # % if cookiecutter.http_client == 'curl-impersonate' 13 | if platform.system() == 'Windows': 14 | # This mitigates a warning raised by curl-cffi. 15 | # % if 'playwright' in cookiecutter.crawler_type 16 | warnings.warn( 17 | message=('curl-cffi suggests using WindowsSelectorEventLoopPolicy, but this conflicts with Playwright. 
' 18 | 'Ignore the curl-cffi warning.'), 19 | category=UserWarning, 20 | stacklevel=2, 21 | ) 22 | # % else 23 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 24 | # % endif 25 | # % endif 26 | {{ '' }} 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py: -------------------------------------------------------------------------------- 1 | # % include 'main_%s.py' % cookiecutter.__crawler_type 2 | -------------------------------------------------------------------------------- /src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py: -------------------------------------------------------------------------------- 1 | # % include 'routes_%s.py' % cookiecutter.__crawler_type 2 | -------------------------------------------------------------------------------- /src/crawlee/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/py.typed -------------------------------------------------------------------------------- /src/crawlee/request_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from ._request_list import RequestList 2 | from ._request_loader import RequestLoader 3 | from ._request_manager import RequestManager 4 | from ._request_manager_tandem import RequestManagerTandem 5 | 6 | __all__ = [ 7 | 'RequestList', 8 | 'RequestLoader', 9 | 'RequestManager', 10 | 'RequestManagerTandem', 11 | ] 12 | -------------------------------------------------------------------------------- /src/crawlee/sessions/__init__.py: -------------------------------------------------------------------------------- 1 | from ._cookies import CookieParam, SessionCookies 2 | from ._session import Session 3 | from ._session_pool import SessionPool 4 | 5 | __all__ = ['CookieParam', 'Session', 'SessionCookies', 'SessionPool'] 6 | -------------------------------------------------------------------------------- /src/crawlee/sessions/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/sessions/py.typed -------------------------------------------------------------------------------- /src/crawlee/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: A005 2 | 3 | from ._models import FinalStatistics, StatisticsState 4 | from ._statistics import Statistics 5 | 6 | __all__ = ['FinalStatistics', 'Statistics', 'StatisticsState'] 7 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/__init__.py: -------------------------------------------------------------------------------- 1 | from ._base import StorageClient 2 | from ._memory import MemoryStorageClient 3 | 4 | __all__ = ['MemoryStorageClient', 'StorageClient'] 5 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset_client import DatasetClient 2 | from ._dataset_collection_client import DatasetCollectionClient 3 | 
from ._key_value_store_client import KeyValueStoreClient 4 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 5 | from ._request_queue_client import RequestQueueClient 6 | from ._request_queue_collection_client import RequestQueueCollectionClient 7 | from ._storage_client import StorageClient 8 | from ._types import ResourceClient, ResourceCollectionClient 9 | 10 | __all__ = [ 11 | 'DatasetClient', 12 | 'DatasetCollectionClient', 13 | 'KeyValueStoreClient', 14 | 'KeyValueStoreCollectionClient', 15 | 'RequestQueueClient', 16 | 'RequestQueueCollectionClient', 17 | 'ResourceClient', 18 | 'ResourceCollectionClient', 19 | 'StorageClient', 20 | ] 21 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/_types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Union 4 | 5 | from ._dataset_client import DatasetClient 6 | from ._dataset_collection_client import DatasetCollectionClient 7 | from ._key_value_store_client import KeyValueStoreClient 8 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 9 | from ._request_queue_client import RequestQueueClient 10 | from ._request_queue_collection_client import RequestQueueCollectionClient 11 | 12 | ResourceClient = Union[ 13 | DatasetClient, 14 | KeyValueStoreClient, 15 | RequestQueueClient, 16 | ] 17 | 18 | ResourceCollectionClient = Union[ 19 | DatasetCollectionClient, 20 | KeyValueStoreCollectionClient, 21 | RequestQueueCollectionClient, 22 | ] 23 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_base/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/_base/py.typed -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_memory/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset_client import DatasetClient 2 | from ._dataset_collection_client import DatasetCollectionClient 3 | from ._key_value_store_client import KeyValueStoreClient 4 | from ._key_value_store_collection_client import KeyValueStoreCollectionClient 5 | from ._memory_storage_client import MemoryStorageClient 6 | from ._request_queue_client import RequestQueueClient 7 | from ._request_queue_collection_client import RequestQueueCollectionClient 8 | 9 | __all__ = [ 10 | 'DatasetClient', 11 | 'DatasetCollectionClient', 12 | 'KeyValueStoreClient', 13 | 'KeyValueStoreCollectionClient', 14 | 'MemoryStorageClient', 15 | 'RequestQueueClient', 16 | 'RequestQueueCollectionClient', 17 | ] 18 | -------------------------------------------------------------------------------- /src/crawlee/storage_clients/_memory/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/_memory/py.typed -------------------------------------------------------------------------------- /src/crawlee/storage_clients/py.typed: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storage_clients/py.typed -------------------------------------------------------------------------------- /src/crawlee/storages/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dataset import Dataset 2 | from ._key_value_store import KeyValueStore 3 | from ._request_queue import RequestQueue 4 | 5 | __all__ = [ 6 | 'Dataset', 7 | 'KeyValueStore', 8 | 'RequestQueue', 9 | ] 10 | -------------------------------------------------------------------------------- /src/crawlee/storages/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/src/crawlee/storages/py.typed -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/__init__.py -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/unit/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests 2 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/_statistics/test_periodic_logging.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import logging 5 | from datetime import timedelta 6 | from typing import TYPE_CHECKING 7 | 8 | from crawlee.statistics import Statistics 9 | 10 | if TYPE_CHECKING: 11 | import pytest 12 | 13 | 14 | async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None: 15 | caplog.set_level(logging.INFO) 16 | 17 | log_message = 'Periodic statistics XYZ' 18 | statistics = Statistics.with_default_state(log_interval=timedelta(milliseconds=50), log_message=log_message) 19 | 20 | async with statistics: 21 | await asyncio.sleep(0.1) 22 | 23 | matching_records = [rec for rec in caplog.records if rec.message.startswith(log_message)] 24 | assert len(matching_records) >= 1 25 | -------------------------------------------------------------------------------- /tests/unit/_statistics/test_persistence.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee.statistics import Statistics 4 | 5 | 6 | async def test_basic_persistence() -> None: 7 | key = 'statistics_foo' 8 | 9 | async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: 10 | statistics.state.requests_failed = 42 11 | 12 | async with Statistics.with_default_state(persistence_enabled=True, 
persist_state_key=key) as statistics: 13 | pass 14 | 15 | assert statistics.state.requests_failed == 42 16 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_console.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.console import make_table 4 | 5 | 6 | def test_empty_input() -> None: 7 | assert make_table([]) == '' 8 | 9 | 10 | def test_empty_row() -> None: 11 | assert make_table([()]) == '' 12 | 13 | 14 | def test_single_column() -> None: 15 | result = make_table([('test',)]) 16 | lines = result.split('\n') 17 | assert len(lines) == 3 18 | assert lines[1] == '│ test │' 19 | 20 | 21 | def test_two_columns() -> None: 22 | data = [('Name', 'Age'), ('Alice', '30'), ('Bob', '25')] 23 | result = make_table(data) 24 | lines = result.split('\n') 25 | # fmt: off 26 | assert lines == ['┌───────┬─────┐', 27 | '│ Name │ Age │', 28 | '│ Alice │ 30 │', 29 | '│ Bob │ 25 │', 30 | '└───────┴─────┘'] 31 | # fmt: on 32 | 33 | 34 | def test_long_content_truncation() -> None: 35 | data = [('Short', 'VeryVeryVeryLongContent')] 36 | result = make_table(data, width=25) 37 | lines = result.split('\n') 38 | # fmt: off 39 | assert lines == ['┌───────────┬───────────┐', 40 | '│ Short │ VeryVe... │', 41 | '└───────────┴───────────┘'] 42 | # fmt: on 43 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_globs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.globs import Glob 4 | 5 | 6 | def test_asterisk() -> None: 7 | glob = Glob('foo/*') 8 | assert glob.regexp.match('bar/') is None 9 | assert glob.regexp.match('foo/bar') is not None 10 | assert glob.regexp.match('foo/bar/baz') is None 11 | 12 | 13 | def test_double_asteritsk() -> None: 14 | glob = Glob('foo/**') 15 | assert glob.regexp.match('bar/') is None 16 | assert glob.regexp.match('foo/bar') is not None 17 | assert glob.regexp.match('foo/bar/baz') is not None 18 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_measure_time.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import time 5 | 6 | from crawlee._utils.measure_time import measure_time 7 | 8 | 9 | def test_measure_time_wall_sync() -> None: 10 | with measure_time() as elapsed: 11 | time.sleep(0.1) 12 | 13 | assert elapsed.cpu is not None 14 | assert elapsed.wall is not None 15 | assert elapsed.wall >= 0.09 16 | 17 | 18 | def test_measure_time_cpu_sync() -> None: 19 | with measure_time() as elapsed: 20 | start = time.time() 21 | acc = 0 22 | 23 | while time.time() - start < 0.1: 24 | acc += 1 25 | acc *= acc 26 | 27 | assert elapsed.cpu is not None 28 | assert elapsed.wall is not None 29 | assert elapsed.cpu >= 0.05 30 | 31 | 32 | async def test_measure_time_wall_async() -> None: 33 | with measure_time() as elapsed: 34 | await asyncio.sleep(0.1) 35 | 36 | assert elapsed.cpu is not None 37 | assert elapsed.wall is not None 38 | assert elapsed.wall >= 0.09 39 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from crawlee._utils.byte_size import 
ByteSize 4 | from crawlee._utils.system import get_cpu_info, get_memory_info 5 | 6 | 7 | def test_get_memory_info_returns_valid_values() -> None: 8 | memory_info = get_memory_info() 9 | 10 | assert ByteSize(0) < memory_info.total_size < ByteSize.from_tb(1) 11 | assert memory_info.current_size < memory_info.total_size 12 | 13 | 14 | def test_get_cpu_info_returns_valid_values() -> None: 15 | cpu_info = get_cpu_info() 16 | assert 0 <= cpu_info.used_ratio <= 1 17 | -------------------------------------------------------------------------------- /tests/unit/_utils/test_timedelata_ms.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import timedelta 4 | from typing import Any 5 | 6 | import pytest 7 | from pydantic import BaseModel 8 | 9 | from crawlee._utils.models import timedelta_ms 10 | 11 | 12 | class _ModelWithTimedeltaMs(BaseModel): 13 | time_delta: timedelta_ms | None = None 14 | 15 | 16 | @pytest.mark.parametrize( 17 | ('time_delta_input', 'expected_time_delta', 'expected_model_dump_value'), 18 | [ 19 | (1.0, timedelta(milliseconds=1), 1), 20 | (1, timedelta(milliseconds=1), 1), 21 | ('1', timedelta(milliseconds=1), 1), 22 | (timedelta(milliseconds=1), timedelta(milliseconds=1), 1), 23 | (3.01, timedelta(microseconds=3010), 3), 24 | (3.5, timedelta(microseconds=3500), 4), 25 | (3.99, timedelta(microseconds=3990), 4), 26 | (None, None, None), 27 | (float('inf'), timedelta(days=999999999, seconds=3600 * 24 - 1, microseconds=999999), float('inf')), 28 | ], 29 | ) 30 | def test_model_with_timedelta_ms_input_types( 31 | time_delta_input: float | timedelta | Any | None, expected_time_delta: timedelta, expected_model_dump_value: int 32 | ) -> None: 33 | model = _ModelWithTimedeltaMs(time_delta=time_delta_input) 34 | assert model.time_delta == expected_time_delta 35 | assert model.model_dump() == {'time_delta': expected_model_dump_value} 36 | -------------------------------------------------------------------------------- /tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py -------------------------------------------------------------------------------- /tests/unit/events/test_local_event_manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from datetime import timedelta 5 | from functools import update_wrapper 6 | from typing import Any 7 | from unittest.mock import AsyncMock 8 | 9 | import pytest 10 | 11 | from crawlee.events import LocalEventManager 12 | from crawlee.events._types import Event, EventSystemInfoData 13 | 14 | 15 | @pytest.fixture 16 | def listener() -> AsyncMock: 17 | async def async_listener(payload: Any) -> None: 18 | pass 19 | 20 | al = AsyncMock() 21 | update_wrapper(al, async_listener) 22 | return al 23 | 24 | 25 | async def test_emit_system_info_event(listener: AsyncMock) -> None: 26 | async with LocalEventManager(system_info_interval=timedelta(milliseconds=50)) as event_manager: 27 | event_manager.on(event=Event.SYSTEM_INFO, listener=listener) 28 | await asyncio.sleep(0.2) 29 | 30 | assert listener.call_count >= 1 31 | assert isinstance(listener.call_args[0][0], 
EventSystemInfoData) 32 | -------------------------------------------------------------------------------- /tests/unit/server_endpoints.py: -------------------------------------------------------------------------------- 1 | # Test server response content for testing 2 | 3 | HELLO_WORLD = b"""\ 4 | <html><head> 5 | <title>Hello, world! 6 | 7 | 8 | """ 9 | 10 | START_ENQUEUE = b"""\ 11 | 12 | Hello 13 | 14 | 15 | Link 1 16 | Link 2 17 | """ 18 | 19 | SECONDARY_INDEX = b"""\ 20 | 21 | Hello 22 | 23 | 24 | Link 3 25 | Link 4 26 | """ 27 | 28 | INCAPSULA = b"""\ 29 | 30 | Hello 31 | 32 | 33 | 35 | """ 36 | 37 | GENERIC_RESPONSE = b"""\ 38 | 39 | Hello 40 | 41 | 42 | Insightful content 43 | """ 44 | 45 | 46 | ROBOTS_TXT = b"""\ 47 | User-agent: * 48 | Disallow: *deny_all/ 49 | Disallow: /page_ 50 | crawl-delay: 10 51 | 52 | User-agent: Googlebot 53 | Disallow: *deny_googlebot/ 54 | crawl-delay: 1 55 | 56 | user-agent: Mozilla 57 | crawl-delay: 2 58 | 59 | sitemap: http://not-exists.com/sitemap_1.xml 60 | sitemap: http://not-exists.com/sitemap_2.xml""" 61 | -------------------------------------------------------------------------------- /website/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "@apify/eslint-config-ts", 4 | "plugin:react/recommended", 5 | "plugin:react-hooks/recommended" 6 | ], 7 | "parserOptions": { 8 | "project": "./tsconfig.eslint.json", 9 | "ecmaFeatures": { 10 | "jsx": true 11 | }, 12 | "ecmaVersion": 2020 13 | }, 14 | "env": { 15 | "browser": true 16 | }, 17 | "settings": { 18 | "react": { 19 | "version": "detect" 20 | } 21 | }, 22 | "rules": { 23 | "quote-props": ["error", "consistent"], 24 | "no-void": 0 25 | }, 26 | "root": true 27 | } 28 | -------------------------------------------------------------------------------- /website/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | enableGlobalCache: true 3 | -------------------------------------------------------------------------------- /website/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /website/build_api_reference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate import shortcuts from the modules 4 | python generate_module_shortcuts.py 5 | -------------------------------------------------------------------------------- /website/patches/@docusaurus+core+3.4.0.patch: -------------------------------------------------------------------------------- 1 | diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 2 | index 903f8dc..b6b60bf 100644 3 | --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 4 | +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 5 | @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { 6 | window.scrollTo(0, 0); 7 | } 8 | else { 9 | - const id = decodeURIComponent(hash.substring(1)); 10 | - const element = document.getElementById(id); 11 | - element?.scrollIntoView(); 12 | + setTimeout(() => { 13 | + const id = decodeURIComponent(hash.substring(1)); 14 | + const element = 
document.getElementById(id); 15 | + element?.scrollIntoView(); 16 | + }, 100); 17 | } 18 | } 19 | function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { 20 | -------------------------------------------------------------------------------- /website/patches/@docusaurus+core+3.5.2.patch: -------------------------------------------------------------------------------- 1 | diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 2 | index 903f8dc..b6b60bf 100644 3 | --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 4 | +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js 5 | @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { 6 | window.scrollTo(0, 0); 7 | } 8 | else { 9 | - const id = decodeURIComponent(hash.substring(1)); 10 | - const element = document.getElementById(id); 11 | - element?.scrollIntoView(); 12 | + setTimeout(() => { 13 | + const id = decodeURIComponent(hash.substring(1)); 14 | + const element = document.getElementById(id); 15 | + element?.scrollIntoView(); 16 | + }, 100); 17 | } 18 | } 19 | function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { 20 | -------------------------------------------------------------------------------- /website/roa-loader/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "roa-loader", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "keywords": [], 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "loader-utils": "^3.2.1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /website/src/components/ApiLink.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Link from '@docusaurus/Link'; 3 | // eslint-disable-next-line import/no-extraneous-dependencies 4 | import { useDocsVersion } from '@docusaurus/theme-common/internal'; 5 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 6 | 7 | // const pkg = require('../../../packages/crawlee/package.json'); 8 | // 9 | // const [v1, v2] = pkg.version.split('.'); 10 | // const stable = [v1, v2].join('.'); 11 | 12 | const ApiLink = ({ to, children }) => { 13 | return ( 14 | {children} 15 | ); 16 | 17 | // const version = useDocsVersion(); 18 | // const { siteConfig } = useDocusaurusContext(); 19 | // 20 | // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) { 21 | // if (siteConfig.presets[0][1].docs.disableVersioning) { 22 | // return ( 23 | // {children} 24 | // ); 25 | // } 26 | // 27 | // return ( 28 | // {children} 29 | // ); 30 | }; 31 | 32 | export default ApiLink; 33 | -------------------------------------------------------------------------------- /website/src/components/Button.jsx: -------------------------------------------------------------------------------- 1 | import Link from '@docusaurus/Link'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | import styles from './Button.module.css'; 6 | import CrawleeSvg from '../../static/img/crawlee-logo-monocolor.svg'; 7 | 8 | export default function Button({ children, to, withIcon, type = 'primary', className, isBig }) { 9 | return ( 10 | 11 | 18 
| {withIcon && } 19 | {children} 20 | 21 | 22 | ); 23 | } 24 | -------------------------------------------------------------------------------- /website/src/components/CopyButton.module.css: -------------------------------------------------------------------------------- 1 | .copyButton { 2 | all: unset; 3 | display: inline-flex; 4 | align-items: center; 5 | justify-content: center; 6 | box-sizing: border-box; 7 | cursor: pointer; 8 | fill: var(--color-icon); 9 | 10 | svg { 11 | flex-shrink: 0; 12 | } 13 | } 14 | 15 | .copyButtonDefault { 16 | width: 28px; 17 | height: 28px; 18 | background-color: var(--color-background-muted); 19 | border: 1px solid var(--color-border); 20 | border-radius: 6px; 21 | transition: background-color 0.12s ease-out; 22 | 23 | &:hover { 24 | background-color: var(--color-hover); 25 | } 26 | 27 | svg { 28 | padding: 1px; 29 | } 30 | } 31 | 32 | .copyButtonCompact { 33 | svg { 34 | width: 16px; 35 | height: 16px; 36 | } 37 | } -------------------------------------------------------------------------------- /website/src/components/Gradients.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | export default function Gradients() { 4 | return ( 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | ); 20 | } 21 | -------------------------------------------------------------------------------- /website/src/components/Highlights.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | width: 100%; 5 | font-size: 18px; 6 | line-height: 32px; 7 | color: #41465d; 8 | } 9 | 10 | html[data-theme="dark"] .features { 11 | color: #b3b8d2; 12 | } 13 | 14 | .feature svg { 15 | height: 60px; 16 | width: 60px; 17 | } 18 | 19 | .features svg path:nth-child(1) { 20 | fill: url(#gradient-1) !important; 21 | } 22 | 23 | .features svg path:nth-child(n + 1) { 24 | fill: url(#gradient-2) !important; 25 | } 26 | 27 | html[data-theme="dark"] .featureIcon { 28 | background: #272c3d; 29 | } 30 | 31 | .featureIcon { 32 | display: flex; 33 | justify-content: center; 34 | align-items: center; 35 | margin-bottom: 24px; 36 | border-radius: 8px; 37 | background-color: #f2f3fb; 38 | width: 48px; 39 | height: 48px; 40 | } 41 | 42 | .features h3 { 43 | font-weight: 700; 44 | font-size: 18px; 45 | line-height: 32px; 46 | } 47 | -------------------------------------------------------------------------------- /website/src/components/Homepage/HomepageCliExample.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | import CopyButton from '../CopyButton'; 4 | import styles from './HomepageCliExample.module.css'; 5 | 6 | const cliCommand = `pipx run 'crawlee[cli]' create my-crawler`; 7 | 8 | export default function CliExample() { 9 | return ( 10 |
11 |
12 | Or start with a template from our CLI 13 |
14 | 15 |
16 |                     $
17 |                     {cliCommand}
18 |                     
19 |                 
20 |
21 |
22 | Built with 🤍 by Apify. Forever free and open-source. 23 |
24 |
25 | ); 26 | } 27 | -------------------------------------------------------------------------------- /website/src/components/Homepage/HomepageHeroSection.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | import styles from './HomepageHeroSection.module.css'; 4 | import homepageStyles from '../../pages/index.module.css'; 5 | 6 | export default function HomepageHeroSection() { 7 | return ( 8 |
9 |

10 | Build reliable web scrapers. Fast. 11 |

12 |
16 |

17 | Crawlee is a web scraping library for JavaScript and Python. It 18 | handles blocking, crawling, proxies, and browsers for you. 19 |

20 |
24 |
28 |
29 |
30 | ); 31 | } 32 | -------------------------------------------------------------------------------- /website/src/components/Homepage/LanguageSwitch.module.css: -------------------------------------------------------------------------------- 1 | .languageSwitch { 2 | z-index: 1; 3 | display: inline-flex; 4 | position: relative; 5 | background-color: var(--color-background-subtle); 6 | border-radius: 6px; 7 | padding: 4px; 8 | } 9 | 10 | .switchOption { 11 | position: relative; 12 | z-index: 1; 13 | padding: 6px 16px; 14 | font-size: 14px; 15 | font-weight: 500; 16 | color: var(--color-text-muted); 17 | background: none; 18 | border: none; 19 | cursor: pointer; 20 | transition: color 0.3s ease; 21 | } 22 | 23 | .switchOption:hover { 24 | color: var(--color-text); 25 | } 26 | 27 | .switchOption.active { 28 | color: var(--color-text); 29 | } 30 | 31 | .switchBackground { 32 | position: absolute; 33 | top: 4px; 34 | bottom: 4px; 35 | left: 0; 36 | border-radius: 6px; 37 | background-color: var(--color-background); 38 | transition: 39 | transform 0.3s ease, 40 | width 0.3s ease; 41 | } 42 | -------------------------------------------------------------------------------- /website/src/components/Homepage/RiverSection.jsx: -------------------------------------------------------------------------------- 1 | import Link from '@docusaurus/Link'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | import styles from './RiverSection.module.css'; 6 | 7 | export default function RiverSection({ title, description, content, reversed, to }) { 8 | return ( 9 |
10 |
11 |
12 |

{title}

13 |

{description}

14 | 15 | Learn more 16 | 17 |
18 |
{content}
19 |
20 |
21 | ); 22 | } 23 | -------------------------------------------------------------------------------- /website/src/components/RunnableCodeBlock.module.css: -------------------------------------------------------------------------------- 1 | .button { 2 | display: inline-block; 3 | padding: 3px 10px; 4 | position: absolute; 5 | top: calc(var(--ifm-pre-padding) / 2); 6 | right: 9px; 7 | z-index: 1; 8 | font-size: 16px; 9 | line-height: 28px; 10 | background: var(--prism-background-color); 11 | color: var(--prism-color); 12 | border: 1px solid var(--ifm-color-emphasis-300); 13 | border-radius: var(--ifm-global-radius); 14 | opacity: 0.7; 15 | font-weight: 600; 16 | width: 155px; 17 | } 18 | 19 | @media screen and (max-width: 768px) { 20 | .button { 21 | display: none; 22 | } 23 | } 24 | 25 | .button svg { 26 | height: 20px; 27 | position: absolute; 28 | top: 7.5px; 29 | right: 0; 30 | } 31 | 32 | .button:hover { 33 | opacity: 1; 34 | color: var(--prism-color); 35 | } 36 | 37 | .container { 38 | position: relative; 39 | } 40 | -------------------------------------------------------------------------------- /website/src/theme/ColorModeToggle/light-mode-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/theme/ColorModeToggle/styles.module.css: -------------------------------------------------------------------------------- 1 | .toggleButton { 2 | padding: 4px; 3 | display: flex; 4 | gap: 4px; 5 | align-items: center; 6 | transition: all var(--ifm-transition-fast); 7 | position: relative; 8 | border-radius: 150px; 9 | background-color: var(--color-background-subtle); 10 | } 11 | 12 | .toggleButton span { 13 | width: 44px; 14 | height: 36px; 15 | border-radius: 50%; 16 | background: #fff; 17 | position: absolute; 18 | transition: all var(--ifm-transition-fast); 19 | left: 0; 20 | margin: 4px; 21 | 22 | border-radius: 150px; 23 | background-color: var(--color-background); 24 | 25 | /* Light/L1 */ 26 | box-shadow: 27 | 0px 0.5px 1.5px 0px rgba(63, 71, 93, 0.15), 28 | 0.4px 0.8px 1px -1.2px rgba(63, 71, 93, 0.14), 29 | 1px 2px 2.5px -2.5px rgba(63, 71, 93, 0.13); 30 | } 31 | 32 | .toggleButton svg { 33 | z-index: 1; 34 | margin: 8px 12px; 35 | width: 20px; 36 | height: 20px; 37 | path { 38 | stroke: var(--color-icon); 39 | } 40 | } 41 | 42 | [data-theme='dark'] .toggleButton span { 43 | left: 48px; 44 | } 45 | 46 | .toggleButtonDisabled { 47 | cursor: not-allowed; 48 | } 49 | -------------------------------------------------------------------------------- /website/src/theme/DocItem/Layout/styles.module.css: -------------------------------------------------------------------------------- 1 | .docItemContainer { 2 | margin-bottom: 50px; 3 | } 4 | 5 | .docItemContainer header + *, 6 | .docItemContainer article > *:first-child { 7 | margin-top: 0; 8 | } 9 | 10 | @media (min-width: 997px) { 11 | .docItemCol { 12 | max-width: 75% !important; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /website/src/theme/Footer/LinkItem/index.js: -------------------------------------------------------------------------------- 1 | import isInternalUrl from '@docusaurus/isInternalUrl'; 2 | import Link from '@docusaurus/Link'; 3 | import useBaseUrl from '@docusaurus/useBaseUrl'; 4 | import clsx from 'clsx'; 5 | import React from 'react'; 6 | 7 | import styles from './index.module.css'; 8 | 9 | export default function 
FooterLinkItem({ item }) { 10 | const ExternalLinkIcon = require('../../../../static/img/external-link.svg').default; 11 | 12 | const { to, href, label, prependBaseUrlToHref, className, ...props } = item; 13 | const toUrl = useBaseUrl(to); 14 | const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true }); 15 | 16 | return ( 17 | 27 | {label} 28 | {href && !isInternalUrl(href) && } 29 | 30 | ); 31 | } 32 | -------------------------------------------------------------------------------- /website/src/theme/Footer/LinkItem/index.module.css: -------------------------------------------------------------------------------- 1 | .footerLink { 2 | color: var(--color-text); 3 | cursor: pointer; 4 | font-size: 14px; 5 | line-height: 20px; 6 | &:hover { 7 | color: var(--color-text-subtle); 8 | path { 9 | fill: var(--color-text-subtle); 10 | } 11 | } 12 | } 13 | 14 | .externalLinkIcon { 15 | margin-left: 5px; 16 | path { 17 | fill: var(--color-text); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /website/src/theme/MDXComponents/A.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable react/prop-types */ 2 | import Link from '@docusaurus/Link'; 3 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 4 | import React from 'react'; 5 | 6 | export default function MDXA(props) { 7 | const { siteConfig } = useDocusaurusContext(); 8 | if (props.href?.startsWith(siteConfig.url)) { 9 | const { href, ...rest } = props; 10 | rest.to = props.href.replace(siteConfig.url + siteConfig.baseUrl, '/'); 11 | props = rest; 12 | } 13 | 14 | return ; 15 | } 16 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/Content/styles.module.css: -------------------------------------------------------------------------------- 1 | .navbarItems { 2 | display: flex; 3 | align-items: center; 4 | margin-inline: auto; 5 | gap: 16px; 6 | } 7 | 8 | .navbarItems__leftMargin { 9 | margin-left: 40px; 10 | } 11 | 12 | .getStartedButton { 13 | color: var(--color-text-on-primary); 14 | background: var(--color-black-action); 15 | border-radius: 8px; 16 | font-size: 16px; 17 | font-weight: 500; 18 | line-height: 24px; 19 | padding: 8px 16px !important; 20 | border: none; 21 | transition: background-color 0.2s; 22 | 23 | &:hover { 24 | color: var(--color-text-on-primary); 25 | background-color: var(--color-primary-action-hover); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/Header/index.module.css: -------------------------------------------------------------------------------- 1 | .getStartedButton { 2 | color: var(--color-text-on-primary); 3 | background: var(--color-black-action); 4 | border-radius: 8px; 5 | font-size: 16px; 6 | font-weight: 500; 7 | line-height: 24px; 8 | padding: 8px 16px !important; 9 | border: none; 10 | &:hover { 11 | color: var(--color-text-on-primary); 12 | } 13 | text-align: center; 14 | } 15 | 16 | .navbarHeader { 17 | display: flex; 18 | width: 100%; 19 | align-items: center; 20 | justify-content: space-between; 21 | padding: 16px; 22 | 23 | @media (min-width: 768px) { 24 | padding: 20px 40px; 25 | } 26 | @media (min-width: 1024px) { 27 | padding: 20px 64px; 28 | } 29 | } 30 | 31 | .navbarButtonsWrapper { 32 | display: flex; 33 | gap: 16px; 34 | margin-left: auto; 35 | } 36 | 37 | .navbarButtonsWrapperDesktop { 38 | display: flex; 39 | @media 
(max-width: 767px) { 40 | display: none; 41 | } 42 | } 43 | .navbarButtonsWrapperMobile { 44 | border-top: 1px solid var(--color-separator); 45 | display: none; 46 | @media (max-width: 767px) { 47 | display: flex; 48 | } 49 | width: 100%; 50 | margin: 0; 51 | flex-direction: column; 52 | gap: 16px; 53 | button { 54 | width: 100%; 55 | } 56 | padding: 16px 24px; 57 | } 58 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/Layout/index.js: -------------------------------------------------------------------------------- 1 | import { useNavbarSecondaryMenu } from '@docusaurus/theme-common/internal'; 2 | import clsx from 'clsx'; 3 | import React from 'react'; 4 | 5 | export default function NavbarMobileSidebarLayout({ 6 | header, 7 | primaryMenu, 8 | secondaryMenu, 9 | }) { 10 | const { shown: secondaryMenuShown } = useNavbarSecondaryMenu(); 11 | return ( 12 |
13 | {header} 14 |
18 |
{primaryMenu}
19 |
{secondaryMenu}
20 |
21 |
22 | ); 23 | } 24 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js: -------------------------------------------------------------------------------- 1 | import { useThemeConfig } from '@docusaurus/theme-common'; 2 | import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal'; 3 | import NavbarItem from '@theme/NavbarItem'; 4 | import React from 'react'; 5 | 6 | function useNavbarItems() { 7 | return useThemeConfig().navbar.items; 8 | } 9 | // The primary menu displays the navbar items 10 | export default function NavbarMobilePrimaryMenu() { 11 | const mobileSidebar = useNavbarMobileSidebar(); 12 | const items = useNavbarItems(); 13 | 14 | return ( 15 |
    16 | {items.map((item, i) => ( 17 | mobileSidebar.toggle()} 21 | key={i} 22 | /> 23 | ))} 24 |
25 | ); 26 | } 27 | -------------------------------------------------------------------------------- /website/src/theme/Navbar/MobileSidebar/index.js: -------------------------------------------------------------------------------- 1 | import { 2 | useLockBodyScroll, 3 | useNavbarMobileSidebar, 4 | useWindowSize, 5 | } from '@docusaurus/theme-common/internal'; 6 | import NavbarMobileSidebarHeader from '@theme/Navbar/MobileSidebar/Header'; 7 | import NavbarMobileSidebarLayout from '@theme/Navbar/MobileSidebar/Layout'; 8 | import NavbarMobileSidebarPrimaryMenu from '@theme/Navbar/MobileSidebar/PrimaryMenu'; 9 | import NavbarMobileSidebarSecondaryMenu from '@theme/Navbar/MobileSidebar/SecondaryMenu'; 10 | import React from 'react'; 11 | 12 | export default function NavbarMobileSidebar() { 13 | const mobileSidebar = useNavbarMobileSidebar(); 14 | const windowSize = useWindowSize({ 15 | desktopBreakpoint: 1200, 16 | }); 17 | 18 | useLockBodyScroll(mobileSidebar.shown); 19 | const shouldRender = !mobileSidebar.disabled && windowSize === 'mobile'; 20 | if (!shouldRender) { 21 | return null; 22 | } 23 | return ( 24 | } 26 | primaryMenu={} 27 | secondaryMenu={} 28 | /> 29 | ); 30 | } 31 | -------------------------------------------------------------------------------- /website/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/.nojekyll -------------------------------------------------------------------------------- /website/static/font/lota.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/font/lota.woff -------------------------------------------------------------------------------- /website/static/font/lota.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/font/lota.woff2 -------------------------------------------------------------------------------- /website/static/img/API.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/API.png -------------------------------------------------------------------------------- /website/static/img/apify_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/apify_og_SDK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/apify_og_SDK.png -------------------------------------------------------------------------------- /website/static/img/arrow_right.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 8 | -------------------------------------------------------------------------------- /website/static/img/auto-scaling-dark.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/auto-scaling-dark.webp -------------------------------------------------------------------------------- /website/static/img/auto-scaling-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/auto-scaling-light.webp -------------------------------------------------------------------------------- /website/static/img/check.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/chrome-scrape-dark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/chrome-scrape-dark.gif -------------------------------------------------------------------------------- /website/static/img/chrome-scrape-light.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/chrome-scrape-light.gif -------------------------------------------------------------------------------- /website/static/img/cloud_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/static/img/community-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /website/static/img/community-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /website/static/img/crawlee-logo-monocolor.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /website/static/img/crawlee-python-og.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/crawlee-python-og.png -------------------------------------------------------------------------------- /website/static/img/defaults-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /website/static/img/defaults-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /website/static/img/discord-brand-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- 
/website/static/img/discord-brand.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/external-link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favicon.ico -------------------------------------------------------------------------------- /website/static/img/favorite-tools-dark.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favorite-tools-dark.webp -------------------------------------------------------------------------------- /website/static/img/favorite-tools-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/favorite-tools-light.webp -------------------------------------------------------------------------------- /website/static/img/features/automate-everything.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 9 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/static/img/features/node-requests.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /website/static/img/features/storage.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /website/static/img/features/works-everywhere.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/00.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/01.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/02.jpg -------------------------------------------------------------------------------- /website/static/img/fill-and-submit-web-form/03.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/fill-and-submit-web-form/03.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/current-price.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/current-price.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/scraping-practice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/scraping-practice.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/select-an-element.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/select-an-element.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/selected-element.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/selected-element.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/sku.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/sku.jpg -------------------------------------------------------------------------------- /website/static/img/getting-started/title.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/getting-started/title.jpg -------------------------------------------------------------------------------- /website/static/img/hearth copy.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/hearth.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/static/img/javascript_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/static/img/js_file.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /website/static/img/logo-blur.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/logo-blur.png -------------------------------------------------------------------------------- /website/static/img/menu-arrows.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/img/oss_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/oss_logo.png -------------------------------------------------------------------------------- /website/static/img/puppeteer-live-view-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/puppeteer-live-view-dashboard.png -------------------------------------------------------------------------------- /website/static/img/puppeteer-live-view-detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/puppeteer-live-view-detail.png -------------------------------------------------------------------------------- /website/static/img/resuming-paused-crawl/00.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/resuming-paused-crawl/00.webp -------------------------------------------------------------------------------- /website/static/img/resuming-paused-crawl/01.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/resuming-paused-crawl/01.webp -------------------------------------------------------------------------------- /website/static/img/robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/robot.png -------------------------------------------------------------------------------- /website/static/img/routing-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /website/static/img/routing-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /website/static/img/scraping-utils-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/scraping-utils-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/static/img/smart-proxy-dark.webp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/smart-proxy-dark.webp -------------------------------------------------------------------------------- /website/static/img/smart-proxy-light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/smart-proxy-light.webp -------------------------------------------------------------------------------- /website/static/img/source_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/static/img/source_code.png -------------------------------------------------------------------------------- /website/static/img/system.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /website/static/img/workflow.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /website/static/img/zero-setup-dark-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/img/zero-setup-light-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Sitemap: https://crawlee.dev/python/sitemap.xml 3 | -------------------------------------------------------------------------------- /website/tools/docs-prettier.config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @type {import('prettier').Options} 3 | */ 4 | module.exports = { 5 | parser: 'markdown', 6 | arrowParens: 'avoid', 7 | trailingComma: 'all', 8 | singleQuote: true, 9 | tabWidth: 4, 10 | printWidth: 150, 11 | proseWrap: 'always', 12 | }; 13 | -------------------------------------------------------------------------------- /website/tools/utils/externalLink.js: -------------------------------------------------------------------------------- 1 | const { parse } = require('url'); 2 | 3 | const visit = import('unist-util-visit').then((m) => m.visit); 4 | 5 | const internalUrls = ['crawlee.dev']; 6 | 7 | /** 8 | * @param {import('url').UrlWithStringQuery} href 9 | */ 10 | function isInternal(href) { 11 | return internalUrls.some( 12 | (internalUrl) => href.host === internalUrl 13 | || (!href.protocol && !href.host && (href.pathname || href.hash)), 14 | ); 15 | } 16 | 17 | /** 18 | * @type {import('unified').Plugin} 19 | */ 20 | exports.externalLinkProcessor = () => { 21 | return async (tree) => { 22 | (await visit)(tree, 'element', (node) => { 23 | if ( 24 | node.tagName === 'a' 25 | && node.properties 26 | && typeof node.properties.href === 'string' 27 | ) { 28 | const href = 
parse(node.properties.href); 29 | 30 | if (!isInternal(href)) { 31 | node.properties.target = '_blank'; 32 | node.properties.rel = 'noopener'; 33 | } else { 34 | node.properties.target = null; 35 | node.properties.rel = null; 36 | } 37 | } 38 | }); 39 | }; 40 | }; 41 | -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-dark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-dark.gif -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-dark.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-dark.mp4 -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-light.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-light.gif -------------------------------------------------------------------------------- /website/tools/website_gif/chrome-scrape-light.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/crawlee-python/415717a8d3d46a0f609ed1b94f461289ef1de1d7/website/tools/website_gif/chrome-scrape-light.mp4 -------------------------------------------------------------------------------- /website/tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "jsx": "preserve" 5 | }, 6 | "include": [ 7 | "src/**/*.js", 8 | "src/**/*.ts", 9 | "src/**/*.jsx", 10 | "src/**/*.tsx" 11 | ] 12 | } 13 | --------------------------------------------------------------------------------
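
Note on website/tools/utils/externalLink.js (listed above): externalLinkProcessor is a rehype-style plugin that walks the rendered HTML tree and adds target="_blank" / rel="noopener" to off-site anchors. The Docusaurus config that registers it is not part of this listing, so the snippet below is only a minimal sketch assuming the standard rehypePlugins option of the docs preset; the file path and export name come from the listing, everything else is illustrative.

    // docusaurus.config.js (illustrative sketch only; the real config is not included in this listing)
    const { externalLinkProcessor } = require('./tools/utils/externalLink');

    module.exports = {
        presets: [
            [
                '@docusaurus/preset-classic',
                {
                    docs: {
                        // Rehype plugins run over the generated HTML (hast) tree,
                        // which is where the 'element' nodes visited by the plugin live.
                        rehypePlugins: [externalLinkProcessor],
                    },
                },
            ],
        ],
    };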