├── .gitignore ├── CHANGELOG.txt ├── LICENSE.txt ├── README.md ├── classes └── phpcrawler.class.php ├── composer.json ├── documentation ├── about.html ├── classreferences │ ├── PHPCrawler │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_PHPCrawler.htm │ │ ├── method_detail_tpl_method_addBasicAuthentication.htm │ │ ├── method_detail_tpl_method_addContentTypeReceiveRule.htm │ │ ├── method_detail_tpl_method_addFollowMatch.htm │ │ ├── method_detail_tpl_method_addLinkExtractionTags.htm │ │ ├── method_detail_tpl_method_addLinkPriority.htm │ │ ├── method_detail_tpl_method_addLinkSearchContentType.htm │ │ ├── method_detail_tpl_method_addNonFollowMatch.htm │ │ ├── method_detail_tpl_method_addPostData.htm │ │ ├── method_detail_tpl_method_addReceiveContentType.htm │ │ ├── method_detail_tpl_method_addReceiveToMemoryMatch.htm │ │ ├── method_detail_tpl_method_addReceiveToTmpFileMatch.htm │ │ ├── method_detail_tpl_method_addStreamToFileContentType.htm │ │ ├── method_detail_tpl_method_addURLFilterRule.htm │ │ ├── method_detail_tpl_method_addURLFollowRule.htm │ │ ├── method_detail_tpl_method_disableExtendedLinkInfo.htm │ │ ├── method_detail_tpl_method_enableAggressiveLinkSearch.htm │ │ ├── method_detail_tpl_method_enableCookieHandling.htm │ │ ├── method_detail_tpl_method_enableResumption.htm │ │ ├── method_detail_tpl_method_excludeLinkSearchDocumentSections.htm │ │ ├── method_detail_tpl_method_getCrawlerId.htm │ │ ├── method_detail_tpl_method_getProcessReport.htm │ │ ├── method_detail_tpl_method_getReport.htm │ │ ├── method_detail_tpl_method_go.htm │ │ ├── method_detail_tpl_method_goMultiProcessed.htm │ │ ├── method_detail_tpl_method_handleDocumentInfo.htm │ │ ├── method_detail_tpl_method_handleHeaderInfo.htm │ │ ├── method_detail_tpl_method_handlePageData.htm │ │ ├── method_detail_tpl_method_initChildProcess.htm │ │ ├── method_detail_tpl_method_obeyNoFollowTags.htm │ │ ├── method_detail_tpl_method_obeyRobotsTxt.htm │ │ ├── method_detail_tpl_method_requestGzipContent.htm │ │ ├── method_detail_tpl_method_resume.htm │ │ ├── method_detail_tpl_method_setAggressiveLinkExtraction.htm │ │ ├── method_detail_tpl_method_setConnectionTimeout.htm │ │ ├── method_detail_tpl_method_setContentSizeLimit.htm │ │ ├── method_detail_tpl_method_setCookieHandling.htm │ │ ├── method_detail_tpl_method_setCrawlingDepthLimit.htm │ │ ├── method_detail_tpl_method_setFollowMode.htm │ │ ├── method_detail_tpl_method_setFollowRedirects.htm │ │ ├── method_detail_tpl_method_setFollowRedirectsTillContent.htm │ │ ├── method_detail_tpl_method_setHTTPProtocolVersion.htm │ │ ├── method_detail_tpl_method_setLinkExtractionTags.htm │ │ ├── method_detail_tpl_method_setPageLimit.htm │ │ ├── method_detail_tpl_method_setPort.htm │ │ ├── method_detail_tpl_method_setProxy.htm │ │ ├── method_detail_tpl_method_setRequestDelay.htm │ │ ├── method_detail_tpl_method_setRequestLimit.htm │ │ ├── method_detail_tpl_method_setStreamTimeout.htm │ │ ├── method_detail_tpl_method_setTmpFile.htm │ │ ├── method_detail_tpl_method_setTrafficLimit.htm │ │ ├── method_detail_tpl_method_setURL.htm │ │ ├── method_detail_tpl_method_setUrlCacheType.htm │ │ ├── method_detail_tpl_method_setUserAgentString.htm │ │ ├── method_detail_tpl_method_setWorkingDirectory.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_class_version.htm │ │ └── style.css │ ├── PHPCrawlerAbortReasons │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_ABORTREASON_FILELIMIT_REACHED.htm │ │ ├── constant_detail_tpl_constant_ABORTREASON_PASSEDTHROUGH.htm │ │ ├── constant_detail_tpl_constant_ABORTREASON_TRAFFICLIMIT_REACHED.htm │ │ ├── constant_detail_tpl_constant_ABORTREASON_USERABORT.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── PHPCrawlerCookieDescriptor │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_PHPCrawlerCookieDescriptor.htm │ │ ├── method_detail_tpl_method_getFromHeaderLine.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_cookie_send_time.htm │ │ ├── property_detail_tpl_property_domain.htm │ │ ├── property_detail_tpl_property_expire_timestamp.htm │ │ ├── property_detail_tpl_property_expires.htm │ │ ├── property_detail_tpl_property_name.htm │ │ ├── property_detail_tpl_property_path.htm │ │ ├── property_detail_tpl_property_source_domain.htm │ │ ├── property_detail_tpl_property_source_url.htm │ │ ├── property_detail_tpl_property_value.htm │ │ └── style.css │ ├── PHPCrawlerDocumentInfo │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_setLinksFoundArray.htm │ │ ├── method_detail_tpl_method_toArray.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_benchmarks.htm │ │ ├── property_detail_tpl_property_bytes_received.htm │ │ ├── property_detail_tpl_property_content.htm │ │ ├── property_detail_tpl_property_content_tmp_file.htm │ │ ├── property_detail_tpl_property_content_type.htm │ │ ├── property_detail_tpl_property_cookies.htm │ │ ├── property_detail_tpl_property_data_transfer_rate.htm │ │ ├── property_detail_tpl_property_data_transfer_time.htm │ │ ├── property_detail_tpl_property_error_code.htm │ │ ├── property_detail_tpl_property_error_occured.htm │ │ ├── property_detail_tpl_property_error_string.htm │ │ ├── property_detail_tpl_property_file.htm │ │ ├── property_detail_tpl_property_header.htm │ │ ├── property_detail_tpl_property_header_bytes_received.htm │ │ ├── property_detail_tpl_property_header_send.htm │ │ ├── property_detail_tpl_property_host.htm │ │ ├── property_detail_tpl_property_http_status_code.htm │ │ ├── property_detail_tpl_property_links_found.htm │ │ ├── property_detail_tpl_property_links_found_url_descriptors.htm │ │ ├── property_detail_tpl_property_meta_attributes.htm │ │ ├── property_detail_tpl_property_path.htm │ │ ├── property_detail_tpl_property_port.htm │ │ ├── property_detail_tpl_property_protocol.htm │ │ ├── property_detail_tpl_property_query.htm │ │ ├── property_detail_tpl_property_received.htm │ │ ├── property_detail_tpl_property_received_completely.htm │ │ ├── property_detail_tpl_property_received_completly.htm │ │ ├── property_detail_tpl_property_received_to_file.htm │ │ ├── property_detail_tpl_property_received_to_memory.htm │ │ ├── property_detail_tpl_property_referer_url.htm │ │ ├── property_detail_tpl_property_refering_link_raw.htm │ │ ├── property_detail_tpl_property_refering_linkcode.htm │ │ ├── property_detail_tpl_property_refering_linktext.htm │ │ ├── property_detail_tpl_property_responseHeader.htm │ │ ├── property_detail_tpl_property_server_connect_time.htm │ │ ├── property_detail_tpl_property_server_response_time.htm │ │ ├── property_detail_tpl_property_source.htm │ │ ├── property_detail_tpl_property_traffic_limit_reached.htm │ │ ├── property_detail_tpl_property_unbuffered_bytes_read.htm │ │ ├── property_detail_tpl_property_url.htm │ │ ├── property_detail_tpl_property_url_link_depth.htm │ │ └── style.css │ ├── PHPCrawlerHTTPProtocols │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_HTTP_1_0.htm │ │ ├── constant_detail_tpl_constant_HTTP_1_1.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── PHPCrawlerLinkSearchDocumentSections │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_ALL_SPECIAL_SECTIONS.htm │ │ ├── constant_detail_tpl_constant_HTML_COMMENT_SECTIONS.htm │ │ ├── constant_detail_tpl_constant_JS_TRIGGERING_SECTIONS.htm │ │ ├── constant_detail_tpl_constant_SCRIPT_SECTIONS.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── PHPCrawlerMultiProcessModes │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_MPMODE_CHILDS_EXECUTES_USERCODE.htm │ │ ├── constant_detail_tpl_constant_MPMODE_NONE.htm │ │ ├── constant_detail_tpl_constant_MPMODE_PARENT_EXECUTES_USERCODE.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── PHPCrawlerProcessReport │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_toArray.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_abort_reason.htm │ │ ├── property_detail_tpl_property_avg_proc_data_transfer_rate.htm │ │ ├── property_detail_tpl_property_avg_server_connect_time.htm │ │ ├── property_detail_tpl_property_avg_server_response_time.htm │ │ ├── property_detail_tpl_property_bytes_received.htm │ │ ├── property_detail_tpl_property_data_throughput.htm │ │ ├── property_detail_tpl_property_file_limit_reached.htm │ │ ├── property_detail_tpl_property_files_received.htm │ │ ├── property_detail_tpl_property_links_followed.htm │ │ ├── property_detail_tpl_property_memory_peak_usage.htm │ │ ├── property_detail_tpl_property_process_runtime.htm │ │ ├── property_detail_tpl_property_traffic_limit_reached.htm │ │ ├── property_detail_tpl_property_user_abort.htm │ │ └── style.css │ ├── PHPCrawlerRequestErrors │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_ERROR_HOST_UNREACHABLE.htm │ │ ├── constant_detail_tpl_constant_ERROR_NO_HTTP_HEADER.htm │ │ ├── constant_detail_tpl_constant_ERROR_PROXY_UNREACHABLE.htm │ │ ├── constant_detail_tpl_constant_ERROR_SOCKET_TIMEOUT.htm │ │ ├── constant_detail_tpl_constant_ERROR_SSL_NOT_SUPPORTED.htm │ │ ├── constant_detail_tpl_constant_ERROR_TMP_FILE_NOT_WRITEABLE.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── PHPCrawlerResponseHeader │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_PHPCrawlerResponseHeader.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_content_encoding.htm │ │ ├── property_detail_tpl_property_content_length.htm │ │ ├── property_detail_tpl_property_content_type.htm │ │ ├── property_detail_tpl_property_cookies.htm │ │ ├── property_detail_tpl_property_header_raw.htm │ │ ├── property_detail_tpl_property_http_status_code.htm │ │ ├── property_detail_tpl_property_source_url.htm │ │ ├── property_detail_tpl_property_transfer_encoding.htm │ │ └── style.css │ ├── PHPCrawlerURLDescriptor │ │ ├── collapse.gif │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── method_detail_tpl_method_PHPCrawlerURLDescriptor.htm │ │ ├── overview.html │ │ ├── property_detail_tpl_property_is_redirect_url.htm │ │ ├── property_detail_tpl_property_link_raw.htm │ │ ├── property_detail_tpl_property_linkcode.htm │ │ ├── property_detail_tpl_property_linktext.htm │ │ ├── property_detail_tpl_property_refering_url.htm │ │ ├── property_detail_tpl_property_url_link_depth.htm │ │ ├── property_detail_tpl_property_url_rebuild.htm │ │ └── style.css │ ├── PHPCrawlerUrlCacheTypes │ │ ├── collapse.gif │ │ ├── constant_detail_tpl_constant_URLCACHE_MEMORY.htm │ │ ├── constant_detail_tpl_constant_URLCACHE_SQLITE.htm │ │ ├── expand.gif │ │ ├── google_code.php │ │ ├── index.html │ │ ├── overview.html │ │ └── style.css │ ├── collapse.gif │ ├── expand.gif │ ├── google_code.php │ ├── index.html │ ├── print_googlead_div.js │ ├── project_overview.htm │ └── style.css ├── example.html ├── example2.html ├── faq.html ├── google_code.php ├── index.html ├── menu.html ├── multiprocesses.html ├── multiprocessing_modes.html ├── quickstart.html ├── requirements.html ├── resume_aborted_processes.html ├── spidering_huge_websites.html ├── style.css ├── testinterface.html ├── testinterface.jpg └── versionhistory.html ├── example.php ├── libs ├── CookieCache │ ├── PHPCrawlerCookieCacheBase.class.php │ ├── PHPCrawlerMemoryCookieCache.class.php │ └── PHPCrawlerSQLiteCookieCache.class.php ├── Enums │ ├── PHPCrawlerAbortReasons.class.php │ ├── PHPCrawlerHTTPProtocols.class.php │ ├── PHPCrawlerLinkSearchDocumentSections.class.php │ ├── PHPCrawlerMultiProcessModes.class.php │ ├── PHPCrawlerRequestErrors.class.php │ └── PHPCrawlerUrlCacheTypes.class.php ├── PHPCrawler.class.php ├── PHPCrawlerBenchmark.class.php ├── PHPCrawlerCookieDescriptor.class.php ├── PHPCrawlerDNSCache.class.php ├── PHPCrawlerDocumentInfo.class.php ├── PHPCrawlerHTTPRequest.class.php ├── PHPCrawlerLinkFinder.class.php ├── PHPCrawlerProcessReport.class.php ├── PHPCrawlerResponseHeader.class.php ├── PHPCrawlerRobotsTxtParser.class.php ├── PHPCrawlerStatus.class.php ├── PHPCrawlerURLDescriptor.class.php ├── PHPCrawlerURLFilter.class.php ├── PHPCrawlerUrlPartsDescriptor.class.php ├── PHPCrawlerUserSendDataCache.class.php ├── ProcessCommunication │ ├── PHPCrawlerDocumentInfoQueue.class.php │ ├── PHPCrawlerProcessHandler.class.php │ └── PHPCrawlerStatusHandler.class.php ├── UrlCache │ ├── PHPCrawlerMemoryURLCache.class.php │ ├── PHPCrawlerSQLiteURLCache.class.php │ └── PHPCrawlerURLCacheBase.class.php └── Utils │ ├── PHPCrawlerEncodingUtils.class.php │ └── PHPCrawlerUtils.class.php ├── multiprocessing_example.php ├── resumable_example.php └── test_interface ├── index.php ├── info.gif ├── js.js ├── output.php ├── phpcrawl_testinterface.conf.php ├── phpcrawl_testinterface.func.php ├── phpcrawlsetup.class.php ├── setups └── Example_Setup.psf └── style.css /.gitignore: -------------------------------------------------------------------------------- 1 | libs/PHPCrawlerUtils.class.php 2 | vendor/ 3 | -------------------------------------------------------------------------------- /CHANGELOG.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmerian/phpcrawl/1c5e07ff33cf079c69191eb9540a3ced64d392dc/CHANGELOG.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # phpcrawl 2 | Copy of http://phpcrawl.cuab.de/ for using with composer 3 | 4 | [](https://packagist.org/packages/mmerian/phpcrawl) [](https://packagist.org/packages/mmerian/phpcrawl) [](https://packagist.org/packages/mmerian/phpcrawl) [](https://packagist.org/packages/mmerian/phpcrawl) 5 | -------------------------------------------------------------------------------- /classes/phpcrawler.class.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mmerian/phpcrawl", 3 | "description": "PHPCrawl is a webcrawler/webspider-library written in PHP. It supports filters, limiters, cookie-handling, robots.txt-handling, multiprocessing and much more.", 4 | "license": "GPL-v2", 5 | "authors": [ 6 | { 7 | "name": "Uwe Hunfeld", 8 | "email": "phpcrawl@cuab.de" 9 | } 10 | ], 11 | "autoload": { 12 | "classmap": [ 13 | "libs/Utils/PHPCrawlerUtils.class.php", 14 | "libs" 15 | ] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /documentation/classreferences/PHPCrawler/collapse.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmerian/phpcrawl/1c5e07ff33cf079c69191eb9540a3ced64d392dc/documentation/classreferences/PHPCrawler/collapse.gif -------------------------------------------------------------------------------- /documentation/classreferences/PHPCrawler/expand.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmerian/phpcrawl/1c5e07ff33cf079c69191eb9540a3ced64d392dc/documentation/classreferences/PHPCrawler/expand.gif -------------------------------------------------------------------------------- /documentation/classreferences/PHPCrawler/google_code.php: -------------------------------------------------------------------------------- 1 | 5 | --> 6 | 7 |
62 | 63 | public $class_version = "0.83rc1" 64 |
65 |70 |
No information |
80 | 81 | 82 | - 83 | 84 |
85 |