├── .github ├── dependabot.yml └── workflows │ └── php.yml ├── .gitignore ├── .phan └── config.php ├── .php-cs-fixer.dist.php ├── .run ├── Unit Tests.run.xml ├── [CHECK] Phan.run.xml ├── [CHECK] coverage-enforce 100.run.xml ├── [SAMPLE] example_basic_auth.php.run.xml └── [SAMPLE] example_complex.php.run.xml ├── LICENSE ├── README.md ├── bin ├── coverage-enforce ├── fix-style └── static-analysis ├── composer.json ├── docs ├── 500px-Graph.traversal.example.png ├── Makefile ├── _build │ ├── doctrees │ │ ├── environment.pickle │ │ └── index.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _sources │ │ └── index.txt │ │ ├── _static │ │ ├── ajax-loader.gif │ │ ├── basic.css │ │ ├── comment-bright.png │ │ ├── comment-close.png │ │ ├── comment.png │ │ ├── default.css │ │ ├── doctools.js │ │ ├── down-pressed.png │ │ ├── down.png │ │ ├── file.png │ │ ├── jquery.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── sidebar.js │ │ ├── underscore.js │ │ ├── up-pressed.png │ │ ├── up.png │ │ └── websupport.js │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── search.html │ │ └── searchindex.js ├── conf.py └── index.rst ├── example ├── cache │ └── .gitkeep ├── example_basic_auth.php ├── example_complex.php ├── example_complex_bootstrap.php ├── example_link_check.php ├── example_persistent_request_params.php ├── example_simple.php └── lib │ └── Example │ ├── GuzzleTimerMiddleware.php │ ├── LinkCheckRequestHandler.php │ ├── LogHandler.php │ └── StatsHandler.php ├── phpmd-tests.xml ├── phpmd.xml ├── phpunit.xml.dist ├── src ├── Discoverer │ ├── CrawlerDiscoverer.php │ ├── CssSelectorDiscoverer.php │ ├── Discoverer.php │ ├── DiscovererInterface.php │ ├── DiscovererSet.php │ └── XPathExpressionDiscoverer.php ├── Downloader │ ├── Downloader.php │ └── DownloaderInterface.php ├── Event │ ├── DispatcherTrait.php │ └── SpiderEvents.php ├── EventListener │ └── PolitenessPolicyListener.php ├── Exception │ └── 
MaxQueueSizeExceededException.php ├── Filter │ ├── PostFetchFilterInterface.php │ ├── Postfetch │ │ └── MimeTypeFilter.php │ ├── PreFetchFilterInterface.php │ └── Prefetch │ │ ├── AllowedHostsFilter.php │ │ ├── AllowedPortsFilter.php │ │ ├── AllowedSchemeFilter.php │ │ ├── ExtractRobotsTxtException.php │ │ ├── FetchRobotsTxtException.php │ │ ├── RestrictToBaseUriFilter.php │ │ ├── RobotsTxtDisallowFilter.php │ │ ├── UriFilter.php │ │ ├── UriWithHashFragmentFilter.php │ │ └── UriWithQueryStringFilter.php ├── FilterableInterface.php ├── PersistenceHandler │ ├── FilePersistenceHandler.php │ ├── FileSerializedResourcePersistenceHandler.php │ ├── MemoryPersistenceHandler.php │ └── PersistenceHandlerInterface.php ├── QueueManager │ ├── InMemoryQueueManager.php │ └── QueueManagerInterface.php ├── RequestHandler │ ├── GuzzleRequestHandler.php │ └── RequestHandlerInterface.php ├── Resource.php ├── Spider.php └── Uri │ └── DiscoveredUri.php └── tests ├── Discoverer ├── CssSelectorDiscovererTest.php ├── DiscovererSetTest.php ├── DiscovererTest.php ├── DiscovererTestCase.php ├── XpathExpressionDiscovererTest.php └── robots.txt ├── Downloader └── DownloaderTest.php ├── EventListener └── PolitenessPolicyListenerTest.php ├── Filter ├── Postfetch │ └── MimeTypeFilterTest.php └── Prefetch │ ├── AllowedHostsFilterTest.php │ ├── AllowedPortsFilterTest.php │ ├── AllowedSchemeFilterTest.php │ ├── RestrictToBaseUriFilterTest.php │ ├── RobotsTxtDisallowFilterTest.php │ ├── UriFilterTest.php │ ├── UriWithHashFragmentFilterTest.php │ ├── UriWithQueryStringFilterTest.php │ └── robots.txt ├── Fixtures ├── DownloaderTestHTMLResource.html ├── ResourceTestHTMLResource.html ├── SpiderTestHTMLResourceA.html ├── SpiderTestHTMLResourceB.html ├── SpiderTestHTMLResourceC.html ├── SpiderTestHTMLResourceD.html ├── SpiderTestHTMLResourceE.html ├── SpiderTestHTMLResourceF.html └── SpiderTestHTMLResourceG.html ├── PersistenceHandler ├── FileSerializedResourcePersistenceHandlerTest.php └── 
MemoryPersistenceHandlerTest.php ├── QueueManager └── InMemoryQueueManagerTest.php ├── RequestHandler └── GuzzleRequestHandlerTest.php ├── ResourceTest.php ├── SpiderTest.php ├── TestCase.php └── Uri └── DiscoveredUriTest.php /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "composer" # See documentation for possible values 5 | directory: "/" # Location of package manifests 6 | schedule: 7 | interval: "weekly" 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | # Check for updates to GitHub Actions every week 12 | interval: "weekly" 13 | -------------------------------------------------------------------------------- /.github/workflows/php.yml: -------------------------------------------------------------------------------- 1 | name: PHP-Spider 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: PHP Spider (PHP ${{ matrix.php-versions }} on ${{ matrix.operating-system }}) 12 | runs-on: ${{ matrix.operating-system }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | operating-system: [ ubuntu-latest ] 17 | php-versions: [ '8.0', '8.1', '8.2', '8.3' ] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Setup PHP, with composer and extensions 23 | uses: shivammathur/setup-php@v2 #https://github.com/shivammathur/setup-php 24 | with: 25 | php-version: ${{ matrix.php-versions }} 26 | extensions: ast 27 | coverage: xdebug 28 | 29 | - name: Get Composer Cache Directory 30 | id: composer-cache 31 | run: | 32 | echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT 33 | 34 | - name: Cache composer dependencies 35 | uses: actions/cache@v4 36 | with: 37 | path: ${{ steps.composer-cache.outputs.dir }} 38 | # Use composer.json for key, if composer.lock is not committed. 
39 | key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.json') }} 40 | restore-keys: ${{ runner.os }}-composer- 41 | 42 | - name: Validate composer.json and composer.lock 43 | run: composer validate 44 | 45 | - name: Install Composer dependencies 46 | run: composer install --no-progress --no-suggest --prefer-dist --optimize-autoloader --no-interaction 47 | 48 | - name: Run Tests 49 | id: tests 50 | run: bin/coverage-enforce 100 51 | 52 | - name: Run Static Analysis 53 | run: bin/static-analysis 54 | 55 | - name: Display Text Code Coverage 56 | if: ${{ failure() && steps.tests.conclusion == 'failure' }} # Only if tests fail 57 | run: cat build/coverage/coverage.txt 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | composer.lock 3 | vendor 4 | TODO.md 5 | build 6 | example/logs/*.log 7 | example/results 8 | *.doctrinecache.php 9 | .DS_Store 10 | cache.properties 11 | .tmp 12 | .phpunit.result.cache 13 | .php_cs.cache 14 | .php-cs-fixer.cache 15 | composer.phar 16 | -------------------------------------------------------------------------------- /.php-cs-fixer.dist.php: -------------------------------------------------------------------------------- 1 | in(__DIR__ . 
'/src/'); 5 | 6 | $fixer = new PhpCsFixer\Config(); 7 | return $fixer->setRules([ 8 | '@PSR2' => true 9 | ]) 10 | ->setFinder($finder); 11 | 12 | -------------------------------------------------------------------------------- /.run/Unit Tests.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.run/[CHECK] Phan.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.run/[CHECK] coverage-enforce 100.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.run/[SAMPLE] example_basic_auth.php.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.run/[SAMPLE] example_complex.php.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Matthijs van den Bos 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this 
permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Build Status](https://github.com/mvdbos/php-spider/workflows/PHP-Spider/badge.svg?branch=master) 2 | [![Latest Stable Version](https://poser.pugx.org/vdb/php-spider/v)](https://packagist.org/packages/vdb/php-spider) 3 | [![Total Downloads](https://poser.pugx.org/vdb/php-spider/downloads)](https://packagist.org/packages/vdb/php-spider) 4 | [![License](https://poser.pugx.org/vdb/php-spider/license)](https://packagist.org/packages/vdb/php-spider) 5 | 6 | 7 | PHP-Spider Features 8 | ====== 9 | - supports two traversal algorithms: breadth-first and depth-first 10 | - supports crawl depth limiting, queue size limiting and max downloads limiting 11 | - supports adding custom URI discovery logic, based on XPath, CSS selectors, or plain old PHP 12 | - comes with a useful set of URI filters, such as robots.txt and Domain limiting 13 | - supports custom URI filters, both prefetch (URI) and postfetch (Resource content) 14 | - supports custom request handling logic 15 | - supports Basic, Digest and NTLM HTTP authentication. See [example](example/example_basic_auth.php). 
16 | - comes with a useful set of persistence handlers (memory, file) 17 | - supports custom persistence handlers 18 | - collects statistics about the crawl for reporting 19 | - dispatches useful events, allowing developers to add even more custom behavior 20 | - supports a politeness policy 21 | 22 | This Spider does not support JavaScript. 23 | 24 | Installation 25 | ------------ 26 | The easiest way to install PHP-Spider is with [composer](https://getcomposer.org/). Find it on [Packagist](https://packagist.org/packages/vdb/php-spider). 27 | 28 | ```bash 29 | $ composer require vdb/php-spider 30 | ``` 31 | 32 | Usage 33 | ----- 34 | This is a very simple example. This code can be found in [example/example_simple.php](example/example_simple.php). For a more complete example with some logging, caching and filters, see [example/example_complex.php](example/example_complex.php). That file contains a more real-world example. 35 | 36 | >> Note that by default, the spider stops processing when it encounters a 4XX or 5XX error response. To set the spider up to keep processing, please see [the link checker example](https://github.com/mvdbos/php-spider/blob/master/example/example_link_check.php). It uses a custom request handler that configures the default Guzzle request handler to not fail on 4XX and 5XX responses. 37 | 38 | First create the spider 39 | ```php 40 | $spider = new Spider('http://www.dmoz.org'); 41 | ``` 42 | Add a URI discoverer. Without it, the spider does nothing. In this case, we want all `<a>` nodes from a certain `<div>
` 43 | 44 | ```php 45 | $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a")); 46 | ``` 47 | Set some sane options for this example. In this case, we only get the first 10 items from the start page. 48 | 49 | ```php 50 | $spider->getDiscovererSet()->maxDepth = 1; 51 | $spider->getQueueManager()->maxQueueSize = 10; 52 | ``` 53 | Add a listener to collect stats from the Spider and the QueueManager. 54 | There are more components that dispatch events you can use. 55 | 56 | ```php 57 | $statsHandler = new StatsHandler(); 58 | $spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler); 59 | $spider->getDispatcher()->addSubscriber($statsHandler); 60 | ``` 61 | Execute the crawl 62 | 63 | ```php 64 | $spider->crawl(); 65 | ``` 66 | When crawling is done, we could get some info about the crawl 67 | ```php 68 | echo "\n ENQUEUED: " . count($statsHandler->getQueued()); 69 | echo "\n SKIPPED: " . count($statsHandler->getFiltered()); 70 | echo "\n FAILED: " . count($statsHandler->getFailed()); 71 | echo "\n PERSISTED: " . count($statsHandler->getPersisted()); 72 | ``` 73 | Finally we could do some processing on the downloaded resources. In this example, we will echo the title of all resources 74 | ```php 75 | echo "\n\nDOWNLOADED RESOURCES: "; 76 | foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) { 77 | echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text(); 78 | } 79 | 80 | ``` 81 | Contributing 82 | ------------ 83 | Contributing to PHP-Spider is as easy as Forking the repository on Github and submitting a Pull Request. 84 | The Symfony documentation contains an excellent guide for how to do that properly here: [Submitting a Patch](http://symfony.com/doc/current/contributing/code/patches.html#step-1-setup-your-environment). 
85 | 86 | There are a few requirements for a Pull Request to be accepted: 87 | - Follow the coding standards: PHP-Spider follows the coding standards defined in the [PSR-0](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-0.md), [PSR-1](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-1-basic-coding-standard.md) and [PSR-2](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-2-coding-style-guide.md) Coding Style Guides; 88 | - Prove that the code works with unit tests and that coverage remains 100%; 89 | 90 | > Note: An easy way to check if your code conforms to PHP-Spider is by running the script `bin/static-analysis`, which is part of this repo. This will run the following tools, configured for PHP-Spider: PHP lint, PHP CodeSniffer, PHP Mess Detector and Phan. 91 | 92 | > Note: To run PHPUnit with coverage, and to check that coverage == 100%, you can run `bin/coverage-enforce`. 93 | 94 | Support 95 | ------- 96 | For things like reporting bugs and requesting features it is best to create an [issue](https://github.com/mvdbos/php-spider/issues) here on GitHub. It is even better to accompany it with a Pull Request. ;-) 97 | 98 | License 99 | ------- 100 | PHP-Spider is licensed under the MIT license. 101 | -------------------------------------------------------------------------------- /bin/coverage-enforce: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | 3 | 100) { 10 | exit("Usage: coverage-enforce (percentage <= 100)\n"); 11 | } 12 | } else { 13 | $requiredLineCoverage = 100; 14 | } 15 | 16 | print "\n\e[0;37m#############################################################\e[0m"; 17 | print "\n\e[0;37m# Starting Unit Tests with code coverage. Requirement: {$requiredLineCoverage}% #\e[0m"; 18 | print "\n\e[0;37m#############################################################\n\n\e[0m"; 19 | 20 | chdir(realpath(dirname(__FILE__) . 
'/../')); 21 | if (!file_exists("./build/coverage")) { 22 | mkdir("./build/coverage/", 0777, true); 23 | } 24 | require './vendor/autoload.php'; 25 | 26 | passthru("XDEBUG_MODE=coverage ./vendor/bin/phpunit --colors=always --coverage-php build/coverage/coverage.php --coverage-text=build/coverage/coverage.txt --coverage-html build/coverage/html", $status); 27 | if ($status != 0) { 28 | exit($status); 29 | } 30 | 31 | /** @var CodeCoverage $coverage */ 32 | $coverage = require_once('build/coverage/coverage.php'); 33 | $report = $coverage->getReport(); 34 | $percentage = round(($report->numberOfExecutedLines() / $report->numberOfExecutableLines()) * 100, 2); 35 | 36 | if ($percentage < $requiredLineCoverage) { 37 | print "\n\n\e[0;37;41mLine Coverage NOT OK (Actual coverage of {$percentage}% < requirement of {$requiredLineCoverage}%)\e[0m\n\n"; 38 | exit(1); 39 | } else { 40 | print "\n\n\e[0;30;42mUnit tests and line coverage OK (Actual coverage of {$percentage}% >= requirement of {$requiredLineCoverage}%)\e[0m\n\n"; 41 | exit(0); 42 | } 43 | -------------------------------------------------------------------------------- /bin/fix-style: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(git rev-parse --show-toplevel) 4 | 5 | echo -e "\nPHPCBF" 6 | ./vendor/bin/phpcbf --standard=PSR2 tests/ src/ 7 | echo -e "\nPHP-CS-FIXER" 8 | ./vendor/bin/php-cs-fixer fix --verbose 9 | -------------------------------------------------------------------------------- /bin/static-analysis: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Go to the root dir first, so configs and vendor dir is found 4 | cd $(git rev-parse --show-toplevel) 5 | 6 | savedOutput="" 7 | 8 | failOnError=1 9 | if [[ $1 = "false" ]]; then 10 | failOnError=0 11 | fi 12 | 13 | error () { 14 | local status=$1 15 | echo -e "\e[0;37;41mNOK\e[0m" 16 | echo -e "${savedOutput}\n" 17 | if 
[[ ${failOnError} = 1 ]]; then 18 | exit ${status} 19 | fi 20 | } 21 | 22 | success () { 23 | echo -e "\e[0;30;42mOK\e[0m" 24 | } 25 | 26 | runCheck () { 27 | local label=$1 28 | local command=$2 29 | 30 | echo -en "\e[0;37m- $(padTo "${label}" 25): \e[0m" 31 | savedOutput=$(eval "${command}") && success || error $? 32 | } 33 | 34 | padTo () { 35 | local string=$1 36 | local targetLen=$2 37 | 38 | len=$(echo -n "${string}" | wc -c) 39 | while [[ ${len} -lt ${targetLen} ]]; 40 | do 41 | string=${string}"." 42 | let len=len+1 43 | done 44 | echo ${string} 45 | } 46 | 47 | echo -en "\n\e[0;37m############################\e[0m" 48 | echo -en "\n\e[0;37m# Starting static analysis #\e[0m" 49 | echo -e "\n\e[0;37m############################\e[0m\n" 50 | 51 | runCheck "php lint 'src/'" "find src/ -iname "*.php" -print0 | xargs -0 -n1 php -l 2>&1 1>/dev/null" 52 | runCheck "php lint 'tests/'" "find tests/ -iname "*.php" -print0 | xargs -0 -n1 php -l 2>&1 1>/dev/null" 53 | runCheck "phpcs 'src/'" "vendor/bin/phpcs --warning-severity=0 --standard=PSR2 src/" 54 | runCheck "phpcs 'tests/'" "vendor/bin/phpcs --warning-severity=0 --standard=PSR2 tests/" 55 | runCheck "phpmd 'src/'" "vendor/bin/phpmd src/ text phpmd.xml" 56 | runCheck "phpmd 'tests/'" "vendor/bin/phpmd tests/ text phpmd-tests.xml" 57 | runCheck "phan" "PHAN_DISABLE_XDEBUG_WARN=1 vendor/bin/phan --no-progress-bar 2>/dev/null" 58 | 59 | echo -e "\n\n\e[0;37;42mStatic analysis completed successfully.\e[0m\n" 60 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vdb/php-spider", 3 | "type": "application", 4 | "description": "A configurable and extensible PHP web spider", 5 | "keywords": ["spider", "crawler", "scraper"], 6 | "license": "MIT", 7 | "authors": [ 8 | { 9 | "name": "Matthijs van den Bos" 10 | } 11 | ], 12 | "support": { 13 | "issues": 
"https://github.com/matthijsvandenbos/php-spider/issues", 14 | "source": "https://github.com/matthijsvandenbos/php-spider" 15 | }, 16 | "require": { 17 | "php": ">=8.0", 18 | "ext-dom": "*", 19 | "ext-pcntl": "*", 20 | "guzzlehttp/guzzle": "^6.0.0||^7.0.0", 21 | "pdepend/pdepend": "^2.16.1", 22 | "symfony/css-selector": "^3.0.0||^4.0.0||^5.0.0||^6.0||^7.0", 23 | "symfony/dom-crawler": "^3.0.0||^4.0.0||^5.0.0||^6.0||^7.0", 24 | "symfony/finder": "^3.0.0||^4.0.0||^5.0.0||^6.0||^7.0", 25 | "symfony/event-dispatcher": "^4.0.0||^5.0.0||^6.0||^7.0", 26 | "vdb/uri": "^0.3.2", 27 | "spatie/robots-txt": "^2.0", 28 | "phan/phan": "^4.0||^5.0" 29 | }, 30 | "require-dev": { 31 | "phpunit/phpunit": "^9.0.0", 32 | "squizlabs/php_codesniffer": "^3.0.0", 33 | "phpmd/phpmd": "^2.0.0", 34 | "friendsofphp/php-cs-fixer": "^3.69.0" 35 | }, 36 | "autoload": { 37 | "psr-4": { 38 | "VDB\\Spider\\": "src/" 39 | } 40 | }, 41 | "autoload-dev": { 42 | "psr-4": { 43 | "VDB\\Spider\\Tests\\": "tests/" 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /docs/500px-Graph.traversal.example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/500px-Graph.traversal.example.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PHP-Spider.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PHP-Spider.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PHP-Spider" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PHP-Spider" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 14726791e883e0ede4cab1f0574237a0 4 | tags: fbb0d17656682115ca4d033fb2f83ba1 5 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/index.txt: -------------------------------------------------------------------------------- 1 | .. PHP-Spider documentation master file, created by 2 | sphinx-quickstart on Mon Mar 11 00:44:09 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PHP-Spider's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/comment-close.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/comment.png -------------------------------------------------------------------------------- /docs/_build/html/_static/default.css: -------------------------------------------------------------------------------- 1 | /* 2 | * default.css_t 3 | * ~~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- default theme. 6 | * 7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: sans-serif; 18 | font-size: 100%; 19 | background-color: #11303d; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | background-color: #1c4e63; 27 | } 28 | 29 | div.documentwrapper { 30 | float: left; 31 | width: 100%; 32 | } 33 | 34 | div.bodywrapper { 35 | margin: 0 0 0 230px; 36 | } 37 | 38 | div.body { 39 | background-color: #ffffff; 40 | color: #000000; 41 | padding: 0 20px 30px 20px; 42 | } 43 | 44 | div.footer { 45 | color: #ffffff; 46 | width: 100%; 47 | padding: 9px 0 9px 0; 48 | text-align: center; 49 | font-size: 75%; 50 | } 51 | 52 | div.footer a { 53 | color: #ffffff; 54 | text-decoration: underline; 55 | } 56 | 57 | div.related { 58 | background-color: #133f52; 59 | line-height: 30px; 60 | color: #ffffff; 61 | } 62 | 63 | div.related a { 64 | color: #ffffff; 65 | } 66 | 67 | div.sphinxsidebar { 68 | } 69 | 70 | div.sphinxsidebar h3 { 71 | font-family: 'Trebuchet MS', sans-serif; 72 | color: #ffffff; 73 | font-size: 1.4em; 74 | font-weight: normal; 75 | margin: 0; 76 | padding: 0; 77 | } 78 | 79 | div.sphinxsidebar h3 a { 80 | color: #ffffff; 81 | } 82 | 83 | div.sphinxsidebar h4 { 84 | font-family: 'Trebuchet MS', sans-serif; 85 | color: #ffffff; 86 | font-size: 1.3em; 87 | font-weight: normal; 88 | margin: 5px 0 0 0; 89 | padding: 0; 90 | } 91 | 92 | div.sphinxsidebar p { 93 | color: #ffffff; 94 | } 95 | 96 | div.sphinxsidebar p.topless { 97 | margin: 5px 10px 10px 10px; 98 | } 99 | 100 | div.sphinxsidebar ul { 101 | margin: 10px; 102 | padding: 0; 103 | color: #ffffff; 104 | } 105 | 106 | div.sphinxsidebar a { 107 | color: #98dbcc; 108 | } 109 | 110 | div.sphinxsidebar input { 111 | border: 1px solid #98dbcc; 112 | font-family: sans-serif; 113 | font-size: 1em; 114 | } 115 | 116 | 117 | 118 | /* -- hyperlink styles 
------------------------------------------------------ */ 119 | 120 | a { 121 | color: #355f7c; 122 | text-decoration: none; 123 | } 124 | 125 | a:visited { 126 | color: #355f7c; 127 | text-decoration: none; 128 | } 129 | 130 | a:hover { 131 | text-decoration: underline; 132 | } 133 | 134 | 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | div.body h1, 139 | div.body h2, 140 | div.body h3, 141 | div.body h4, 142 | div.body h5, 143 | div.body h6 { 144 | font-family: 'Trebuchet MS', sans-serif; 145 | background-color: #f2f2f2; 146 | font-weight: normal; 147 | color: #20435c; 148 | border-bottom: 1px solid #ccc; 149 | margin: 20px -20px 10px -20px; 150 | padding: 3px 0 3px 10px; 151 | } 152 | 153 | div.body h1 { margin-top: 0; font-size: 200%; } 154 | div.body h2 { font-size: 160%; } 155 | div.body h3 { font-size: 140%; } 156 | div.body h4 { font-size: 120%; } 157 | div.body h5 { font-size: 110%; } 158 | div.body h6 { font-size: 100%; } 159 | 160 | a.headerlink { 161 | color: #c60f0f; 162 | font-size: 0.8em; 163 | padding: 0 4px 0 4px; 164 | text-decoration: none; 165 | } 166 | 167 | a.headerlink:hover { 168 | background-color: #c60f0f; 169 | color: white; 170 | } 171 | 172 | div.body p, div.body dd, div.body li { 173 | text-align: justify; 174 | line-height: 130%; 175 | } 176 | 177 | div.admonition p.admonition-title + p { 178 | display: inline; 179 | } 180 | 181 | div.admonition p { 182 | margin-bottom: 5px; 183 | } 184 | 185 | div.admonition pre { 186 | margin-bottom: 5px; 187 | } 188 | 189 | div.admonition ul, div.admonition ol { 190 | margin-bottom: 5px; 191 | } 192 | 193 | div.note { 194 | background-color: #eee; 195 | border: 1px solid #ccc; 196 | } 197 | 198 | div.seealso { 199 | background-color: #ffc; 200 | border: 1px solid #ff6; 201 | } 202 | 203 | div.topic { 204 | background-color: #eee; 205 | } 206 | 207 | div.warning { 208 | background-color: #ffe4e4; 209 | border: 1px solid #f66; 210 | } 211 | 212 
| p.admonition-title { 213 | display: inline; 214 | } 215 | 216 | p.admonition-title:after { 217 | content: ":"; 218 | } 219 | 220 | pre { 221 | padding: 5px; 222 | background-color: #eeffcc; 223 | color: #333333; 224 | line-height: 120%; 225 | border: 1px solid #ac9; 226 | border-left: none; 227 | border-right: none; 228 | } 229 | 230 | tt { 231 | background-color: #ecf0f3; 232 | padding: 0 1px 0 1px; 233 | font-size: 0.95em; 234 | } 235 | 236 | th { 237 | background-color: #ede; 238 | } 239 | 240 | .warning tt { 241 | background: #efc2c2; 242 | } 243 | 244 | .note tt { 245 | background: #d6d6d6; 246 | } 247 | 248 | .viewcode-back { 249 | font-family: sans-serif; 250 | } 251 | 252 | div.viewcode-block:target { 253 | background-color: #f4debf; 254 | border-top: 1px solid #ac9; 255 | border-bottom: 1px solid #ac9; 256 | } -------------------------------------------------------------------------------- /docs/_build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /docs/_build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/down.png -------------------------------------------------------------------------------- /docs/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/file.png -------------------------------------------------------------------------------- /docs/_build/html/_static/minus.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 8 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 9 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 10 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 11 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 12 | .highlight .ge { font-style: italic } /* Generic.Emph */ 13 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 14 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 15 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 16 | .highlight .go { color: #333333 } /* Generic.Output */ 17 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 18 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 19 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 
20 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 21 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 22 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 23 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 24 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 26 | .highlight .kt { color: #902000 } /* Keyword.Type */ 27 | .highlight .m { color: #208050 } /* Literal.Number */ 28 | .highlight .s { color: #4070a0 } /* Literal.String */ 29 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 30 | .highlight .nb { color: #007020 } /* Name.Builtin */ 31 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 32 | .highlight .no { color: #60add5 } /* Name.Constant */ 33 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 34 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 35 | .highlight .ne { color: #007020 } /* Name.Exception */ 36 | .highlight .nf { color: #06287e } /* Name.Function */ 37 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 38 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 39 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 40 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 41 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 42 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 43 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 44 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 45 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 46 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 47 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 48 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 49 | .highlight .sd { 
color: #4070a0; font-style: italic } /* Literal.String.Doc */ 50 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 51 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 52 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 53 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 54 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 55 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 56 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 57 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 58 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 59 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 60 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 61 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 62 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_build/html/_static/sidebar.js: -------------------------------------------------------------------------------- 1 | /* 2 | * sidebar.js 3 | * ~~~~~~~~~~ 4 | * 5 | * This script makes the Sphinx sidebar collapsible. 6 | * 7 | * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds 8 | * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton 9 | * used to collapse and expand the sidebar. 10 | * 11 | * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden 12 | * and the width of the sidebar and the margin-left of the document 13 | * are decreased. When the sidebar is expanded the opposite happens. 14 | * This script saves a per-browser/per-session cookie used to 15 | * remember the position of the sidebar among the pages. 16 | * Once the browser is closed the cookie is deleted and the position 17 | * reset to the default (expanded). 
18 | * 19 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 20 | * :license: BSD, see LICENSE for details. 21 | * 22 | */ 23 | 24 | $(function() { 25 | // global elements used by the functions. 26 | // the 'sidebarbutton' element is defined as global after its 27 | // creation, in the add_sidebar_button function 28 | var bodywrapper = $('.bodywrapper'); 29 | var sidebar = $('.sphinxsidebar'); 30 | var sidebarwrapper = $('.sphinxsidebarwrapper'); 31 | 32 | // for some reason, the document has no sidebar; do not run into errors 33 | if (!sidebar.length) return; 34 | 35 | // original margin-left of the bodywrapper and width of the sidebar 36 | // with the sidebar expanded 37 | var bw_margin_expanded = bodywrapper.css('margin-left'); 38 | var ssb_width_expanded = sidebar.width(); 39 | 40 | // margin-left of the bodywrapper and width of the sidebar 41 | // with the sidebar collapsed 42 | var bw_margin_collapsed = '.8em'; 43 | var ssb_width_collapsed = '.8em'; 44 | 45 | // colors used by the current theme 46 | var dark_color = $('.related').css('background-color'); 47 | var light_color = $('.document').css('background-color'); 48 | 49 | function sidebar_is_collapsed() { 50 | return sidebarwrapper.is(':not(:visible)'); 51 | } 52 | 53 | function toggle_sidebar() { 54 | if (sidebar_is_collapsed()) 55 | expand_sidebar(); 56 | else 57 | collapse_sidebar(); 58 | } 59 | 60 | function collapse_sidebar() { 61 | sidebarwrapper.hide(); 62 | sidebar.css('width', ssb_width_collapsed); 63 | bodywrapper.css('margin-left', bw_margin_collapsed); 64 | sidebarbutton.css({ 65 | 'margin-left': '0', 66 | 'height': bodywrapper.height() 67 | }); 68 | sidebarbutton.find('span').text('»'); 69 | sidebarbutton.attr('title', _('Expand sidebar')); 70 | document.cookie = 'sidebar=collapsed'; 71 | } 72 | 73 | function expand_sidebar() { 74 | bodywrapper.css('margin-left', bw_margin_expanded); 75 | sidebar.css('width', ssb_width_expanded); 76 | sidebarwrapper.show(); 77 | 
sidebarbutton.css({ 78 | 'margin-left': ssb_width_expanded-12, 79 | 'height': bodywrapper.height() 80 | }); 81 | sidebarbutton.find('span').text('«'); 82 | sidebarbutton.attr('title', _('Collapse sidebar')); 83 | document.cookie = 'sidebar=expanded'; 84 | } 85 | 86 | function add_sidebar_button() { 87 | sidebarwrapper.css({ 88 | 'float': 'left', 89 | 'margin-right': '0', 90 | 'width': ssb_width_expanded - 28 91 | }); 92 | // create the button 93 | sidebar.append( 94 | '
«
' 95 | ); 96 | var sidebarbutton = $('#sidebarbutton'); 97 | light_color = sidebarbutton.css('background-color'); 98 | // find the height of the viewport to center the '<<' in the page 99 | var viewport_height; 100 | if (window.innerHeight) 101 | viewport_height = window.innerHeight; 102 | else 103 | viewport_height = $(window).height(); 104 | sidebarbutton.find('span').css({ 105 | 'display': 'block', 106 | 'margin-top': (viewport_height - sidebar.position().top - 20) / 2 107 | }); 108 | 109 | sidebarbutton.click(toggle_sidebar); 110 | sidebarbutton.attr('title', _('Collapse sidebar')); 111 | sidebarbutton.css({ 112 | 'color': '#FFFFFF', 113 | 'border-left': '1px solid ' + dark_color, 114 | 'font-size': '1.2em', 115 | 'cursor': 'pointer', 116 | 'height': bodywrapper.height(), 117 | 'padding-top': '1px', 118 | 'margin-left': ssb_width_expanded - 12 119 | }); 120 | 121 | sidebarbutton.hover( 122 | function () { 123 | $(this).css('background-color', dark_color); 124 | }, 125 | function () { 126 | $(this).css('background-color', light_color); 127 | } 128 | ); 129 | } 130 | 131 | function set_position_from_cookie() { 132 | if (!document.cookie) 133 | return; 134 | var items = document.cookie.split(';'); 135 | for(var k=0; k 7 | 8 | 9 | 10 | 11 | 12 | 13 | Index — PHP-Spider master documentation 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
42 | 43 |
44 |
45 |
46 |
47 | 48 | 49 |

Index

50 | 51 |
52 | 53 |
54 | 55 | 56 |
57 |
58 |
59 |
60 |
61 | 62 | 63 | 64 | 76 | 77 |
78 |
79 |
80 |
81 | 90 | 94 | 95 | -------------------------------------------------------------------------------- /docs/_build/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Welcome to PHP-Spider’s documentation! — PHP-Spider master documentation 12 | 13 | 14 | 15 | 16 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 40 | 41 |
42 |
43 |
44 |
45 | 46 |
47 |

Welcome to PHP-Spider’s documentation!

48 |

Contents:

49 |
50 |
    51 |
52 |
53 |
54 |
55 |

Indices and tables

56 | 61 |
62 | 63 | 64 |
65 |
66 |
67 |
68 |
69 |

Table Of Contents

70 | 76 | 77 |

This Page

78 | 82 | 94 | 95 |
96 |
97 |
98 |
99 | 108 | 112 | 113 | -------------------------------------------------------------------------------- /docs/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/docs/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Search — PHP-Spider master documentation 12 | 13 | 14 | 15 | 16 | 25 | 26 | 27 | 28 | 29 | 30 | 33 | 34 | 35 | 36 | 37 | 46 | 47 |
48 |
49 |
50 |
51 | 52 |

Search

53 |
54 | 55 |

56 | Please activate JavaScript to enable the search 57 | functionality. 58 |

59 |
60 |

61 | From here you can search these documents. Enter your search 62 | words into the box below and click "search". Note that the search 63 | function will automatically search for all of the words. Pages 64 | containing fewer words won't appear in the result list. 65 |

66 |
67 | 68 | 69 | 70 |
71 | 72 |
73 | 74 |
75 | 76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 | 94 | 98 | 99 | -------------------------------------------------------------------------------- /docs/_build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({objects:{},terms:{index:0,search:0,document:0,welcom:0,modul:0,spider:0,indic:0,content:0,tabl:0,php:0,page:0},objtypes:{},titles:["Welcome to PHP-Spider’s documentation!"],objnames:{},filenames:["index"]}) -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. PHP-Spider documentation master file, created by 2 | sphinx-quickstart on Mon Mar 11 00:44:09 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PHP-Spider's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /example/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvdbos/php-spider/28a8dbaa28c385cef7b6dc12489149d178b16054/example/cache/.gitkeep -------------------------------------------------------------------------------- /example/example_basic_auth.php: -------------------------------------------------------------------------------- 1 | setClient(new Client(['auth' => ['foo', 'bar', 'basic'], 'http_errors' => false])); 15 | $spider->getDownloader()->setRequestHandler($requestHandler); 16 | 17 | // Execute crawl 18 | $spider->crawl(); 19 | 20 | // Finally we could do some processing on the downloaded resources 21 | // In this example, we will echo the title of all resources 22 | echo 
"\n\nRESPONSE: "; 23 | foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) { 24 | echo "\n" . $resource->getResponse()->getStatusCode() . ": " . $resource->getResponse()->getReasonPhrase(); 25 | echo "\n" . $resource->getResponse()->getBody(); 26 | } 27 | -------------------------------------------------------------------------------- /example/example_complex.php: -------------------------------------------------------------------------------- 1 | getDownloader()->setDownloadLimit(10); 32 | 33 | $statsHandler = new StatsHandler(); 34 | $LogHandler = new LogHandler(); 35 | 36 | $queueManager = new InMemoryQueueManager(); 37 | 38 | $queueManager->getDispatcher()->addSubscriber($statsHandler); 39 | $queueManager->getDispatcher()->addSubscriber($LogHandler); 40 | $spider->getDownloader()->getDispatcher()->addSubscriber($statsHandler); 41 | 42 | // Set some sane defaults for this example. 43 | // We only visit the first level of http://dmoztools.net. We stop at 10 queued resources 44 | $spider->getDiscovererSet()->maxDepth = 1; 45 | 46 | // This time, we set the traversal algorithm to breadth-first. The default is depth-first 47 | $queueManager->setTraversalAlgorithm(QueueManagerInterface::ALGORITHM_BREADTH_FIRST); 48 | 49 | $spider->setQueueManager($queueManager); 50 | 51 | // We add an URI discoverer. Without it, the spider wouldn't get past the seed resource. 52 | $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a")); 53 | 54 | // Let's tell the spider to save all found resources on the filesystem 55 | $spider->getDownloader()->setPersistenceHandler( 56 | new FileSerializedResourcePersistenceHandler(__DIR__ . '/results') 57 | ); 58 | 59 | // Add some prefetch filters. These are executed before a resource is requested. 
60 | // The more you have of these, the less HTTP requests and work for the processors 61 | $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http', 'https'))); 62 | $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains)); 63 | $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter()); 64 | $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter()); 65 | $spider->getDiscovererSet()->addFilter(new RobotsTxtDisallowFilter($seed, 'PHP-Spider')); 66 | 67 | // We add an event listener to the crawler that implements a politeness policy. 68 | // We wait 100ms between every request to the same domain 69 | $politenessPolicyEventListener = new PolitenessPolicyListener(100); 70 | $spider->getDownloader()->getDispatcher()->addListener( 71 | SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, 72 | array($politenessPolicyEventListener, 'onCrawlPreRequest') 73 | ); 74 | 75 | $spider->getDispatcher()->addSubscriber($statsHandler); 76 | $spider->getDispatcher()->addSubscriber($LogHandler); 77 | 78 | // Let's add something to enable us to stop the script 79 | $spider->getDispatcher()->addListener( 80 | SpiderEvents::SPIDER_CRAWL_USER_STOPPED, 81 | function (GenericEvent $event) { 82 | echo "\nCrawl aborted by user.\n"; 83 | exit(); 84 | } 85 | ); 86 | 87 | // Let's add a CLI progress meter for fun 88 | echo "\nCrawling"; 89 | $spider->getDownloader()->getDispatcher()->addListener( 90 | SpiderEvents::SPIDER_CRAWL_POST_REQUEST, 91 | function (GenericEvent $event) { 92 | echo '.'; 93 | } 94 | ); 95 | 96 | // Set up some caching, logging and profiling on the HTTP client of the spider 97 | $guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient(); 98 | $tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']); 99 | $guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer'); 100 | 101 | // Execute the crawl 102 | $spider->crawl(); 103 | 104 | // 
Report 105 | echo "\n ENQUEUED: " . count($statsHandler->getQueued()); 106 | echo "\n SKIPPED: " . count($statsHandler->getFiltered()); 107 | echo "\n FAILED: " . count($statsHandler->getFailed()); 108 | echo "\n PERSISTED: " . count($statsHandler->getPersisted()); 109 | 110 | // With the information from some of plugins and listeners, we can determine some metrics 111 | $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2); 112 | $totalTime = round(microtime(true) - $start, 2); 113 | $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2); 114 | echo "\n\nMETRICS:"; 115 | echo "\n PEAK MEM USAGE: " . $peakMem . 'MB'; 116 | echo "\n TOTAL TIME: " . $totalTime . 's'; 117 | echo "\n REQUEST TIME: " . $timerMiddleware->getTotal() . 's'; 118 | echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's'; 119 | echo "\n PROCESSING TIME: " . ($totalTime - $timerMiddleware->getTotal() - $totalDelay) . 's'; 120 | 121 | // Finally we could start some processing on the downloaded resources 122 | echo "\n\nDOWNLOADED RESOURCES: "; 123 | $downloaded = $spider->getDownloader()->getPersistenceHandler(); 124 | foreach ($downloaded as $resource) { 125 | $title = $resource->getCrawler()->filterXpath('//title')->text(); 126 | $contentLength = (int)$resource->getResponse()->getHeaderLine('Content-Length'); 127 | $contentLengthString = ''; 128 | if ($contentLength >= 1024) { 129 | $contentLengthString = str_pad("[" . round($contentLength / 1024), 4, ' ', STR_PAD_LEFT) . "KB]"; 130 | } else { 131 | $contentLengthString = str_pad("[" . $contentLength, 5, ' ', STR_PAD_LEFT) . "B]"; 132 | } 133 | $uri = $resource->getUri()->toString(); 134 | echo "\n - " . $contentLengthString . " $title ($uri)"; 135 | } 136 | echo "\n"; 137 | 138 | echo "\nFAILED RESOURCES: "; 139 | foreach ($statsHandler->getFailed() as $uri => $message) { 140 | echo "\n - " . $uri . " failed because: " . 
$message; 141 | } -------------------------------------------------------------------------------- /example/example_complex_bootstrap.php: -------------------------------------------------------------------------------- 1 | add('Example', __DIR__ . '/lib'); 14 | 15 | // activate the autoloader 16 | $loader->register(); 17 | 18 | $timerMiddleware = new GuzzleTimerMiddleware(); 19 | -------------------------------------------------------------------------------- /example/example_link_check.php: -------------------------------------------------------------------------------- 1 | getDownloader()->setDownloadLimit(10); 35 | 36 | // Set a custom request handler that does not throw exceptions on failed requests 37 | $spider->getDownloader()->setRequestHandler(new \Example\LinkCheckRequestHandler()); 38 | 39 | $statsHandler = new StatsHandler(); 40 | $LogHandler = new LogHandler(); 41 | 42 | $queueManager = new InMemoryQueueManager(); 43 | 44 | $queueManager->getDispatcher()->addSubscriber($statsHandler); 45 | $queueManager->getDispatcher()->addSubscriber($LogHandler); 46 | 47 | // Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources 48 | $spider->getDiscovererSet()->maxDepth = 1; 49 | 50 | // This time, we set the traversal algorithm to breadth-first. The default is depth-first 51 | $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST); 52 | 53 | $spider->setQueueManager($queueManager); 54 | 55 | // We add an URI discoverer. Without it, the spider wouldn't get past the seed resource. 
56 | //$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//*[@id='cat-list-content-2']/div/a")); 57 | $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a")); 58 | 59 | // Let's tell the spider to save all found resources on the filesystem 60 | $spider->getDownloader()->setPersistenceHandler( 61 | new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results') 62 | ); 63 | 64 | // Add some prefetch filters. These are executed before a resource is requested. 65 | // The more you have of these, the less HTTP requests and work for the processors 66 | //$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http'))); 67 | $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('https', 'http'))); 68 | $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains)); 69 | $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter()); 70 | $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter()); 71 | 72 | // We add an eventlistener to the crawler that implements a politeness policy. 
We wait 100ms between every request to the same domain 73 | $politenessPolicyEventListener = new PolitenessPolicyListener(100); 74 | $spider->getDownloader()->getDispatcher()->addListener( 75 | SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, 76 | array($politenessPolicyEventListener, 'onCrawlPreRequest') 77 | ); 78 | 79 | $spider->getDispatcher()->addSubscriber($statsHandler); 80 | $spider->getDispatcher()->addSubscriber($LogHandler); 81 | 82 | // Let's add something to enable us to stop the script 83 | $spider->getDispatcher()->addListener( 84 | SpiderEvents::SPIDER_CRAWL_USER_STOPPED, 85 | function (GenericEvent $event) { 86 | echo "\nCrawl aborted by user.\n"; 87 | exit(); 88 | } 89 | ); 90 | 91 | // Let's add a CLI progress meter for fun 92 | echo "\nCrawling"; 93 | $spider->getDownloader()->getDispatcher()->addListener( 94 | SpiderEvents::SPIDER_CRAWL_POST_REQUEST, 95 | function (GenericEvent $event) { 96 | echo '.'; 97 | } 98 | ); 99 | 100 | // Set up some caching, logging and profiling on the HTTP client of the spider 101 | $guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient(); 102 | $tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']); 103 | $guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer'); 104 | 105 | // Execute the crawl 106 | $result = $spider->crawl(); 107 | 108 | // Report 109 | echo "\n ENQUEUED: " . count($statsHandler->getQueued()); 110 | echo "\n SKIPPED: " . count($statsHandler->getFiltered()); 111 | echo "\n FAILED: " . count($statsHandler->getFailed()); 112 | echo "\n PERSISTED: " . 
count($statsHandler->getPersisted()); 113 | 114 | // With the information from some of plugins and listeners, we can determine some metrics 115 | $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2); 116 | $totalTime = round(microtime(true) - $start, 2); 117 | $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2); 118 | echo "\n\nMETRICS:"; 119 | echo "\n PEAK MEM USAGE: " . $peakMem . 'MB'; 120 | echo "\n TOTAL TIME: " . $totalTime . 's'; 121 | echo "\n REQUEST TIME: " . $timerMiddleware->getTotal() . 's'; 122 | echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's'; 123 | echo "\n PROCESSING TIME: " . ($totalTime - $timerMiddleware->getTotal() - $totalDelay) . 's'; 124 | 125 | // Finally we could start some processing on the downloaded resources 126 | echo "\n\nDOWNLOADED RESOURCES: "; 127 | $downloaded = $spider->getDownloader()->getPersistenceHandler(); 128 | 129 | /** @var \VDB\Spider\Resource $resource */ 130 | foreach ($downloaded as $resource) { 131 | $code = $resource->getResponse()->getStatusCode(); 132 | $reason = $resource->getResponse()->getReasonPhrase(); 133 | $title = $resource->getCrawler()->filterXpath('//title')->text(""); 134 | $contentLength = (int)$resource->getResponse()->getHeaderLine('Content-Length'); 135 | $contentLengthString = ''; 136 | if ($contentLength >= 1024) { 137 | $contentLengthString = str_pad("[" . round($contentLength / 1024), 4, ' ', STR_PAD_LEFT) . "KB]"; 138 | } else { 139 | $contentLengthString = str_pad("[" . $contentLength, 5, ' ', STR_PAD_LEFT) . "B]"; 140 | } 141 | $uri = $resource->getUri()->toString(); 142 | echo "\n - " . $contentLengthString . " $title ($uri) " .$code ." ". 
$reason; 143 | } 144 | echo "\n"; 145 | -------------------------------------------------------------------------------- /example/example_persistent_request_params.php: -------------------------------------------------------------------------------- 1 | setClient(new Client(['auth' => ['foo', 'bar', 'basic'], 'http_errors' => false])); 15 | $spider->getDownloader()->setRequestHandler($requestHandler); 16 | 17 | // Execute crawl 18 | $spider->crawl(); 19 | 20 | // Finally we could do some processing on the downloaded resources 21 | // In this example, we will echo the title of all resources 22 | echo "\n\nRESPONSE: "; 23 | foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) { 24 | echo "\n" . $resource->getResponse()->getStatusCode() . ": " . $resource->getResponse()->getReasonPhrase(); 25 | echo "\n" . $resource->getResponse()->getBody(); 26 | } 27 | -------------------------------------------------------------------------------- /example/example_simple.php: -------------------------------------------------------------------------------- 1 | tags from a certain
16 | $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a")); 17 | 18 | // Set some sane options for this example. In this case, we only get the first 10 items from the start page. 19 | $spider->getDiscovererSet()->maxDepth = 1; 20 | $spider->getQueueManager()->maxQueueSize = 10; 21 | 22 | // Let's add something to enable us to stop the script 23 | $spider->getDispatcher()->addListener( 24 | SpiderEvents::SPIDER_CRAWL_USER_STOPPED, 25 | function (Event $event) { 26 | echo "\nCrawl aborted by user.\n"; 27 | exit(); 28 | } 29 | ); 30 | 31 | // Add a listener to collect stats to the Spider and the QueueManager. 32 | // There are more components that dispatch events you can use. 33 | $statsHandler = new StatsHandler(); 34 | $spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler); 35 | $spider->getDispatcher()->addSubscriber($statsHandler); 36 | 37 | // Execute crawl 38 | $spider->crawl(); 39 | 40 | // Build a report 41 | echo "\n ENQUEUED: " . count($statsHandler->getQueued()); 42 | echo "\n SKIPPED: " . count($statsHandler->getFiltered()); 43 | echo "\n FAILED: " . count($statsHandler->getFailed()); 44 | echo "\n PERSISTED: " . count($statsHandler->getPersisted()); 45 | 46 | // Finally we could do some processing on the downloaded resources 47 | // In this example, we will echo the title of all resources 48 | echo "\n\nDOWNLOADED RESOURCES: "; 49 | foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) { 50 | echo "\n - " . 
$resource->getCrawler()->filterXpath('//title')->text(); 51 | } 52 | -------------------------------------------------------------------------------- /example/lib/Example/GuzzleTimerMiddleware.php: -------------------------------------------------------------------------------- 1 | start = microtime(true); 24 | } 25 | 26 | /** 27 | * @param RequestInterface $request 28 | * @param array $options 29 | * @param PromiseInterface $response 30 | * @return void 31 | * 32 | * @SuppressWarnings(PHPMD.UnusedFormalParameter) 33 | */ 34 | public function onResponse(RequestInterface $request, array $options, PromiseInterface $response): void 35 | { 36 | $duration = microtime(true) - $this->start; 37 | $this->total = $this->total + $duration; 38 | } 39 | 40 | public function getTotal(): float 41 | { 42 | return round($this->total, 2); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /example/lib/Example/LinkCheckRequestHandler.php: -------------------------------------------------------------------------------- 1 | 4 | * @copyright 2013 Matthijs van den Bos 5 | */ 6 | 7 | namespace Example; 8 | 9 | use VDB\Spider\RequestHandler\GuzzleRequestHandler; 10 | use VDB\Spider\Resource; 11 | use VDB\Spider\Uri\DiscoveredUri; 12 | 13 | class LinkCheckRequestHandler extends GuzzleRequestHandler 14 | { 15 | public function request(DiscoveredUri $uri): Resource 16 | { 17 | $response = $this->getClient()->get($uri->toString(), ['http_errors' => false]); 18 | return new Resource($uri, $response); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /example/lib/Example/LogHandler.php: -------------------------------------------------------------------------------- 1 | 4 | * @copyright 2013 Matthijs van den Bos 5 | */ 6 | 7 | namespace Example; 8 | 9 | use Symfony\Component\EventDispatcher\EventSubscriberInterface; 10 | use Symfony\Component\EventDispatcher\GenericEvent; 11 | use 
VDB\Uri\UriInterface; 12 | use VDB\Spider\Event\SpiderEvents; 13 | 14 | class LogHandler implements EventSubscriberInterface 15 | { 16 | private $debug = false; 17 | 18 | public function __construct($debug = false) 19 | { 20 | $this->debug = $debug; 21 | } 22 | 23 | public static function getSubscribedEvents(): array 24 | { 25 | return array( 26 | SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH => 'logFiltered', 27 | SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH => 'logFiltered', 28 | SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE => 'logQueued', 29 | SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED => 'logPersisted', 30 | SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST => 'logFailed' 31 | ); 32 | } 33 | 34 | protected function logEvent($name, GenericEvent $event) 35 | { 36 | if ($this->debug === true) { 37 | echo "\n[$name]\t:" . $event->getArgument('uri')->toString(); 38 | } 39 | } 40 | 41 | public function logQueued(GenericEvent $event) 42 | { 43 | $this->logEvent('queued', $event); 44 | } 45 | 46 | public function logPersisted(GenericEvent $event) 47 | { 48 | $this->logEvent('persisted', $event); 49 | } 50 | 51 | public function logFiltered(GenericEvent $event) 52 | { 53 | $this->logEvent('filtered', $event); 54 | } 55 | 56 | public function logFailed(GenericEvent $event) 57 | { 58 | $this->logEvent('failed', $event); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /example/lib/Example/StatsHandler.php: -------------------------------------------------------------------------------- 1 | 4 | * @copyright 2021 Matthijs van den Bos 5 | */ 6 | 7 | namespace Example; 8 | 9 | use Symfony\Component\EventDispatcher\EventSubscriberInterface; 10 | use Symfony\Component\EventDispatcher\GenericEvent; 11 | use VDB\Spider\Event\SpiderEvents; 12 | use VDB\Uri\UriInterface; 13 | 14 | class StatsHandler implements EventSubscriberInterface 15 | { 16 | /** @var string */ 17 | protected string $spiderId; 18 | 19 | protected array $persisted = array(); 
20 | 21 | protected array $queued = array(); 22 | 23 | protected array $filtered = array(); 24 | 25 | protected array $failed = array(); 26 | 27 | public static function getSubscribedEvents(): array 28 | { 29 | return array( 30 | SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH => 'addToFiltered', 31 | SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH => 'addToFiltered', 32 | SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE => 'addToQueued', 33 | SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED => 'addToPersisted', 34 | SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST => 'addToFailed' 35 | ); 36 | } 37 | 38 | public function addToQueued(GenericEvent $event): void 39 | { 40 | $this->queued[] = $event->getArgument('uri'); 41 | } 42 | 43 | public function addToPersisted(GenericEvent $event): void 44 | { 45 | $this->persisted[] = $event->getArgument('uri'); 46 | } 47 | 48 | public function addToFiltered(GenericEvent $event): void 49 | { 50 | $this->filtered[] = $event->getArgument('uri'); 51 | } 52 | 53 | public function addToFailed(GenericEvent $event): void 54 | { 55 | $this->failed[$event->getArgument('uri')->toString()] = $event->getArgument('message'); 56 | } 57 | 58 | /** 59 | * @return UriInterface[] 60 | */ 61 | public function getQueued(): array 62 | { 63 | return $this->queued; 64 | } 65 | 66 | /** 67 | * @return UriInterface[] 68 | */ 69 | public function getPersisted(): array 70 | { 71 | return $this->persisted; 72 | } 73 | 74 | /** 75 | * @return FilterableInterface[] 76 | */ 77 | public function getFiltered(): array 78 | { 79 | return $this->filtered; 80 | } 81 | 82 | /** 83 | * @return array of form array($uriString, $reason) 84 | */ 85 | public function getFailed(): array 86 | { 87 | return $this->failed; 88 | } 89 | /** Renders a short summary of the collected counts; the spider id is '' while the $spiderId property is unset. */ 90 | public function toString(): string 91 | { 92 | $spiderId = $this->spiderId ?? ''; 93 | $queued = $this->getQueued(); 94 | $filtered = $this->getFiltered(); 95 | $failed = $this->getFailed(); 96 | 97 | $string = ''; 98 | 99 | $string .= "\n\nSPIDER ID: " . 
$spiderId; 100 | $string .= "\n ENQUEUED: " . count($queued); 101 | $string .= "\n SKIPPED: " . count($filtered); 102 | $string .= "\n FAILED: " . count($failed); 103 | 104 | return $string; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /phpmd-tests.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | PHP Spider Ruleset for test code 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | true 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /phpmd.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | PHP Spider Ruleset for source code 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | src 12 | 13 | 14 | 15 | 16 | ./tests 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/Discoverer/CrawlerDiscoverer.php: -------------------------------------------------------------------------------- 1 | 16 | * @copyright 2021 Matthijs van den Bos 17 | */ 18 | abstract class CrawlerDiscoverer extends Discoverer implements DiscovererInterface 19 | { 20 | protected string $selector; 21 | 22 | /** 23 | * @param string $selector 24 | */ 25 | public function __construct(string $selector) 26 | { 27 | $this->selector = $selector; 28 | } 29 | 30 | /** 31 | * @param Resource $resource 32 | * @return Crawler 33 | */ 34 | abstract protected function getFilteredCrawler(Resource 
$resource): Crawler; 35 | 36 | /** 37 | * @param Resource $resource 38 | * @return DiscoveredUri[] 39 | * @throws ErrorException 40 | */ 41 | public function discover(Resource $resource): array 42 | { 43 | $crawler = $this->getFilteredCrawler($resource); 44 | 45 | $uris = array(); 46 | foreach ($crawler as $node) { 47 | /**@var $node DOMElement */ 48 | try { 49 | $baseUri = $resource->getUri()->toString(); 50 | $href = $node->getAttribute('href'); 51 | $depthFound = $resource->getUri()->getDepthFound() + 1; 52 | 53 | if (substr($href, 0, 4) === "http") { 54 | $uris[] = new DiscoveredUri(new Http($href, $baseUri), $depthFound); 55 | } else { 56 | $uris[] = new DiscoveredUri(new Uri($href, $baseUri), $depthFound); 57 | } 58 | } catch (UriSyntaxException $e) { 59 | // do nothing. We simply ignore invalid URIs, since we don't control what we crawl. 60 | } 61 | } 62 | return $uris; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/Discoverer/CssSelectorDiscoverer.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright 2021 Matthijs van den Bos 11 | */ 12 | class CssSelectorDiscoverer extends CrawlerDiscoverer 13 | { 14 | protected function getFilteredCrawler(Resource $resource): Crawler 15 | { 16 | return $resource->getCrawler()->filter($this->selector); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Discoverer/Discoverer.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright 2021 Matthijs van den Bos 11 | */ 12 | abstract class Discoverer implements DiscovererInterface 13 | { 14 | public function getName(): string 15 | { 16 | return get_class($this); 17 | } 18 | 19 | /** 20 | * @param Resource $resource 21 | * @return DiscoveredUri[] 22 | */ 23 | abstract public function discover(Resource $resource): array; 24 | } 25 | 
-------------------------------------------------------------------------------- /src/Discoverer/DiscovererInterface.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright 2021 Matthijs van den Bos 11 | */ 12 | interface DiscovererInterface 13 | { 14 | /** 15 | * @param Resource $resource 16 | * @return DiscoveredUri[] 17 | */ 18 | public function discover(Resource $resource): array; 19 | 20 | /** 21 | * @return string The name of this discoverer 22 | */ 23 | public function getName(): string; 24 | } 25 | -------------------------------------------------------------------------------- /src/Discoverer/DiscovererSet.php: -------------------------------------------------------------------------------- 1 | set($discoverer); 34 | } 35 | } 36 | 37 | /** 38 | * @param DiscoveredUri $uri 39 | * 40 | * Mark an Uri as already seen. 41 | * 42 | * If it already exists, it is not overwritten, since we want to keep the 43 | * first depth it was found at. 
44 | */ 45 | private function markSeen(DiscoveredUri $uri): void 46 | { 47 | $uriString = $uri->normalize()->toString(); 48 | if (!array_key_exists($uriString, $this->alreadySeenUris)) { 49 | $this->alreadySeenUris[$uriString] = $uri->getDepthFound(); 50 | } 51 | } 52 | 53 | /** 54 | * @param DiscoveredUri $uri 55 | * @return bool Returns true if this URI was found at max depth 56 | */ 57 | private function isAtMaxDepth(DiscoveredUri $uri): bool 58 | { 59 | return $uri->getDepthFound() === $this->maxDepth; 60 | } 61 | 62 | /** 63 | * @param Resource $resource 64 | * @return DiscoveredUri[] 65 | */ 66 | public function discover(Resource $resource): array 67 | { 68 | $this->markSeen($resource->getUri()); 69 | 70 | if ($this->isAtMaxDepth($resource->getUri())) { 71 | return []; 72 | } 73 | 74 | $discoveredUris = []; 75 | 76 | foreach ($this->discoverers as $discoverer) { 77 | $discoveredUris = array_merge($discoveredUris, $discoverer->discover($resource)); 78 | } 79 | 80 | $this->normalize($discoveredUris); 81 | $this->removeDuplicates($discoveredUris); 82 | $this->filterAlreadySeen($discoveredUris); 83 | $this->filter($discoveredUris); 84 | 85 | // reset the indexes of the discovered URIs after filtering 86 | $discoveredUris = array_values($discoveredUris); 87 | 88 | foreach ($discoveredUris as $uri) { 89 | $this->markSeen($uri); 90 | } 91 | 92 | return $discoveredUris; 93 | } 94 | 95 | /** 96 | * Sets a discoverer. 
97 | * 98 | * @param DiscovererInterface $discoverer The discoverer instance 99 | */ 100 | public function set(DiscovererInterface $discoverer): void 101 | { 102 | $this->discoverers[$discoverer->getName()] = $discoverer; 103 | } 104 | 105 | public function addFilter(PreFetchFilterInterface $filter): void 106 | { 107 | $this->filters[] = $filter; 108 | } 109 | 110 | /** 111 | * @param UriInterface[] $discoveredUris 112 | */ 113 | private function normalize(array &$discoveredUris): void 114 | { 115 | /** @var DiscoveredUri[] $discoveredUris */ 116 | foreach ($discoveredUris as $k => $uri) { 117 | $discoveredUris[$k] = $uri->normalize(); 118 | } 119 | } 120 | 121 | /** 122 | * @param UriInterface[] $discoveredUris 123 | */ 124 | private function filterAlreadySeen(array &$discoveredUris): void 125 | { 126 | foreach ($discoveredUris as $k => $uri) { 127 | if (array_key_exists($uri->toString(), $this->alreadySeenUris)) { 128 | unset($discoveredUris[$k]); 129 | } 130 | } 131 | } 132 | 133 | /** 134 | * Filter out any URI that matches any of the filters. 135 | * A URI is dropped at its first matching filter; the remaining filters are not consulted for it. 136 | * @param UriInterface[] $discoveredUris 137 | */ 138 | private function filter(array &$discoveredUris): void 139 | { 140 | foreach ($discoveredUris as $k => $uri) { 141 | foreach ($this->filters as $filter) { 142 | if ($filter->match($uri)) { 143 | unset($discoveredUris[$k]); 144 | break; 145 | } 146 | } 147 | } 148 | } 149 | 150 | /** 151 | * @param UriInterface[] $discoveredUris 152 | */ 153 | private function removeDuplicates(array &$discoveredUris): void 154 | { 155 | // make sure there are no duplicates in the list 156 | $tmp = array(); 157 | foreach ($discoveredUris as $k => $uri) { 158 | $tmp[$k] = $uri->toString(); 159 | } 160 | 161 | // Find duplicates in temporary array 162 | $tmp = array_unique($tmp); 163 | 164 | // Remove the duplicates from original array 165 | foreach ($discoveredUris as $k => $uri) { 166 | if (!array_key_exists($k, $tmp)) { 167 | unset($discoveredUris[$k]); 168 | } 169 | } 170 | } 171 | } 172 | 
-------------------------------------------------------------------------------- /src/Discoverer/XPathExpressionDiscoverer.php: -------------------------------------------------------------------------------- 1 | 11 | * @copyright 2021 Matthijs van den Bos 12 | */ 13 | class XPathExpressionDiscoverer extends CrawlerDiscoverer 14 | { 15 | /** 16 | * Set the XPath selector to use. 17 | * 18 | * This selector should look for `a` elements so that the Discoverer can 19 | * extract their `href` attribute for further crawling. 20 | * 21 | * @param string $selector 22 | * @throws InvalidArgumentException 23 | */ 24 | public function __construct(string $selector) 25 | { 26 | if (!self::endsWith($selector, "/a")) { 27 | throw new InvalidArgumentException("Please end your selector with '/a': " . 28 | "selectors should look for `a` elements " . 29 | "so that the Discoverer can extract their `href` attribute for further crawling."); 30 | } 31 | parent::__construct($selector); 32 | } 33 | 34 | protected function getFilteredCrawler(Resource $resource): Crawler 35 | { 36 | return $resource->getCrawler()->filterXPath($this->selector); 37 | } 38 | 39 | private static function endsWith($haystack, $needle): bool 40 | { 41 | $length = strlen($needle); 42 | return substr($haystack, -$length) === $needle; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/Downloader/Downloader.php: -------------------------------------------------------------------------------- 1 | setPersistenceHandler($persistenceHandler ?: new MemoryPersistenceHandler()); 47 | $this->setRequestHandler($requestHandler ?: new GuzzleRequestHandler()); 48 | foreach ($postFetchFilters as $filter) { 49 | $this->addPostFetchFilter($filter); 50 | } 51 | $this->setDownloadLimit($downloadLimit); 52 | } 53 | 54 | /** 55 | * @param int $downloadLimit Maximum number of resources to download 56 | * @return $this 57 | */ 58 | public function setDownloadLimit(int $downloadLimit): 
DownloaderInterface 59 | { 60 | $this->downloadLimit = $downloadLimit; 61 | return $this; 62 | } 63 | 64 | /** 65 | * @return int Maximum number of resources to download 66 | */ 67 | public function getDownloadLimit(): int 68 | { 69 | return $this->downloadLimit; 70 | } 71 | 72 | /** 73 | * @param PostFetchFilterInterface $filter 74 | */ 75 | public function addPostFetchFilter(PostFetchFilterInterface $filter): void 76 | { 77 | $this->postFetchFilters[] = $filter; 78 | } 79 | 80 | /** 81 | * @param DiscoveredUri $uri 82 | * @return false|Resource 83 | */ 84 | public function download(DiscoveredUri $uri): Resource|false 85 | { 86 | $resource = $this->fetchResource($uri); 87 | 88 | if (!$resource) { 89 | return false; 90 | } 91 | 92 | if ($this->matchesPostfetchFilter($resource)) { 93 | return false; 94 | } 95 | 96 | $this->getPersistenceHandler()->persist($resource); 97 | 98 | return $resource; 99 | } 100 | 101 | public function isDownLoadLimitExceeded(): bool 102 | { 103 | return $this->getDownloadLimit() !== 0 && $this->getPersistenceHandler()->count() >= $this->getDownloadLimit(); 104 | } 105 | 106 | /** 107 | * A shortcut for EventDispatcher::dispatch() 108 | * 109 | * @param GenericEvent $event 110 | * @param string $eventName 111 | */ 112 | private function dispatch(GenericEvent $event, string $eventName): void 113 | { 114 | $this->getDispatcher()->dispatch($event, $eventName); 115 | } 116 | 117 | /** 118 | * @param DiscoveredUri $uri 119 | * @return Resource|false 120 | */ 121 | protected function fetchResource(DiscoveredUri $uri): Resource|false 122 | { 123 | $resource = false; 124 | 125 | $this->dispatch(new GenericEvent($this, array('uri' => $uri)), SpiderEvents::SPIDER_CRAWL_PRE_REQUEST); 126 | 127 | try { 128 | $resource = $this->getRequestHandler()->request($uri); 129 | } catch (Exception $e) { 130 | $this->dispatch( 131 | new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage())), 132 | SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST 133 | 
); 134 | } finally { 135 | $this->dispatch( 136 | new GenericEvent($this, array('uri' => $uri)), 137 | SpiderEvents::SPIDER_CRAWL_POST_REQUEST 138 | ); 139 | } 140 | 141 | return $resource; 142 | } 143 | 144 | /** 145 | * @param Resource $resource 146 | * @return bool 147 | */ 148 | private function matchesPostfetchFilter(Resource $resource): bool 149 | { 150 | foreach ($this->postFetchFilters as $filter) { 151 | if ($filter->match($resource)) { 152 | $this->dispatch( 153 | new GenericEvent($this, array('uri' => $resource->getUri())), 154 | SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH 155 | ); 156 | return true; 157 | } 158 | } 159 | return false; 160 | } 161 | 162 | /** 163 | * @param PersistenceHandlerInterface $persistenceHandler 164 | */ 165 | public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler): void 166 | { 167 | $this->persistenceHandler = $persistenceHandler; 168 | } 169 | 170 | /** 171 | * @return PersistenceHandlerInterface 172 | */ 173 | public function getPersistenceHandler(): PersistenceHandlerInterface 174 | { 175 | return $this->persistenceHandler; 176 | } 177 | 178 | /** 179 | * @param RequestHandlerInterface $requestHandler 180 | */ 181 | public function setRequestHandler(RequestHandlerInterface $requestHandler): void 182 | { 183 | $this->requestHandler = $requestHandler; 184 | } 185 | 186 | /** 187 | * @return RequestHandlerInterface 188 | */ 189 | public function getRequestHandler(): RequestHandlerInterface 190 | { 191 | return $this->requestHandler; 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/Downloader/DownloaderInterface.php: -------------------------------------------------------------------------------- 1 | dispatcher == null) { 19 | $this->dispatcher = new EventDispatcher(); 20 | } 21 | 22 | return $this->dispatcher; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/Event/SpiderEvents.php: 
-------------------------------------------------------------------------------- 1 | 7 | * @copyright 2021 Matthijs van den Bos 8 | */ 9 | final class SpiderEvents 10 | { 11 | /** 12 | * The spider.crawl.filter.prefetch event fires when the URI is not yet fetched and filtered 13 | * 14 | * Note: any listener for this event could stop propagation when its filter matches the event information 15 | * This means you can't assume your listener will be called 16 | * 17 | * @var string 18 | */ 19 | const SPIDER_CRAWL_FILTER_PREFETCH = 'spider.crawl.filter.prefetch'; 20 | 21 | /** 22 | * The spider.crawl.filter.postfetch event fires when the Resource is already fetched and filtered 23 | * 24 | * Note: any listener for this event could stop propagation when its filter matches the event information 25 | * This means you can't assume your listener will be called 26 | * 27 | * @var string 28 | */ 29 | const SPIDER_CRAWL_FILTER_POSTFETCH = 'spider.crawl.filter.postfetch'; 30 | 31 | /** 32 | * The spider.crawl.pre_crawl event fires before the Spider starts its crawl 33 | */ 34 | const SPIDER_CRAWL_PRE_CRAWL = 'spider.crawl.pre_crawl'; 35 | 36 | /** 37 | * The spider.crawl.pre_request event fires just before the [RequestHandlerInterface] executes 38 | * the request for a specific URI 39 | */ 40 | const SPIDER_CRAWL_PRE_REQUEST = 'spider.crawl.pre_request'; 41 | 42 | /** 43 | * The spider.crawl.post_request event fires immediately after the [RequestHandlerInterface] executes 44 | * the request for a specific URI 45 | */ 46 | const SPIDER_CRAWL_POST_REQUEST = 'spider.crawl.post_request'; 47 | 48 | /** 49 | * The spider.crawl.post.enqueue event fires after the URI was added to the queue 50 | * 51 | * The event contains an instance of the Resource being enqueued. 
52 | * An example use case for this event would be to change the Resources queue priority based on certain rules 53 | * 54 | * Note: any listener for this event could stop propagation when its filter matches the event information 55 | * This means you can't assume your listener will be called 56 | * 57 | * @var string 58 | */ 59 | const SPIDER_CRAWL_POST_ENQUEUE = 'spider.crawl.post.enqueue'; 60 | 61 | const SPIDER_CRAWL_ERROR_REQUEST = 'spider.error.request'; 62 | 63 | const SPIDER_CRAWL_RESOURCE_PERSISTED = 'spider.crawl.resource.persisted'; 64 | 65 | /** 66 | * The spider.crawl.user.stopped event fires when the spider was stopped by a user action 67 | * 68 | * @var string 69 | */ 70 | const SPIDER_CRAWL_USER_STOPPED = 'spider.crawl.user.stopped'; 71 | } 72 | -------------------------------------------------------------------------------- /src/EventListener/PolitenessPolicyListener.php: -------------------------------------------------------------------------------- 1 | 9 | * @copyright 2021 Matthijs van den Bos 10 | */ 11 | class PolitenessPolicyListener 12 | { 13 | private ?string $previousHostname = null; 14 | 15 | /** @var int the delay in microseconds between requests to the same domain */ 16 | private int $requestDelay; 17 | 18 | public int $totalDelay = 0; 19 | 20 | /** 21 | * @param int $requestDelay the delay in milliseconds between requests to the same domain 22 | */ 23 | public function __construct(int $requestDelay) 24 | { 25 | $this->requestDelay = $requestDelay * 1000; 26 | } 27 | 28 | /** 29 | * @param GenericEvent $event 30 | */ 31 | public function onCrawlPreRequest(GenericEvent $event): void 32 | { 33 | $currentHostname = $event->getArgument('uri')->getHost(); 34 | 35 | if ($currentHostname === $this->previousHostname) { 36 | $this->totalDelay = $this->totalDelay + $this->requestDelay; 37 | usleep($this->requestDelay); 38 | } 39 | $this->previousHostname = $currentHostname; 40 | } 41 | } 42 | 
-------------------------------------------------------------------------------- /src/Exception/MaxQueueSizeExceededException.php: -------------------------------------------------------------------------------- 1 | 8 | * @copyright 2021 Matthijs van den Bos 9 | */ 10 | class MaxQueueSizeExceededException extends Exception 11 | { 12 | } 13 | -------------------------------------------------------------------------------- /src/Filter/PostFetchFilterInterface.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | interface PostFetchFilterInterface 11 | { 12 | /** 13 | * @param Resource $resource 14 | * @return boolean 15 | */ 16 | public function match(Resource $resource): bool; 17 | } 18 | -------------------------------------------------------------------------------- /src/Filter/Postfetch/MimeTypeFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class MimeTypeFilter implements PostFetchFilterInterface 12 | { 13 | protected string $allowedMimeType = ''; 14 | 15 | public function __construct($allowedMimeType) 16 | { 17 | $this->allowedMimeType = $allowedMimeType; 18 | } 19 | 20 | public function match(Resource $resource): bool 21 | { 22 | $contentType = $resource->getResponse()->getHeaderLine('Content-Type'); 23 | return $contentType !== $this->allowedMimeType; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/Filter/PreFetchFilterInterface.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | interface PreFetchFilterInterface 11 | { 12 | /** 13 | * Returns true of the URI should be filtered out, i.e. NOT be crawled. 
14 | * @param UriInterface $uri 15 | * @return boolean 16 | */ 17 | public function match(UriInterface $uri): bool; 18 | } 19 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/AllowedHostsFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class AllowedHostsFilter implements PreFetchFilterInterface 12 | { 13 | /** @var array The hostnames to filter links with; starts empty so match() also works when no seeds are given */ 14 | private array $allowedHosts = []; 15 | 16 | private bool $allowSubDomains; 17 | 18 | /** 19 | * @param string[] $seeds Seed URLs whose host component becomes an allowed host 20 | * @param bool $allowSubDomains When true, only the last two host labels are compared 21 | */ 22 | public function __construct(array $seeds, bool $allowSubDomains = false) 23 | { 24 | $this->allowSubDomains = $allowSubDomains; 25 | 26 | foreach ($seeds as $seed) { 27 | $hostname = parse_url($seed, PHP_URL_HOST); 28 | 29 | if ($this->allowSubDomains) { 30 | // only use hostname.tld for comparison 31 | $this->allowedHosts[] = join('.', array_slice(explode('.', (string) $hostname), -2)); 32 | } else { 33 | // use entire *.hostname.tld for comparison 34 | $this->allowedHosts[] = $hostname; 35 | } 36 | } 37 | } 38 | /** Returns true (filter the URI out) when its host is not one of the allowed hosts; hosts are compared strictly. */ 39 | public function match(UriInterface $uri): bool 40 | { 41 | $currentHostname = $uri->getHost(); 42 | 43 | if ($this->allowSubDomains) { 44 | // only use hostname.tld for comparison 45 | $currentHostname = join('.', array_slice(explode('.', (string) $currentHostname), -2)); 46 | } 47 | 48 | return !in_array($currentHostname, $this->allowedHosts, true); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/AllowedPortsFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class AllowedPortsFilter implements PreFetchFilterInterface 12 | { 13 | /** 14 | * @var array 15 | */ 16 | private array $allowedPorts; 17 | 18 | /** 19 | * The whitelist of allowed ports 20 | * @param array $allowedPorts 21 | */ 22 | public function 
__construct(array $allowedPorts) 23 | { 24 | $this->allowedPorts = $allowedPorts; 25 | } 26 | 27 | public function match(UriInterface $uri): bool 28 | { 29 | return !in_array($uri->getPort(), $this->allowedPorts); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/AllowedSchemeFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class AllowedSchemeFilter implements PreFetchFilterInterface 12 | { 13 | private array $allowedSchemes; 14 | 15 | /** 16 | * @param string[] $schemes 17 | */ 18 | public function __construct(array $schemes) 19 | { 20 | $this->allowedSchemes = $schemes; 21 | } 22 | 23 | /** 24 | * @param UriInterface $uri 25 | * @return bool 26 | */ 27 | public function match(UriInterface $uri): bool 28 | { 29 | return !in_array($uri->getScheme(), $this->allowedSchemes); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/ExtractRobotsTxtException.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class RestrictToBaseUriFilter implements PreFetchFilterInterface 16 | { 17 | /** @var Uri */ 18 | private Uri $seed; 19 | 20 | /** 21 | * @param string $seed 22 | */ 23 | public function __construct(string $seed) 24 | { 25 | try { 26 | $this->seed = new Uri($seed); 27 | } catch (ErrorException | UriSyntaxException $e) { 28 | throw new InvalidArgumentException("Invalid seed: " . 
$e->getMessage()); 29 | } 30 | } 31 | 32 | public function match(UriInterface $uri): bool 33 | { 34 | /* 35 | * if the URI does not contain the seed, it is not allowed 36 | */ 37 | return false === stripos($uri->toString(), $this->seed->toString()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/RobotsTxtDisallowFilter.php: -------------------------------------------------------------------------------- 1 | 17 | */ 18 | class RobotsTxtDisallowFilter implements PreFetchFilterInterface 19 | { 20 | private RobotsTxt $parser; 21 | private ?string $userAgent; 22 | private Uri $seedUri; 23 | 24 | /** 25 | * @param string $seedUrl The robots.txt file will be loaded from this domain. 26 | * @param string|null $userAgent 27 | * @throws ErrorException 28 | * @throws UriSyntaxException 29 | */ 30 | public function __construct(string $seedUrl, ?string $userAgent = null) 31 | { 32 | $this->seedUri = new Uri($seedUrl); 33 | $this->seedUri->normalize(); 34 | $this->userAgent = $userAgent; 35 | $this->parser = new RobotsTxt(self::fetchRobotsTxt(self::extractRobotsTxtUri($seedUrl))); 36 | } 37 | 38 | /** 39 | * @param string $robotsUri Location of the robots.txt file to read 40 | * @return string The file contents; a false result from file_get_contents() is reported as FetchRobotsTxtException 41 | */ 42 | private static function fetchRobotsTxt(string $robotsUri): string 43 | { 44 | try { 45 | $robotsTxt = file_get_contents($robotsUri); 46 | } catch (Exception $e) { 47 | throw new FetchRobotsTxtException("Could not fetch $robotsUri: " . $e->getMessage()); 48 | } 49 | 50 | return $robotsTxt !== false ? $robotsTxt : throw new FetchRobotsTxtException("Could not fetch $robotsUri"); 51 | } 52 | 53 | /** 54 | * Clean up the URL and strip any parameters and fragments 55 | * 56 | * @param string $seedUrl 57 | * @return string 58 | * 59 | * @throws ErrorException 60 | * @throws UriSyntaxException 61 | */ 62 | private static function extractRobotsTxtUri(string $seedUrl): string 63 | { 64 | $uri = new Uri($seedUrl); 65 | if (in_array($uri->getScheme(), FileUri::$allowedSchemes)) { 66 | return new FileUri($seedUrl . 
'/robots.txt'); 67 | } elseif (in_array($uri->getScheme(), Http::$allowedSchemes)) { 68 | return $uri->toBaseUri()->toString() . '/robots.txt'; 69 | } else { 70 | throw new ExtractRobotsTxtException( 71 | "Seed URL scheme must be one of " . 72 | implode(', ', array_merge(FileUri::$allowedSchemes, Http::$allowedSchemes)) 73 | ); 74 | } 75 | } 76 | 77 | public function match(UriInterface $uri): bool 78 | { 79 | // Make the uri relative to $this->seedUri, so it will match with the rules in the robots.txt 80 | $relativeUri = str_replace($this->seedUri->toString(), '', $uri->normalize()->toString()); 81 | return !$this->parser->allows($relativeUri, $this->userAgent); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/UriFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class UriFilter implements PreFetchFilterInterface 12 | { 13 | /** 14 | * @var array An array of regexes 15 | */ 16 | public array $regexes = array(); 17 | 18 | public function __construct(array $regexes = array()) 19 | { 20 | $this->regexes = $regexes; 21 | } 22 | 23 | public function match(UriInterface $uri): bool 24 | { 25 | foreach ($this->regexes as $regex) { 26 | if (preg_match($regex, $uri->toString())) { 27 | return true; 28 | } 29 | } 30 | return false; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/UriWithHashFragmentFilter.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class UriWithHashFragmentFilter implements PreFetchFilterInterface 12 | { 13 | public function match(UriInterface $uri): bool 14 | { 15 | return null !== $uri->getFragment(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/Filter/Prefetch/UriWithQueryStringFilter.php: 
// File: src/Filter/Prefetch/UriWithQueryStringFilter.php

/**
 * Pre-fetch filter that matches URIs carrying a query string.
 */
class UriWithQueryStringFilter implements PreFetchFilterInterface
{
    /**
     * @param UriInterface $uri
     * @return bool true when the URI's query is not null
     */
    public function match(UriInterface $uri): bool
    {
        $query = $uri->getQuery();

        return $query !== null;
    }
}

// File: src/FilterableInterface.php

/**
 * Contract for items that can be marked as filtered, together with a reason.
 */
interface FilterableInterface
{
    /**
     * @param bool $filtered
     * @param string $reason
     * @return void
     */
    public function setFiltered(bool $filtered = true, string $reason = ''): void;

    /**
     * @return bool whether the item matched a filter
     */
    public function isFiltered(): bool;

    /**
     * Get the reason the item was filtered
     *
     * @return string
     */
    public function getFilterReason(): string;

    /**
     * Get a unique identifier for the filterable item
     * Used for reporting
     *
     * @return string
     */
    public function getIdentifier(): string;
}

// File: src/PersistenceHandler/FilePersistenceHandler.php

/**
 * Base class for persistence handlers that store resources as files on disk.
 *
 * Results are written below "<path>/<spiderId>/" and read back through a
 * lazily created Symfony Finder iterator over the files in that directory.
 */
abstract class FilePersistenceHandler implements PersistenceHandlerInterface
{
    /**
     * @var string the path where all spider results should be persisted.
     *             The results will be grouped in a directory by spider ID.
     */
    protected string $path = '';

    protected string $spiderId = '';

    protected int $totalSizePersisted = 0;

    protected ?Iterator $iterator = null;

    protected ?Finder $finder = null;

    /** @var string The filename that will be appended for resources that end with a slash */
    protected string $defaultFilename = 'index.html';

    /**
     * @param string $path the path where all spider results should be persisted.
     *                     The results will be grouped in a directory by spider ID.
     */
    public function __construct(string $path)
    {
        $this->path = $path;
    }

    /**
     * Store the spider ID and make sure the result directory for it exists.
     *
     * @param string $spiderId
     * @return void
     * @throws \RuntimeException when the result directory cannot be created
     */
    public function setSpiderId(string $spiderId): void
    {
        $this->spiderId = $spiderId;

        // Create the path. The second is_dir() check tolerates another process
        // creating the directory between our check and our mkdir() call.
        $resultPath = $this->getResultPath();
        if (!is_dir($resultPath) && !mkdir($resultPath, 0700, true) && !is_dir($resultPath)) {
            throw new \RuntimeException(sprintf('Could not create result directory "%s"', $resultPath));
        }
    }

    /**
     * @return string "<path>/<spiderId>/", including the trailing directory separator
     */
    protected function getResultPath(): string
    {
        return $this->path . DIRECTORY_SEPARATOR . $this->spiderId . DIRECTORY_SEPARATOR;
    }

    /**
     * @return int The number of files the finder sees in the result directory
     */
    public function count(): int
    {
        return $this->getFinder()->count();
    }

    /**
     * Lazily create the Finder over all files in the result directory.
     *
     * @return Finder
     */
    protected function getFinder(): Finder
    {
        return $this->finder ??= Finder::create()->files()->in($this->getResultPath());
    }

    abstract public function persist(Resource $resource);

    /**
     * @return Resource
     */
    abstract public function current(): Resource;

    /**
     * @return void
     * @throws Exception
     */
    public function next(): void
    {
        $this->getIterator()->next();
    }

    /**
     * Lazily obtain the iterator of the Finder.
     *
     * @return Iterator
     * @throws Exception
     */
    protected function getIterator(): Iterator
    {
        return $this->iterator ??= $this->getFinder()->getIterator();
    }

    /**
     * @return int|float|string|bool|null
     * @throws Exception
     */
    #[ReturnTypeWillChange] // @phan-suppress-current-line PhanUndeclaredClassAttribute
    public function key(): float|bool|int|string|null
    {
        return $this->getIterator()->key();
    }

    /**
     * @return bool
     * @throws Exception
     */
    public function valid(): bool
    {
        return $this->getIterator()->valid();
    }

    /**
     * @return void
     * @throws Exception
     */
    public function rewind(): void
    {
        $this->getIterator()->rewind();
    }

    /**
     * @param Resource $resource
     * @return string The url-encoded basename of the resource's completed URI path
     */
    protected function getFileSystemFilename(Resource $resource): string
    {
        $fullPath = $this->completePath($resource->getUri()->getPath());

        return urlencode(basename($fullPath));
    }

    /**
     * @param string|null $path
     * @return string The path that was provided with a default filename appended if it is
     *                a path ending in a /. This is because we don't want to persist
     *                the directories as files. This is similar to wget behaviour.
     */
    protected function completePath(?string $path): string
    {
        if ($path === null || $path === '') {
            return '/' . $this->defaultFilename;
        }

        if (str_ends_with($path, '/')) {
            return $path . $this->defaultFilename;
        }

        return $path;
    }

    /**
     * @param Resource $resource
     * @return string The URI's host followed by the directory part of its completed path
     */
    protected function getFileSystemPath(Resource $resource): string
    {
        $hostname = $resource->getUri()->getHost();
        $fullPath = $this->completePath($resource->getUri()->getPath());

        return $hostname . dirname($fullPath);
    }
}

// File: src/PersistenceHandler/FileSerializedResourcePersistenceHandler.php

/**
 * File persistence handler that stores every resource as a PHP-serialized file.
 */
class FileSerializedResourcePersistenceHandler extends FilePersistenceHandler implements PersistenceHandlerInterface
{
    /**
     * Serialize the resource into a file below the result path.
     *
     * @param Resource $resource
     * @return void
     * @throws \RuntimeException when the target directory cannot be created or the write fails
     */
    public function persist(Resource $resource)
    {
        $directory = $this->getResultPath() . $this->getFileSystemPath($resource);

        // NOTE(review): 0777 is wider than the 0700 used for the spider's result directory
        // in FilePersistenceHandler::setSpiderId() - confirm whether this is intentional.
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            throw new \RuntimeException(sprintf('Could not create directory "%s"', $directory));
        }

        $file = new SplFileObject($directory . DIRECTORY_SEPARATOR . $this->getFileSystemFilename($resource), 'w');

        $bytesWritten = $file->fwrite(serialize($resource));
        if ($bytesWritten === false) {
            // Do not let a failed write silently count as zero bytes persisted.
            throw new \RuntimeException(sprintf('Could not write resource to "%s"', $file->getPathname()));
        }

        $this->totalSizePersisted += $bytesWritten;
    }

    /**
     * @return Resource
     * @throws Exception
     * @throws \UnexpectedValueException when the current file does not contain a serialized Resource
     */
    public function current(): Resource
    {
        // NOTE(review): unserialize() instantiates whatever classes the file names. This is only
        // safe while the result directory is written exclusively by this handler.
        $resource = unserialize($this->getIterator()->current()->getContents());

        if (!$resource instanceof Resource) {
            throw new \UnexpectedValueException('Persisted file does not contain a serialized Resource');
        }

        return $resource;
    }
}

// File: src/PersistenceHandler/MemoryPersistenceHandler.php

/**
 * Persistence handler that keeps all resources in an in-memory array.
 */
class MemoryPersistenceHandler implements PersistenceHandlerInterface
{
    /**
     * @var Resource[]
     */
    private array $resources = [];

    /**
     * @param string $spiderId
     * @SuppressWarnings(PHPMD.UnusedFormalParameter)
     */
    public function setSpiderId(string $spiderId)
    {
        // memory handler ignores this. Only interesting for true persistence as some kind of key or prefix
    }

    /**
     * @return int The number of persisted resources
     */
    public function count(): int
    {
        return count($this->resources);
    }

    /**
     * @param Resource $resource
     * @return void
     */
    public function persist(Resource $resource): void
    {
        $this->resources[] = $resource;
    }

    /**
     * @return Resource The resource at the current position
     * @throws \OutOfBoundsException when the internal pointer is past the last resource
     */
    public function current(): Resource
    {
        $resource = current($this->resources);

        // current() yields false past the end, which can never satisfy the declared
        // return type. Fail with an explicit exception instead of a TypeError.
        if (!$resource instanceof Resource) {
            throw new \OutOfBoundsException('No resource at the current position');
        }

        return $resource;
    }

    /**
     * @return void Any returned value is ignored.
     */
    public function next(): void
    {
        next($this->resources);
    }

    /**
     * @return int|null The index of the current resource, or null past the last resource
     */
    public function key(): ?int
    {
        return key($this->resources);
    }

    /**
     * @return bool
     */
    public function valid(): bool
    {
        // key() is null exactly when the internal pointer is beyond the end of the array.
        return key($this->resources) !== null;
    }

    /**
     * @return void
     */
    public function rewind(): void
    {
        reset($this->resources);
    }
}

// File: src/PersistenceHandler/PersistenceHandlerInterface.php

/**
 * Contract for stores that persist resources and can be iterated and counted afterwards.
 */
interface PersistenceHandlerInterface extends Iterator, Countable
{
    /**
     * @param string $spiderId
     *
     * @return void
     */
    public function setSpiderId(string $spiderId);

    /**
     * @param Resource $resource
     * @return void
     */
    public function persist(Resource $resource);
}

// File: src/QueueManager/InMemoryQueueManager.php

/**
 * Queue manager that keeps the traversal queue in a plain PHP array.
 */
class InMemoryQueueManager implements QueueManagerInterface
{
    use DispatcherTrait;

    /** @var int The maximum size of the process queue for this spider. 0 means infinite */
    public int $maxQueueSize = 0;

    /** @var int the amount of times a Resource was enqueued */
    private int $currentQueueSize = 0;

    /** @var DiscoveredUri[] the list of URIs to process */
    private array $traversalQueue = [];

    /** @var int The traversal algorithm to use. Choose from the class constants */
    private int $traversalAlgorithm = self::ALGORITHM_DEPTH_FIRST;

    /**
     * InMemoryQueueManager constructor.
     * @param int $traversalAlgorithm
     */
    public function __construct(int $traversalAlgorithm = self::ALGORITHM_DEPTH_FIRST)
    {
        $this->setTraversalAlgorithm($traversalAlgorithm);
    }

    /**
     * @return int
     */
    public function getTraversalAlgorithm(): int
    {
        return $this->traversalAlgorithm;
    }

    /**
     * @param int $traversalAlgorithm Choose from the class constants
     * @throws InvalidArgumentException when the value is not one of the two algorithm constants
     */
    public function setTraversalAlgorithm(int $traversalAlgorithm): void
    {
        $supported = [
            QueueManagerInterface::ALGORITHM_DEPTH_FIRST,
            QueueManagerInterface::ALGORITHM_BREADTH_FIRST,
        ];

        if (!in_array($traversalAlgorithm, $supported, true)) {
            throw new InvalidArgumentException("Invalid traversal algorithm. See QueueManagerInterface for options.");
        }

        $this->traversalAlgorithm = $traversalAlgorithm;
    }

    /**
     * Append a URI to the queue and dispatch the post-enqueue event.
     *
     * @param DiscoveredUri $uri
     * @throws MaxQueueSizeExceededException
     */
    public function addUri(DiscoveredUri $uri): void
    {
        // The counter only ever grows, so the limit applies to the total number of
        // enqueued URIs, not to the number currently waiting in the queue.
        $limitReached = $this->maxQueueSize !== 0 && $this->currentQueueSize >= $this->maxQueueSize;
        if ($limitReached) {
            throw new MaxQueueSizeExceededException('Maximum Queue Size of ' . $this->maxQueueSize . ' reached');
        }

        $this->traversalQueue[] = $uri;
        $this->currentQueueSize++;

        $this->getDispatcher()->dispatch(
            new GenericEvent($this, ['uri' => $uri]),
            SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE
        );
    }

    /**
     * @return DiscoveredUri|null The next URI to process, or null when the queue is empty
     */
    public function next(): ?DiscoveredUri
    {
        // Depth first takes the most recently added URI, breadth first the oldest one.
        return match ($this->traversalAlgorithm) {
            static::ALGORITHM_DEPTH_FIRST => array_pop($this->traversalQueue),
            static::ALGORITHM_BREADTH_FIRST => array_shift($this->traversalQueue),
            default => null,
        };
    }
}

// File: src/QueueManager/QueueManagerInterface.php

/**
 * Contract for the queue that decides which discovered URI is processed next.
 */
interface QueueManagerInterface
{
    public const ALGORITHM_DEPTH_FIRST = 0;
    public const ALGORITHM_BREADTH_FIRST = 1;

    /**
     * @param int $traversalAlgorithm Choose from the class constants
     * TODO: This should be extracted to a Strategy pattern
     * @return void
     */
    public function setTraversalAlgorithm(int $traversalAlgorithm);

    /**
     * @return int
     */
    public function getTraversalAlgorithm(): int;

    /**
     * @param DiscoveredUri $uri
     */
    public function addUri(DiscoveredUri $uri);

    /**
     * @return null|DiscoveredUri
     */
    public function next(): ?DiscoveredUri;
}

// File: src/RequestHandler/GuzzleRequestHandler.php

/**
 * Request handler that fetches URIs with a Guzzle client.
 */
class GuzzleRequestHandler implements RequestHandlerInterface
{
    /** @var Client */
    private Client $client;

    /**
     * GuzzleRequestHandler constructor.
     * @param Client|null $client The client to use; a default Client is created when omitted
     */
    public function __construct(?Client $client = null)
    {
        $this->setClient($client ?? new Client());
    }

    /**
     * @param Client $client
     * @return RequestHandlerInterface This handler, to allow chaining
     */
    public function setClient(Client $client): RequestHandlerInterface
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return Client
     */
    public function getClient(): Client
    {
        return $this->client;
    }

    /**
     * Perform a GET request for the URI and wrap the response in a Resource.
     *
     * @param DiscoveredUri $uri
     * @return Resource
     * @throws GuzzleException
     * @suppress PhanTypeInvalidThrowsIsInterface
     */
    public function request(DiscoveredUri $uri): Resource
    {
        $target = $uri->toString();

        return new Resource($uri, $this->getClient()->get($target));
    }
}

// File: src/Resource.php

/**
 * A fetched resource: the URI it was requested from plus the response that was received.
 */
class Resource
{
    protected DiscoveredUri $uri;
    protected ResponseInterface $response;
    protected ?Crawler $crawler = null;
    protected string $body;

    /**
     * @param DiscoveredUri $uri
     * @param ResponseInterface $response
     */
    public function __construct(DiscoveredUri $uri, ResponseInterface $response)
    {
        $this->uri = $uri;
        $this->response = $response;
    }

    /**
     * Lazy loads a Crawler object based on the ResponseInterface;
     * @return Crawler
     */
    public function getCrawler(): Crawler
    {
        if ($this->crawler !== null) {
            return $this->crawler;
        }

        $response = $this->getResponse();

        $this->crawler = new Crawler('', $this->getUri()->toString());
        $this->crawler->addContent(
            $response->getBody()->__toString(),
            $response->getHeaderLine('Content-Type')
        );

        return $this->crawler;
    }

    /**
     * @return DiscoveredUri
     */
    public function getUri(): DiscoveredUri
    {
        return $this->uri;
    }

    /**
     * @return ResponseInterface
     */
    public function getResponse(): ResponseInterface
    {
        return $this->response;
    }

    /**
     * @return string[] The names of the properties to serialize
     */
    public function __sleep(): array
    {
        /*
         * Because the Crawler isn't serialized correctly, we exclude it from serialization
         * It will be available again after wakeup through lazy loading with getCrawler()
         */

        // we store the response manually, because otherwise it will not get serialized.
        $this->body = Message::toString($this->response);

        return ['uri', 'body'];
    }

    /**
     * We need to set the body again after deserialization because it was a stream that didn't get serialized
     */
    public function __wakeup()
    {
        $this->response = Message::parseResponse($this->body);
    }
}
*/ 52 | public function equals(UriInterface $that, $normalized = false): bool 53 | { 54 | return $this->decorated->equals($that, $normalized); 55 | } 56 | 57 | /** 58 | * @return UriInterface 59 | */ 60 | public function normalize(): UriInterface 61 | { 62 | // This normalizes the decorated Uri in place. We don't want to return the decorated Uri, but $this. 63 | $this->decorated->normalize(); 64 | return $this; 65 | } 66 | 67 | /** 68 | * Alias of Uri::toString() 69 | * 70 | * @return string 71 | */ 72 | public function __toString(): string 73 | { 74 | return $this->decorated->__toString(); 75 | } 76 | 77 | /** 78 | * @return string|null 79 | */ 80 | public function getHost(): ?string 81 | { 82 | return $this->decorated->getHost(); 83 | } 84 | 85 | /** 86 | * @return string|null 87 | */ 88 | public function getPassword(): ?string 89 | { 90 | return $this->decorated->getPassword(); 91 | } 92 | 93 | /** 94 | * @return string 95 | */ 96 | public function getPath(): string 97 | { 98 | return $this->decorated->getPath() ?: ''; 99 | } 100 | 101 | /** 102 | * @return int|null 103 | */ 104 | public function getPort(): ?int 105 | { 106 | return $this->decorated->getPort(); 107 | } 108 | 109 | /** 110 | * @return string|null 111 | */ 112 | public function getQuery(): ?string 113 | { 114 | return $this->decorated->getQuery(); 115 | } 116 | 117 | /** 118 | * @return string|null 119 | */ 120 | public function getScheme(): ?string 121 | { 122 | return $this->decorated->getScheme(); 123 | } 124 | 125 | /** 126 | * @return string|null 127 | */ 128 | public function getUsername(): ?string 129 | { 130 | return $this->decorated->getUsername(); 131 | } 132 | 133 | /** 134 | * @return string|null 135 | */ 136 | public function getFragment(): ?string 137 | { 138 | return $this->decorated->getFragment(); 139 | } 140 | 141 | // @codeCoverageIgnoreEnd 142 | } 143 | -------------------------------------------------------------------------------- 
// File: tests/Discoverer/CssSelectorDiscovererTest.php

/**
 * Runs the shared discoverer scenario against the CSS selector discoverer.
 */
class CssSelectorDiscovererTest extends DiscovererTestCase
{
    /**
     * @covers \VDB\Spider\Discoverer\CssSelectorDiscoverer
     */
    public function testDiscover()
    {
        // Select every anchor element of the fixture document.
        $this->executeDiscoverer(new CssSelectorDiscoverer("a"));
    }
}
// File: tests/Discoverer/DiscovererSetTest.php

/**
 * Tests for DiscovererSet: depth limiting, filters, invalid URIs and de-duplication.
 */
class DiscovererSetTest extends DiscovererTestCase
{
    private DiscovererSet $discovererSet;

    public function setUp(): void
    {
        parent::setUp();
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     */
    public function testMaxDepth()
    {
        $this->discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);

        // With a maximum depth of 1 both links of the fixture are discovered.
        $this->discovererSet->maxDepth = 1;
        $urisAtDepth1 = $this->discovererSet->discover($this->spiderResource);
        self::assertCount(2, $urisAtDepth1);

        // With a maximum depth of 0 nothing is discovered.
        $this->discovererSet->maxDepth = 0;
        $urisAtDepth0 = $this->discovererSet->discover($this->spiderResource);
        self::assertCount(0, $urisAtDepth0);
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     */
    public function testConstructor()
    {
        $this->discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);

        $found = $this->discovererSet->discover($this->spiderResource);

        self::assertCount(2, $found);
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     */
    public function testSetDiscoverer()
    {
        $this->discovererSet = new DiscovererSet();
        $this->discovererSet->set(new XPathExpressionDiscoverer("//a"));

        $found = $this->discovererSet->discover($this->spiderResource);

        self::assertCount(2, $found);
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     */
    public function testUriFilter()
    {
        $this->discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $this->discovererSet->addFilter(new UriFilter(['/^.*contact.*$/']));

        $found = $this->discovererSet->discover($this->spiderResource);

        self::assertCount(1, $found);
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     * @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter
     *
     * @throws UriSyntaxException
     * @throws ErrorException
     * @throws Exception
     */
    public function testRobotsTxtDisallowFilter()
    {
        // The robots.txt next to this test file disallows /foo for every user agent.
        $baseUri = "file://" . __DIR__;
        $allowedLink = $baseUri . '/internal';
        $disallowedLink = $baseUri . '/foo';

        $resource = self::createResourceWithLinks(
            new DiscoveredUri($baseUri, 0),
            [$allowedLink, $disallowedLink]
        );

        $set = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $set->addFilter(new RobotsTxtDisallowFilter($baseUri));

        $found = $set->discover($resource);
        self::assertCount(1, $found);

        // Compare through FileUri on both sides so both strings take the same form.
        $foundAsStrings = array_map(
            fn($uri): string => (new FileUri($uri->toString()))->toString(),
            $found
        );
        $disallowedAsString = (new FileUri($disallowedLink))->toString();
        self::assertNotContains($disallowedAsString, $foundAsStrings);
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     * @covers \VDB\Spider\Filter\Prefetch\AllowedPortsFilter
     */
    public function testPortFilter()
    {
        $this->discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $this->discovererSet->addFilter(new AllowedPortsFilter([8080]));

        $found = $this->discovererSet->discover($this->spiderResource);

        self::assertCount(1, $found);
        self::assertEquals($this->uriInBody2, $found[0]->toString());
    }

    /**
     * @covers \VDB\Spider\Discoverer\DiscovererSet
     * @covers \VDB\Spider\Filter\Prefetch\AllowedHostsFilter
     */
    public function testHostFilter()
    {
        $this->discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $this->discovererSet->addFilter(new AllowedHostsFilter(["http://php-spider.org"]));

        $found = $this->discovererSet->discover($this->spiderResource);

        self::assertCount(2, $found);
    }

    /**
     * @throws UriSyntaxException
     * @throws ErrorException
     * @throws Exception
     */
    public function testInvalidUriSkipped()
    {
        $malformedLink = 'http://php-spider:org:8080:internal/';
        $validLink = 'http://php-spider.org:8080/internal/';

        // Setup DOM
        $resource = self::createResourceWithLinks(
            new DiscoveredUri('http://php-spider.org/', 0),
            [$malformedLink, $validLink]
        );

        $set = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $found = $set->discover($resource);

        self::assertCount(1, $found);
    }

    /**
     * @throws UriSyntaxException
     * @throws ErrorException
     * @throws Exception
     */
    public function testDuplicatesRemoved()
    {
        $link = 'http://php-spider.org:8080/internal/';

        // Setup DOM: the same link appears twice in the document.
        $resource = self::createResourceWithLinks(
            new DiscoveredUri('http://php-spider.org/', 0),
            [$link, $link]
        );

        $set = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $found = $set->discover($resource);

        self::assertCount(1, $found);
    }

    /**
     * @throws UriSyntaxException
     * @throws ErrorException
     * @throws Exception
     */
    public function testAlreadySeenSkipped()
    {
        $selfLink = 'http://php-spider.org:8080/internal/';

        // Setup DOM: the document only links to its own URI.
        $resource = self::createResourceWithLinks(
            new DiscoveredUri($selfLink, 0),
            [$selfLink]
        );

        $set = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
        $found = $set->discover($resource);

        self::assertCount(0, $found);
    }
}

// File: tests/Discoverer/DiscovererTest.php

/**
 * Tests for the abstract Discoverer base class.
 */
class DiscovererTest extends TestCase
{
    /**
     * @covers \VDB\Spider\Discoverer\Discoverer
     */
    public function testGetName()
    {
        $builder = $this->getMockBuilder('VDB\\Spider\\Discoverer\\Discoverer');
        $builder->setMockClassName('MockDiscoverer');
        $discoverer = $builder->getMockForAbstractClass();

        self::assertEquals('MockDiscoverer', $discoverer->getName());
    }
}
// File: tests/Discoverer/DiscovererTestCase.php

/**
 * Shared fixture for discoverer tests: a resource at http://php-spider.org/
 * whose HTML body contains two anchors.
 */
abstract class DiscovererTestCase extends TestCase
{
    protected DomDocument $domDocument;
    protected DomElement $domAnchor;
    protected DomElement $domAnchor2;
    protected Resource $spiderResource;
    protected DiscoveredUri $uri;
    protected string $uriInBody1;
    protected string $uriInBody2;
    protected string $resourceContent;

    /**
     * @throws UriSyntaxException
     * @throws ErrorException
     * @throws Exception
     */
    protected function setUp(): void
    {
        $this->uri = new DiscoveredUri('http://php-spider.org/', 0);

        $this->uriInBody1 = 'http://php-spider.org/contact/';
        $this->uriInBody2 = 'http://php-spider.org:8080/internal/';

        $links = [$this->uriInBody1, $this->uriInBody2];
        $this->spiderResource = self::createResourceWithLinks($this->uri, $links);
    }

    /**
     * Run the discoverer on the fixture and check the first URI it returns.
     *
     * @param DiscovererInterface $discoverer
     */
    protected function executeDiscoverer(DiscovererInterface $discoverer)
    {
        $discovered = $discoverer->discover($this->spiderResource);
        $first = $discovered[0];

        $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $first);
        $this->assertEquals($this->uriInBody1, $first->toString());
    }

    /**
     * Build an HTML document containing one anchor per given URI.
     *
     * @param string[] $uris
     * @return string The serialized HTML document
     * @throws Exception when the document cannot be serialized
     */
    public static function createDocumentWithLinks(array $uris): string
    {
        $document = new DOMDocument('1', 'UTF-8');
        $root = $document->createElement('html');
        $document->appendChild($root);

        foreach ($uris as $index => $link) {
            // Anchor text is "fake<index>"; only the href matters to the discoverers.
            $anchor = $document->createElement('a', 'fake' . $index);
            $anchor->setAttribute('href', $link);
            $root->appendChild($anchor);
        }

        $markup = $document->saveHTML();
        if ($markup) {
            return $markup;
        }

        throw new Exception("Could not create DOM document");
    }

    /**
     * Build a Resource with a 200 response whose body links to the given URIs.
     *
     * @param DiscoveredUri $resourceUri
     * @param string[] $uris
     * @return Resource
     * @throws Exception
     */
    public static function createResourceWithLinks(
        DiscoveredUri $resourceUri,
        array $uris
    ): Resource {
        $response = new Response(200, [], self::createDocumentWithLinks($uris));

        return new Resource($resourceUri, $response);
    }
}
// File: tests/Discoverer/XpathExpressionDiscovererTest.php

/**
 * Runs the shared discoverer scenario against the XPath expression discoverer.
 */
class XpathExpressionDiscovererTest extends DiscovererTestCase
{
    /**
     * @covers \VDB\Spider\Discoverer\XPathExpressionDiscoverer
     */
    public function testDiscover()
    {
        // Select every anchor element of the fixture document.
        $this->executeDiscoverer(new XPathExpressionDiscoverer("//a"));
    }

    public function testDiscoverNoA()
    {
        // The expectation must be registered before the code that throws runs.
        $this->expectException(InvalidArgumentException::class);

        $this->executeDiscoverer(new XPathExpressionDiscoverer("//b"));
    }
}
10 | */ 11 | 12 | namespace VDB\Spider\Tests\Downloader; 13 | 14 | use ErrorException; 15 | use Exception; 16 | use GuzzleHttp\Psr7\Response; 17 | use VDB\Spider\Downloader\Downloader; 18 | use VDB\Spider\Resource; 19 | use VDB\Spider\Tests\TestCase; 20 | use VDB\Spider\Uri\DiscoveredUri; 21 | use VDB\Uri\Exception\UriSyntaxException; 22 | use VDB\Uri\Uri; 23 | 24 | /** 25 | * Tests Downloader against a mocked request handler that serves a fixture-backed Resource. 26 | */ 27 | class DownloaderTest extends TestCase 28 | { 29 | private Downloader $downloader; 30 | protected Resource $resource; 31 | protected string $html; 32 | 33 | /** 34 | * @throws UriSyntaxException 35 | * @throws ErrorException 36 | */ 37 | public function setUp(): void 38 | { 39 | $this->html = file_get_contents(__DIR__ . '/../Fixtures/DownloaderTestHTMLResource.html'); 40 | $this->resource = new Resource( 41 | new DiscoveredUri(new Uri('/domains/special', 'http://example.org'), 0), 42 | new Response(200, [], $this->html) 43 | ); 44 | 45 | $this->downloader = new Downloader(); 46 | 47 | $requestHandler = $this->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')->getMock(); 48 | $requestHandler 49 | ->expects($this->any()) 50 | ->method('request') 51 | ->willReturn($this->resource); 52 | 53 | $this->downloader->setRequestHandler($requestHandler); 54 | } 55 | 56 | /** 57 | * @covers \VDB\Spider\Downloader\Downloader 58 | */ 59 | public function testDefaultRequestHandler() 60 | { 61 | $this->assertInstanceOf( 62 | '\VDB\Spider\RequestHandler\GuzzleRequestHandler', 63 | (new Downloader())->getRequestHandler() 64 | ); 65 | } 66 | 67 | /** 68 | * @covers \VDB\Spider\Downloader\Downloader 69 | * 70 | * @throws UriSyntaxException 71 | * @throws ErrorException 72 | */ 73 | public function testDownload() 74 | { 75 | $resource = $this->downloader->download(new DiscoveredUri(new Uri('http://foobar.org'), 0)); 76 | $this->assertInstanceOf('VDB\\Spider\\Resource', $resource); 77 | } 78 | 79 | /** 80 | * @covers \VDB\Spider\Downloader\Downloader 81 | * 82 | * @throws
UriSyntaxException 83 | * @throws ErrorException 84 | */ 85 | public function testDownloadFailed() 86 | { 87 | $requestHandler = $this->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')->getMock(); 88 | $requestHandler 89 | ->expects($this->any()) 90 | ->method('request') 91 | ->willThrowException(new Exception()); 92 | $this->downloader->setRequestHandler($requestHandler); 93 | 94 | $resource = $this->downloader->download(new DiscoveredUri(new Uri('http://foobar.org'), 0)); 95 | 96 | $this->assertFalse($resource); 97 | } 98 | 99 | /** 100 | * @covers \VDB\Spider\Downloader\Downloader 101 | * 102 | * @throws UriSyntaxException 103 | * @throws ErrorException 104 | */ 105 | public function testFilterNotMatches() 106 | { 107 | $filterNeverMatch = $this->getMockBuilder('VDB\Spider\Filter\PostFetchFilterInterface')->getMock(); 108 | $filterNeverMatch 109 | ->expects($this->any()) 110 | ->method('match') 111 | ->willReturn(false); 112 | $this->downloader->addPostFetchFilter($filterNeverMatch); 113 | 114 | $resource = $this->downloader->download(new DiscoveredUri(new Uri('http://foobar.org'), 0)); 115 | 116 | $this->assertInstanceOf('VDB\\Spider\\Resource', $resource); 117 | } 118 | 119 | /** 120 | * @covers \VDB\Spider\Downloader\Downloader 121 | * 122 | * @throws UriSyntaxException 123 | * @throws ErrorException 124 | */ 125 | public function testDownloadLimit() 126 | { 127 | $this->downloader->setDownloadLimit(1); 128 | $this->downloader->download(new DiscoveredUri('http://foobar.org', 0)); 129 | $this->assertTrue($this->downloader->isDownLoadLimitExceeded()); 130 | } 131 | 132 | /** 133 | * @covers \VDB\Spider\Downloader\Downloader 134 | */ 135 | public function testFilterMatches() 136 | { 137 | $filterAlwaysMatch = $this->getMockBuilder('VDB\Spider\Filter\PostFetchFilterInterface')->getMock(); 138 | $filterAlwaysMatch 139 | ->expects($this->any()) 140 | ->method('match') 141 | ->willReturn(true); 142 | $downloader
= new Downloader(null, null, [$filterAlwaysMatch]); 143 | 144 | $resource = $downloader->download(new DiscoveredUri(new Uri('http://foobar.org'), 0)); 145 | 146 | $this->assertFalse($resource); 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /tests/EventListener/PolitenessPolicyListenerTest.php: -------------------------------------------------------------------------------- 1 | 6 | * 7 | * For the full copyright and license information, please view the LICENSE 8 | * file that was distributed with this source code. 9 | */ 10 | 11 | namespace VDB\Spider\Tests\EventListener; 12 | 13 | use Symfony\Component\EventDispatcher\GenericEvent; 14 | use VDB\Spider\Event\SpiderEvents; 15 | use VDB\Spider\EventListener\PolitenessPolicyListener; 16 | use VDB\Spider\Tests\TestCase; 17 | use VDB\Uri\Uri; 18 | 19 | /** 20 | * 21 | */ 22 | class PolitenessPolicyListenerTest extends TestCase 23 | { 24 | /** 25 | * @covers \VDB\Spider\EventListener\PolitenessPolicyListener 26 | */ 27 | public function testOnCrawlPreRequestSameDomain() 28 | { 29 | $politenessPolicyListener = new PolitenessPolicyListener(500); 30 | 31 | $uri = new Uri('http://php-spider.org/', 'http://php-spider.org/'); 32 | $event = new GenericEvent(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array('uri' => $uri)); 33 | 34 | $politenessPolicyListener->onCrawlPreRequest($event); 35 | 36 | $start = microtime(true); 37 | $politenessPolicyListener->onCrawlPreRequest($event); 38 | $interval = microtime(true) - $start; 39 | 40 | $this->assertGreaterThanOrEqual(0.5, $interval, 'Actual delay'); 41 | } 42 | 43 | /** 44 | * @covers \VDB\Spider\EventListener\PolitenessPolicyListener 45 | */ 46 | public function testOnCrawlPreRequestDifferentDomain() 47 | { 48 | $politenessPolicyListener = new PolitenessPolicyListener(500); 49 | 50 | $uri = new Uri('http://php-spider.org/', 'http://php-spider.org/'); 51 | $event = new GenericEvent(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, 
array('uri' => $uri)); 52 | 53 | $uri2 = new Uri('http://example.com/', 'http://example.com/'); 54 | $event2 = new GenericEvent(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array('uri' => $uri2)); 55 | 56 | $politenessPolicyListener->onCrawlPreRequest($event); 57 | 58 | $start = microtime(true); 59 | $politenessPolicyListener->onCrawlPreRequest($event2); 60 | $interval = microtime(true) - $start; 61 | 62 | $this->assertLessThan(0.5, $interval, 'Actual delay'); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/Filter/Postfetch/MimeTypeFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Postfetch; 13 | 14 | use ErrorException; 15 | use GuzzleHttp\Psr7\Response; 16 | use VDB\Spider\Filter\Postfetch\MimeTypeFilter; 17 | use VDB\Spider\Resource; 18 | use VDB\Spider\Tests\TestCase; 19 | use VDB\Spider\Uri\DiscoveredUri; 20 | use VDB\Uri\Exception\UriSyntaxException; 21 | use VDB\Uri\Uri; 22 | 23 | class MimeTypeFilterTest extends TestCase 24 | { 25 | protected Resource $spiderResource; 26 | protected DiscoveredUri $uri; 27 | 28 | /** 29 | * @throws UriSyntaxException 30 | * @throws ErrorException 31 | */ 32 | protected function setUp(): void 33 | { 34 | $this->uri = new DiscoveredUri(new Uri('http://foobar.com/image.jpg'), 0); 35 | 36 | $this->spiderResource = new Resource( 37 | $this->uri, 38 | new Response(200, ['Content-Type' => 'image/jpeg'], '') 39 | ); 40 | } 41 | 42 | /** 43 | * @covers \VDB\Spider\Filter\Postfetch\MimeTypeFilter 44 | */ 45 | public function testMimeTypeFilter() 46 | { 47 | $filter = new MimeTypeFilter('text/html'); 48 | $this->assertTrue($filter->match($this->spiderResource)); 49 | 50 | $filter = new MimeTypeFilter('image/jpeg'); 51 | 
$this->assertFalse($filter->match($this->spiderResource)); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/AllowedHostsFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use VDB\Spider\Filter\Prefetch\AllowedHostsFilter; 15 | use VDB\Spider\Tests\TestCase; 16 | use VDB\Uri\Uri; 17 | 18 | /** 19 | * 20 | */ 21 | class AllowedHostsFilterTest extends TestCase 22 | { 23 | /** 24 | * @covers \VDB\Spider\Filter\Prefetch\AllowedHostsFilter 25 | */ 26 | public function testMatchFullHostname() 27 | { 28 | $filter = new AllowedHostsFilter(array('http://blog.php-spider.org')); 29 | 30 | $uri1 = new Uri('http://example.org'); 31 | $uri2 = new Uri('http://php-spider.org'); 32 | $uri3 = new Uri('http://blog.php-spider.org'); 33 | 34 | $this->assertTrue($filter->match($uri1)); 35 | $this->assertTrue($filter->match($uri2)); 36 | $this->assertFalse($filter->match($uri3)); 37 | } 38 | 39 | /** 40 | * @covers \VDB\Spider\Filter\Prefetch\AllowedHostsFilter 41 | */ 42 | public function testMatchSubdomain() 43 | { 44 | $filter = new AllowedHostsFilter(array('http://blog.php-spider.org'), true); 45 | 46 | $uri1 = new Uri('http://example.com'); 47 | $uri2 = new Uri('http://blog.php-spider.org'); 48 | $uri3 = new Uri('http://test.php-spider.org'); 49 | $uri4 = new Uri('http://php-spider.org'); 50 | 51 | $this->assertTrue($filter->match($uri1)); 52 | $this->assertFalse($filter->match($uri2)); 53 | $this->assertFalse($filter->match($uri3)); 54 | $this->assertFalse($filter->match($uri4)); 55 | } 56 | 57 | /** 58 | * @covers \VDB\Spider\Filter\Prefetch\AllowedHostsFilter 59 | */ 60 | public function testMatchMultipleDomainsAllowed() 61 | { 62 | $filter = new 
AllowedHostsFilter(array('http://blog.php-spider.org', 'http://example.com'), true); 63 | 64 | $uri1 = new Uri('http://example.com'); 65 | $uri2 = new Uri('http://test.example.com'); 66 | $uri3 = new Uri('http://blog.php-spider.org'); 67 | $uri4 = new Uri('http://test.php-spider.org'); 68 | $uri5 = new Uri('http://php-spider.org'); 69 | $uri6 = new Uri('http://example.org'); 70 | 71 | $this->assertFalse($filter->match($uri1)); 72 | $this->assertFalse($filter->match($uri2)); 73 | $this->assertFalse($filter->match($uri3)); 74 | $this->assertFalse($filter->match($uri4)); 75 | $this->assertFalse($filter->match($uri5)); 76 | $this->assertTrue($filter->match($uri6)); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/AllowedPortsFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use VDB\Spider\Filter\Prefetch\AllowedPortsFilter; 15 | use VDB\Spider\Tests\TestCase; 16 | use VDB\Uri\Uri; 17 | 18 | /** 19 | * 20 | */ 21 | class AllowedPortsFilterTest extends TestCase 22 | { 23 | /** 24 | * @covers \VDB\Spider\Filter\Prefetch\AllowedPortsFilter 25 | */ 26 | public function testMatchPort() 27 | { 28 | $filter = new AllowedPortsFilter(array(8080)); 29 | 30 | $uri1 = new Uri('http://example.org'); 31 | $uri2 = new Uri('http://php-spider.org:8080'); 32 | $uri3 = new Uri('http://blog.php-spider.org'); 33 | 34 | $this->assertTrue($filter->match($uri1)); 35 | $this->assertFalse($filter->match($uri2)); 36 | $this->assertTrue($filter->match($uri3)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/AllowedSchemeFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter; 15 | use VDB\Spider\Tests\TestCase; 16 | use VDB\Uri\Uri; 17 | 18 | /** 19 | * 20 | */ 21 | class AllowedSchemeFilterTest extends TestCase 22 | { 23 | /** 24 | * @covers \VDB\Spider\Filter\Prefetch\AllowedSchemeFilter 25 | */ 26 | public function testMatch() 27 | { 28 | $filter = new AllowedSchemeFilter(array('http')); 29 | 30 | $currentUri = 'http://php-spider.org'; 31 | $uri = new Uri('http://php-spider.org'); 32 | $uri2 = new Uri('https://php-spider.org'); 33 | $uri3 = new Uri('#', $currentUri); 34 | $uri4 = new Uri('mailto:info@example.org'); 35 | 36 | $this->assertFalse($filter->match($uri), 'HTTP scheme filtered'); 37 | $this->assertTrue($filter->match($uri2), 'HTTPS scheme filtered'); 38 | $this->assertFalse($filter->match($uri3), 'empty/no scheme filtered'); 39 | $this->assertTrue($filter->match($uri4), 'MAILTO scheme filtered'); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/RestrictToBaseUriFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use VDB\Spider\Filter\Prefetch\RestrictToBaseUriFilter; 15 | use VDB\Spider\Tests\TestCase; 16 | use VDB\Uri\Uri; 17 | 18 | /** 19 | * Tests RestrictToBaseUriFilter: match() is expected to be true for URIs outside the configured base URI. 20 | */ 21 | class RestrictToBaseUriFilterTest extends TestCase 22 | { 23 | /** 24 | * @covers \VDB\Spider\Filter\Prefetch\RestrictToBaseUriFilter 25 | */ 26 | public function testInvalidUri() 27 | { 28 | $this->expectException(\InvalidArgumentException::class); 29 | new RestrictToBaseUriFilter('1gdf://fdsfds'); 30 | } 31 | 32 | /** 33 | * @covers \VDB\Spider\Filter\Prefetch\RestrictToBaseUriFilter 34 | * @dataProvider matchURIProvider 35 | */ 36 | public function testMatch($href, $expected) 37 | { 38 | $filter = new RestrictToBaseUriFilter('http://php-spider.org'); 39 | 40 | $uri = new Uri($href); 41 | 42 | $this->assertEquals($expected, $filter->match($uri)); 43 | } 44 | 45 | public static function matchURIProvider(): array 46 | { 47 | return array( 48 | array('http://example.org', true), 49 | array('http://php-spider.org', false), 50 | array('http://blog.php-spider.org', true), 51 | 52 | ); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/RobotsTxtDisallowFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code.
10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use VDB\Spider\Filter\Prefetch\ExtractRobotsTxtException; 15 | use VDB\Spider\Filter\Prefetch\FetchRobotsTxtException; 16 | use VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter; 17 | use VDB\Spider\Tests\TestCase; 18 | use VDB\Uri\Http; 19 | use VDB\Uri\UriInterface; 20 | 21 | /** 22 | * 23 | */ 24 | class RobotsTxtDisallowFilterTest extends TestCase 25 | { 26 | /** 27 | * @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter 28 | */ 29 | public function testNoRobotsTxt() 30 | { 31 | $bogusDomain = "http://bar/baz"; 32 | $this->expectException(FetchRobotsTxtException::class); 33 | new RobotsTxtDisallowFilter($bogusDomain); 34 | } 35 | 36 | /** 37 | * @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter 38 | */ 39 | public function testUnsupportedUrlScheme() 40 | { 41 | $unsupported = "ftp://example.com"; 42 | $this->expectException(ExtractRobotsTxtException::class); 43 | new RobotsTxtDisallowFilter($unsupported); 44 | } 45 | 46 | 47 | /** 48 | * @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter 49 | * @dataProvider userAgentMatchURIProvider 50 | */ 51 | public function testUserAgentMatch(UriInterface $href, bool $expected) 52 | { 53 | $robotsTxtFilter = new RobotsTxtDisallowFilter(seedUrl: "file://" . __DIR__, userAgent: 'PHP-Spider'); 54 | $this->assertEquals($expected, $robotsTxtFilter->match($href)); 55 | } 56 | 57 | /** 58 | * @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter 59 | * @dataProvider noUserAgentMatchURIProvider 60 | */ 61 | public function testNoUserAgentMatch(UriInterface $href, bool $expected) 62 | { 63 | $robotsTxtFilter = new RobotsTxtDisallowFilter(seedUrl: "file://" . 
__DIR__); 64 | $this->assertEquals($expected, $robotsTxtFilter->match($href)); 65 | } 66 | 67 | public static function noUserAgentMatchURIProvider(): array 68 | { 69 | return array( 70 | array(new Http('http://example.com'), false), 71 | array(new Http('http://example.com/foo'), true), 72 | array(new Http('http://example.com/bar'), false), 73 | ); 74 | } 75 | 76 | public static function userAgentMatchURIProvider(): array 77 | { 78 | return array( 79 | array(new Http('http://example.com'), false), 80 | array(new Http('http://example.com/foo'), false), 81 | array(new Http('http://example.com/bar'), true), 82 | ); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/UriFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use ErrorException; 15 | use VDB\Spider\Filter\Prefetch\RestrictToBaseUriFilter; 16 | use VDB\Spider\Filter\Prefetch\UriFilter; 17 | use VDB\Spider\Tests\TestCase; 18 | use VDB\Uri\Exception\UriSyntaxException; 19 | use VDB\Uri\Uri; 20 | 21 | /** 22 | * Tests UriFilter::match() against one or more regular expressions. 23 | */ 24 | class UriFilterTest extends TestCase 25 | { 26 | /** 27 | * @param string[] $regexes 28 | * @param string $href 29 | * @param bool $expected 30 | * 31 | * @covers \VDB\Spider\Filter\Prefetch\UriFilter 32 | * @dataProvider matchURIProvider 33 | * 34 | * @throws ErrorException 35 | * @throws UriSyntaxException 36 | */ 37 | public function testMatch(array $regexes, string $href, bool $expected) 38 | { 39 | $filter = new UriFilter($regexes); 40 | $uri = new Uri($href); 41 | $this->assertEquals($expected, $filter->match($uri)); 42 | } 43 | 44 | public static function matchURIProvider(): array 45 | { 46 | return array( 47 | array(['/^.*\.com$/'], 'http://example.com', true), 48 |
array(['/^.*\.org$/'], 'http://example.com', false), 49 | array(['/^.*\.bogus$/', '/^.*\.com$/'], 'http://example.com', true), 50 | array(['/^https:\/\/.*$/'], 'https://blog.php-spider.org', true), 51 | ); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/UriWithHashFragmentFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use ErrorException; 15 | use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter; 16 | use VDB\Spider\Tests\TestCase; 17 | use VDB\Uri\Exception\UriSyntaxException; 18 | use VDB\Uri\Uri; 19 | 20 | /** 21 | * Tests that UriWithHashFragmentFilter::match() returns true for URIs carrying a '#' fragment. 22 | */ 23 | class UriWithHashFragmentFilterTest extends TestCase 24 | { 25 | /** 26 | * @covers \VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter 27 | * 28 | * @throws UriSyntaxException 29 | * @throws ErrorException 30 | */ 31 | public function testMatch() 32 | { 33 | $filter = new UriWithHashFragmentFilter(); 34 | 35 | $currentUri = 'http://php-spider.org'; 36 | $uri1 = new Uri('#', $currentUri); 37 | $uri2 = new Uri('#foo', $currentUri); 38 | $uri3 = new Uri('http://php-spider.org/foo#bar', $currentUri); 39 | $uri4 = new Uri('http://php-spider.org/foo/#bar', $currentUri); 40 | $uri5 = new Uri('http://php-spider.org#/foo/bar', $currentUri); 41 | 42 | $this->assertTrue($filter->match($uri1), '# filtered'); 43 | $this->assertTrue($filter->match($uri2), '#foo'); 44 | $this->assertTrue($filter->match($uri3), 'http://php-spider.org/foo#bar'); 45 | $this->assertTrue($filter->match($uri4), 'http://php-spider.org/foo/#bar'); 46 | $this->assertTrue($filter->match($uri5), 'http://php-spider.org#/foo/bar'); 47 | } 48 | } 49 | --------------------------------------------------------------------------------
/tests/Filter/Prefetch/UriWithQueryStringFilterTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\Filter\Prefetch; 13 | 14 | use ErrorException; 15 | use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter; 16 | use VDB\Spider\Tests\TestCase; 17 | use VDB\Uri\Exception\UriSyntaxException; 18 | use VDB\Uri\Uri; 19 | 20 | /** 21 | * 22 | */ 23 | class UriWithQueryStringFilterTest extends TestCase 24 | { 25 | /** 26 | * @covers \VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter 27 | * 28 | * @throws ErrorException 29 | * @throws UriSyntaxException 30 | */ 31 | public function testMatch() 32 | { 33 | $filter = new UriWithQueryStringFilter(); 34 | 35 | $currentUri = 'http://php-spider.org'; 36 | $uri1 = new Uri('?', $currentUri); 37 | $uri2 = new Uri('?foo=2', $currentUri); 38 | $uri3 = new Uri('http://php-spider.org/foo?bar=baz', $currentUri); 39 | $uri4 = new Uri('http://php-spider.org/foo/?bar=baz', $currentUri); 40 | $uri5 = new Uri('http://php-spider.org?/foo/bar', $currentUri); 41 | 42 | $this->assertTrue($filter->match($uri1), '->match(\'?\')'); 43 | $this->assertTrue($filter->match($uri2)); 44 | $this->assertTrue($filter->match($uri3)); 45 | $this->assertTrue($filter->match($uri4)); 46 | $this->assertTrue($filter->match($uri5)); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/Filter/Prefetch/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /foo 3 | 4 | User-agent: PHP-Spider 5 | Disallow: /bar 6 | -------------------------------------------------------------------------------- /tests/Fixtures/DownloaderTestHTMLResource.html: -------------------------------------------------------------------------------- 
1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Domain

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

More information...

55 |
56 | 57 | 58 | -------------------------------------------------------------------------------- /tests/Fixtures/ResourceTestHTMLResource.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Domain

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

More information...

55 |
56 | 57 | 58 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceA.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | A 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 1

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

More information B...

55 |

More information C...

56 |

More information E...

57 |
58 | 59 | 60 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceB.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | B 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 2

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page A...

55 | 56 |

One level deeper to page D...

57 | 58 |

One level deeper to page F...

59 |
60 | 61 | 62 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceC.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | C 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 3

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page A...

55 |

Deeper to page G...

56 |
57 | 58 | 59 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceD.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | D 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 4

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page B...

55 |
56 | 57 | 58 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | E 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 5

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page A...

55 |

Deeper to page F...

56 |
57 | 58 | 59 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceF.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | F 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 5

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page B...

55 |

To page E...

56 |
57 | 58 | 59 | -------------------------------------------------------------------------------- /tests/Fixtures/SpiderTestHTMLResourceG.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | G 5 | 6 | 7 | 8 | 9 | 44 | 45 | 46 | 47 |
48 |

Example Page 5

49 | 50 |

This domain is established to be used for illustrative examples in documents. You do not need to 51 | coordinate or ask for permission to use this domain in examples, and it is not available for 52 | registration.

53 | 54 |

Back to page C...

55 |
56 | 57 | 58 | -------------------------------------------------------------------------------- /tests/PersistenceHandler/FileSerializedResourcePersistenceHandlerTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\PersistenceHandler; 13 | 14 | use ErrorException; 15 | use GuzzleHttp\Psr7\Response; 16 | use VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler; 17 | use VDB\Spider\Resource; 18 | use VDB\Spider\Tests\TestCase; 19 | use VDB\Spider\Uri\DiscoveredUri; 20 | use VDB\Uri\Exception\UriSyntaxException; 21 | 22 | /** 23 | * @SuppressWarnings(PHPMD.LongClassName) 24 | */ 25 | class FileSerializedResourcePersistenceHandlerTest extends TestCase 26 | { 27 | /** 28 | * @var FileSerializedResourcePersistenceHandler 29 | */ 30 | protected FileSerializedResourcePersistenceHandler $handler; 31 | 32 | protected ?string $persistenceRootPath = null; 33 | 34 | 35 | public function setUp(): void 36 | { 37 | $this->persistenceRootPath = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'spider-UT' . DIRECTORY_SEPARATOR; 38 | exec('rm -rf ' . 
escapeshellarg($this->persistenceRootPath)); // quoted so a temp path containing spaces or shell metacharacters is passed to rm as a single argument 39 | 40 | $this->handler = new FileSerializedResourcePersistenceHandler(sys_get_temp_dir()); 41 | $this->handler->setSpiderId('spider-UT'); 42 | } 43 | 44 | /** 45 | * @covers \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler 46 | * @covers \VDB\Spider\PersistenceHandler\FilePersistenceHandler 47 | * 48 | * @throws ErrorException 49 | * @throws UriSyntaxException 50 | */ 51 | public function testPathExtension() 52 | { 53 | $resource1 = new Resource( 54 | new DiscoveredUri("http://example.com", 0), 55 | new Response(200, [], "Test Body Contents 1") 56 | ); 57 | 58 | $this->assertEquals('', $resource1->getUri()->getPath()); 59 | 60 | $this->handler->persist($resource1); 61 | 62 | $this->assertEquals(1, $this->handler->count()); 63 | /** @SuppressWarnings(PHPMD.UnusedLocalVariable) */ 64 | foreach ($this->handler as $path => $resource) { 65 | $this->assertStringEndsWith('/index.html', $path); 66 | } 67 | } 68 | 69 | /** 70 | * @covers \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler 71 | * @covers \VDB\Spider\PersistenceHandler\FilePersistenceHandler 72 | * 73 | * @dataProvider persistenceProvider 74 | */ 75 | public function testPersist($resource, $expectedFilePath, $expectedFileContents) 76 | { 77 | $this->handler->persist($resource); 78 | 79 | $this->assertFileExists($expectedFilePath); 80 | 81 | $this->assertEquals(1, $this->handler->count()); 82 | // Check the file contents through iterator access and directly 83 | foreach ($this->handler as $path => $resource) { 84 | $savedResource = unserialize(file_get_contents($path)); 85 | $this->assertEquals( 86 | $expectedFileContents, 87 | $savedResource->getResponse()->getBody() 88 | ); 89 | 90 | $this->assertEquals( 91 | $expectedFileContents, 92 | $resource->getResponse()->getBody() 93 | ); 94 | } 95 | } 96 | 97 | /** 98 | * @covers \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler 99 | * @covers
\VDB\Spider\PersistenceHandler\FilePersistenceHandler 100 | * 101 | * @dataProvider persistenceWithoutFilenameProvider 102 | */ 103 | public function testPersistResourcesWithoutFilename($resource, $expectedFilePath, $expectedFileContents) 104 | { 105 | $this->handler->persist($resource); 106 | 107 | $this->assertFileExists($expectedFilePath); 108 | 109 | $savedResource = unserialize(file_get_contents($expectedFilePath)); 110 | $this->assertEquals( 111 | $expectedFileContents, 112 | $savedResource->getResponse()->getBody() 113 | ); 114 | } 115 | 116 | /** 117 | * @return array 118 | */ 119 | public function persistenceWithoutFilenameProvider(): array 120 | { 121 | // This must be set here instead of in setup methods, because providers 122 | // get executed first 123 | if (is_null($this->persistenceRootPath)) { 124 | $this->persistenceRootPath = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'spider-UT' . DIRECTORY_SEPARATOR; 125 | } 126 | 127 | $data = []; 128 | 129 | $data[] = $this->buildPersistenceProviderRecord( 130 | __DIR__ . '/../Fixtures/DownloaderTestHTMLResource.html', 131 | 'http://example.org/domains/Internet/' 132 | ); 133 | 134 | $data[] = $this->buildPersistenceProviderRecord( 135 | __DIR__ . '/../Fixtures/DownloaderTestHTMLResource.html', 136 | 'http://example.org/domains/Internet/Abuse/' 137 | ); 138 | 139 | return $data; 140 | } 141 | 142 | /** Builds one provider row: [resource, expected file path, expected file contents], all derived from the given fixture and URI. 143 | * @throws UriSyntaxException 144 | * @throws ErrorException 145 | */ 146 | protected function buildPersistenceProviderRecord($fixturePath, $uriString): array 147 | { 148 | $resource = $this->buildResourceFromFixture( 149 | $fixturePath, 150 | $uriString 151 | ); 152 | $expectedFileContents =
$this->getFixtureContent($fixturePath); 153 | $expectedFilePath = $this->buildExpectedFilePath($uriString); 154 | 155 | return [$resource, $expectedFilePath, $expectedFileContents]; 156 | } 157 | 158 | protected function buildExpectedFilePath($uriString): string 159 | { 160 | $expectedFilePath = $this->persistenceRootPath . parse_url($uriString)['host'] . parse_url($uriString)['path']; 161 | if (str_ends_with($expectedFilePath, '/')) { 162 | $expectedFilePath .= 'index.html'; 163 | } 164 | 165 | return $expectedFilePath; 166 | } 167 | 168 | /** 169 | * @return array 170 | * 171 | * @throws ErrorException 172 | * @throws UriSyntaxException 173 | */ 174 | public function persistenceProvider(): array 175 | { 176 | // This must be set here instead of in setup methods, because providers 177 | // get executed first 178 | if (is_null($this->persistenceRootPath)) { 179 | $this->persistenceRootPath = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'spider-UT' . DIRECTORY_SEPARATOR; 180 | } 181 | 182 | $data = []; 183 | 184 | $data[] = $this->buildPersistenceProviderRecord( 185 | __DIR__ . '/../Fixtures/DownloaderTestHTMLResource.html', 186 | 'http://example.org/domains/special/test1.html' 187 | ); 188 | 189 | $data[] = $this->buildPersistenceProviderRecord( 190 | __DIR__ . '/../Fixtures/DownloaderTestHTMLResource.html', 191 | 'http://example.org/domains/special/test2.html' 192 | ); 193 | 194 | $data[] = $this->buildPersistenceProviderRecord( 195 | __DIR__ .
'/../Fixtures/DownloaderTestHTMLResource.html', 196 | 'http://example.org/domains/special/subdir/test3.html' 197 | ); 198 | 199 | return $data; 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /tests/PersistenceHandler/MemoryPersistenceHandlerTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\PersistenceHandler; 13 | 14 | use ErrorException; 15 | use GuzzleHttp\Psr7\Response; 16 | use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler; 17 | use VDB\Spider\Resource; 18 | use VDB\Spider\Tests\TestCase; 19 | use VDB\Spider\Uri\DiscoveredUri; 20 | use VDB\Uri\Exception\UriSyntaxException; 21 | 22 | class MemoryPersistenceHandlerTest extends TestCase 23 | { 24 | /** 25 | * @var MemoryPersistenceHandler 26 | */ 27 | protected MemoryPersistenceHandler $handler; 28 | 29 | public function setUp(): void 30 | { 31 | $this->handler = new MemoryPersistenceHandler(); 32 | $this->handler->setSpiderId('spider-UT'); 33 | } 34 | 35 | /** 36 | * @covers \VDB\Spider\PersistenceHandler\MemoryPersistenceHandler 37 | * @covers \VDB\Spider\PersistenceHandler\FilePersistenceHandler 38 | * 39 | * @throws ErrorException 40 | * @throws UriSyntaxException 41 | */ 42 | public function testPersist() 43 | { 44 | $resource1 = new Resource( 45 | new DiscoveredUri("http://example.com/1", 0), 46 | new Response(200, [], "Test Body Contents 1") 47 | ); 48 | 49 | $resource2 = new Resource( 50 | new DiscoveredUri("http://example.com/1", 0), 51 | new Response(200, [], "Test Body Contents 2") 52 | ); 53 | 54 | $expectedResources = [$resource1, $resource2]; 55 | 56 | $this->handler->persist($resource1); 57 | $this->handler->persist($resource2); 58 | 59 | $this->assertEquals(2, $this->handler->count()); 60 | 61 | // Check the 
contents through iterator access and directly 62 | foreach ($this->handler as $path => $resource) { 63 | $this->assertEquals( 64 | $expectedResources[$path], 65 | $resource 66 | ); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tests/QueueManager/InMemoryQueueManagerTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace VDB\Spider\Tests\QueueManager; 13 | 14 | use ErrorException; 15 | use InvalidArgumentException; 16 | use VDB\Spider\Exception\MaxQueueSizeExceededException; 17 | use VDB\Spider\QueueManager\InMemoryQueueManager; 18 | use VDB\Spider\QueueManager\QueueManagerInterface; 19 | use VDB\Spider\Tests\TestCase; 20 | use VDB\Spider\Uri\DiscoveredUri; 21 | use VDB\Uri\Exception\UriSyntaxException; 22 | 23 | /** 24 | * 25 | */ 26 | class InMemoryQueueManagerTest extends TestCase 27 | { 28 | /** 29 | * @covers \VDB\Spider\QueueManager\InMemoryQueueManager 30 | */ 31 | public function testInvalidTraversalAlgo() 32 | { 33 | $this->expectException(InvalidArgumentException::class); 34 | new InMemoryQueueManager(53242); 35 | } 36 | 37 | /** 38 | * @covers \VDB\Spider\QueueManager\InMemoryQueueManager 39 | */ 40 | public function testSetTraversalAlgo() 41 | { 42 | $qm = new InMemoryQueueManager(QueueManagerInterface::ALGORITHM_BREADTH_FIRST); 43 | $this->assertEquals(QueueManagerInterface::ALGORITHM_BREADTH_FIRST, $qm->getTraversalAlgorithm()); 44 | } 45 | 46 | /** 47 | * @covers \VDB\Spider\QueueManager\InMemoryQueueManager 48 | * 49 | * @throws ErrorException 50 | * @throws UriSyntaxException 51 | */ 52 | public function testMaxQueueSizeExceeded() 53 | { 54 | $this->expectException(MaxQueueSizeExceededException::class); 55 | $qm = new InMemoryQueueManager(); 56 | $qm->maxQueueSize = 1; 57 | 
$qm->addUri(new DiscoveredUri("foo", 0)); 58 | $qm->addUri(new DiscoveredUri("bar", 0)); 59 | } 60 | 61 | /** 62 | * @covers \VDB\Spider\QueueManager\InMemoryQueueManager 63 | * 64 | * @throws ErrorException 65 | * @throws UriSyntaxException 66 | * @throws MaxQueueSizeExceededException 67 | */ 68 | public function testDepthFirst() 69 | { 70 | $qm = new InMemoryQueueManager(); 71 | $uri1 = new DiscoveredUri("foo", 0); 72 | $uri2 = new DiscoveredUri("bar", 0); 73 | $uri3 = new DiscoveredUri("baz", 0); 74 | $qm->addUri($uri1); 75 | $qm->addUri($uri2); 76 | $qm->addUri($uri3); 77 | 78 | $this->assertEquals($uri3, $qm->next()); 79 | $this->assertEquals($uri2, $qm->next()); 80 | } 81 | 82 | /** 83 | * @covers \VDB\Spider\QueueManager\InMemoryQueueManager 84 | * 85 | * @throws ErrorException 86 | * @throws UriSyntaxException 87 | * @throws MaxQueueSizeExceededException 88 | */ 89 | public function testBreadthFirst() 90 | { 91 | $qm = new InMemoryQueueManager(QueueManagerInterface::ALGORITHM_BREADTH_FIRST); 92 | $uri1 = new DiscoveredUri("foo", 0); 93 | $uri2 = new DiscoveredUri("bar", 0); 94 | $uri3 = new DiscoveredUri("baz", 0); 95 | $qm->addUri($uri1); 96 | $qm->addUri($uri2); 97 | $qm->addUri($uri3); 98 | 99 | $this->assertEquals($uri1, $qm->next()); 100 | $this->assertEquals($uri2, $qm->next()); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /tests/RequestHandler/GuzzleRequestHandlerTest.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
*/

namespace VDB\Spider\Tests\RequestHandler;

use ErrorException;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Psr7\Response;
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
use VDB\Spider\Tests\TestCase;
use VDB\Spider\Uri\DiscoveredUri;
use VDB\Uri\Exception\UriSyntaxException;

/**
 * Tests for the Guzzle-backed request handler.
 */
class GuzzleRequestHandlerTest extends TestCase
{
    /**
     * A client injected through the constructor must be the one used to
     * perform the request, and its response must be exposed on the resource
     * returned by request().
     *
     * @covers \VDB\Spider\RequestHandler\GuzzleRequestHandler
     *
     * @throws GuzzleException
     * @throws ErrorException
     * @throws UriSyntaxException
     */
    public function testCustomClient()
    {
        $uri = 'http://example.com';
        $expectedResponse = new Response(200, [], "Test");

        // The fully-qualified ::class constant yields the same class-name
        // string as the former literal, but is checked by tooling.
        $client = $this->getMockBuilder(\GuzzleHttp\Client::class)->getMock();
        $client
            ->expects($this->once())
            ->method('get')
            ->with($uri)
            ->willReturn($expectedResponse);

        $handler = new GuzzleRequestHandler($client);
        $actualResponse = $handler->request(new DiscoveredUri($uri, 0))->getResponse();
        $this->assertEquals($expectedResponse, $actualResponse);
    }
}
--------------------------------------------------------------------------------
/tests/ResourceTest.php:
--------------------------------------------------------------------------------
html = file_get_contents(__DIR__ .
'/Fixtures/ResourceTestHTMLResource.html');
        $this->resource = new Resource(
            new DiscoveredUri(new Uri('/domains/special', 'http://example.org'), 0),
            new Response(200, [], $this->html)
        );
    }

    /**
     * getCrawler() must hand out a DomCrawler instance.
     *
     * @covers \VDB\Spider\Resource
     */
    public function testGetCrawler()
    {
        $crawler = $this->resource->getCrawler();

        $this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $crawler);
    }

    /**
     * getUri() must return the discovered URI, resolved against its base.
     *
     * @covers \VDB\Spider\Resource
     */
    public function testGetUri()
    {
        $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $this->resource->getUri());

        $uriAsString = $this->resource->getUri()->toString();
        $this->assertEquals('http://example.org/domains/special', $uriAsString);
    }

    /**
     * getResponse() must return the response whose body is the fixture HTML.
     *
     * @covers \VDB\Spider\Resource
     */
    public function testGetResponse()
    {
        $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $this->resource->getResponse());

        $body = $this->resource->getResponse()->getBody()->__toString();
        $this->assertEquals($this->html, $body);
    }

    /**
     * A resource must survive a serialize/unserialize round trip with its
     * URI, response body and crawler contents intact.
     *
     * @covers \VDB\Spider\Resource
     */
    public function testSerialization()
    {
        $deserialized = unserialize(serialize($this->resource));

        $this->assertInstanceOf('VDB\\Spider\\Resource', $deserialized);
        $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $deserialized->getResponse());
        $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $deserialized->getUri());

        $originalUri = $this->resource->getUri()->__toString();
        $this->assertEquals($originalUri, $deserialized->getUri()->__toString());

        $this->assertEquals($this->html, $deserialized->getResponse()->getBody()->__toString());

        $originalHtml = $this->resource->getCrawler()->html();
        $this->assertEquals($originalHtml, $deserialized->getCrawler()->html());
    }
}
--------------------------------------------------------------------------------
/tests/TestCase.php:
--------------------------------------------------------------------------------
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace VDB\Spider\Tests;

use ErrorException;
use GuzzleHttp\Psr7\Response;
use PHPUnit\Framework\TestCase as PHPUnitTestCase;
use VDB\Spider\Resource;
use VDB\Spider\Uri\DiscoveredUri;
use VDB\Uri\Exception\UriSyntaxException;
use VDB\Uri\Uri;

/**
 * Base class for the test suite; offers helpers to build Resource objects
 * from fixture files.
 */
class TestCase extends PHPUnitTestCase
{
    /**
     * Wraps a URI and a response in a Resource.
     *
     * @param DiscoveredUri $uri
     * @param Response $response
     * @return Resource
     */
    protected function getResource(DiscoveredUri $uri, Response $response): Resource
    {
        return new Resource($uri, $response);
    }

    /**
     * Builds a Resource for $uriString (found at depth 0) whose 200 response
     * carries the fixture file's contents as body.
     *
     * @param string $fixturePath path of the fixture file
     * @param string $uriString   URI of the resource
     *
     * @throws UriSyntaxException
     * @throws ErrorException
     */
    protected function buildResourceFromFixture($fixturePath, $uriString): Resource
    {
        return $this->getResource(
            new DiscoveredUri(new Uri($uriString), 0),
            new Response(200, [], $this->getFixtureContent($fixturePath))
        );
    }

    /**
     * Reads a fixture file and returns its contents.
     *
     * @param $filePath /absolute/path/to/fixture
     *
     * @return string the file contents
     *
     * @throws \RuntimeException when the fixture is missing or cannot be read
     */
    protected function getFixtureContent($filePath)
    {
        // Fail with a clear message instead of handing `false` to callers,
        // which would otherwise end up as a response body.
        if (!is_file($filePath) || !is_readable($filePath)) {
            throw new \RuntimeException(
                sprintf('Fixture "%s" does not exist or is not readable.', $filePath)
            );
        }

        $content = file_get_contents($filePath);
        if ($content === false) {
            throw new \RuntimeException(sprintf('Fixture "%s" could not be read.', $filePath));
        }

        return $content;
    }
}
--------------------------------------------------------------------------------
/tests/Uri/DiscoveredUriTest.php:
--------------------------------------------------------------------------------
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
*/

namespace VDB\Spider\Tests\Uri;

use ErrorException;
use VDB\Spider\Tests\TestCase;
use VDB\Spider\Uri\DiscoveredUri;
use VDB\Uri\Exception\UriSyntaxException;

/**
 * Tests for the URI wrapper that records the depth at which it was found.
 */
class DiscoveredUriTest extends TestCase
{
    /**
     * The depth passed to the constructor is returned by getDepthFound().
     *
     * @covers \VDB\Spider\Uri\DiscoveredUri
     *
     * @throws ErrorException
     * @throws UriSyntaxException
     */
    public function testDepthFound()
    {
        $depthFound = 12;
        $discoveredUri = new DiscoveredUri('http://example.org', $depthFound);

        $this->assertEquals($depthFound, $discoveredUri->getDepthFound());
    }
}
--------------------------------------------------------------------------------