├── .gitignore ├── doc ├── usage.rst └── example │ ├── basic.php │ ├── limit.php │ ├── logger.php │ ├── stop_on_error.php │ ├── stop_on_exception.php │ ├── whitelist.php │ ├── blacklist.php │ └── normalizer.php ├── src ├── Url.php ├── Exception │ ├── CrawlerException.php │ ├── RequestException.php │ ├── EmptyCollectionException.php │ └── UnsupportedUrlException.php ├── Client │ ├── GoutteClient.php │ ├── PrerenderIoClient.php │ ├── CrawlerClientInterface.php │ └── PrerenderClient.php ├── Url │ ├── Matcher │ │ ├── UrlMatcherInterface.php │ │ ├── PathRegexUrlMatcher.php │ │ └── CallbackUrlMatcher.php │ ├── Normalizer │ │ ├── UrlNormalizerInterface.php │ │ ├── CallbackUrlNormalizer.php │ │ └── RemoveQueryParameterUrlNormalizer.php │ └── UrlCollection.php ├── Page.php └── Crawler.php ├── .scrutinizer.yml ├── tests ├── src │ ├── UrlTest.php │ ├── PageTest.php │ ├── Url │ │ ├── Matcher │ │ │ ├── CallbackUrlMatcherTest.php │ │ │ └── PathRegexUrlMatcherTest.php │ │ └── Normalizer │ │ │ ├── CallbackUrlNormalizerTest.php │ │ │ └── RemoveQueryParameterUrlNormalizerTest.php │ ├── Client │ │ ├── PrerenderIoClientTest.php │ │ └── PrerenderClientTest.php │ └── CrawlerTest.php └── bootstrap.php ├── .travis.yml ├── phpunit.xml ├── CHANGELOG.md ├── LICENSE ├── composer.json └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor/ 2 | /composer.lock 3 | -------------------------------------------------------------------------------- /doc/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | The examples should be self descriptive. 5 | -------------------------------------------------------------------------------- /src/Url.php: -------------------------------------------------------------------------------- 1 | crawl('https://www.yourwebsite.com') as $page) { 9 | echo $page->getUrl() . PHP_EOL; 10 | } 11 | -------------------------------------------------------------------------------- /doc/example/limit.php: -------------------------------------------------------------------------------- 1 | setLimit(10); 11 | 12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 13 | echo $page->getUrl() . PHP_EOL; 14 | } 15 | -------------------------------------------------------------------------------- /doc/example/logger.php: -------------------------------------------------------------------------------- 1 | setLogger(new \Psr\Log\NullLogger()); 11 | 12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 13 | echo $page->getUrl() . PHP_EOL; 14 | } 15 | -------------------------------------------------------------------------------- /doc/example/stop_on_error.php: -------------------------------------------------------------------------------- 1 | setStopOnError(true); 11 | 12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 13 | echo $page->getUrl() . PHP_EOL; 14 | } 15 | -------------------------------------------------------------------------------- /doc/example/stop_on_exception.php: -------------------------------------------------------------------------------- 1 | setExceptionOnError(true); 11 | 12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 13 | echo $page->getUrl() . 
PHP_EOL; 14 | } 15 | -------------------------------------------------------------------------------- /doc/example/whitelist.php: -------------------------------------------------------------------------------- 1 | addWhitelistUrlMatcher(new Matcher\PathRegexUrlMatcher('~^/foo~')); 12 | 13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 14 | echo $page->getUrl() . PHP_EOL; 15 | } 16 | -------------------------------------------------------------------------------- /tests/src/UrlTest.php: -------------------------------------------------------------------------------- 1 | assertInstanceOf(Url::class, $url); 15 | $this->assertInstanceOf(Http::class, $url); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /doc/example/blacklist.php: -------------------------------------------------------------------------------- 1 | addBlacklistUrlMatcher(new Matcher\PathRegexUrlMatcher('~^/foo~')); 12 | 13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 14 | echo $page->getUrl() . PHP_EOL; 15 | } 16 | -------------------------------------------------------------------------------- /doc/example/normalizer.php: -------------------------------------------------------------------------------- 1 | addUrlNormalizer(new Normalizer\RemoveQueryParameterUrlNormalizer('q')); 12 | 13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) { 14 | echo $page->getUrl() . PHP_EOL; 15 | } 16 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | getMethod($methodName); 17 | $method->setAccessible(true); 18 | 19 | return $method; 20 | } 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.5 5 | - 5.6 6 | - 7.0 7 | - 7.1 8 | - 7.2 9 | 10 | sudo: false 11 | 12 | cache: 13 | directories: 14 | - $HOME/.composer/cache/files 15 | 16 | before_install: 17 | - composer self-update 18 | 19 | install: 20 | - composer update --prefer-source $COMPOSER_FLAGS 21 | 22 | script: 23 | - if [ "$TRAVIS_PHP_VERSION" == "5.6" ]; then phpunit --coverage-clover=coverage.clover; else phpunit; fi; 24 | 25 | after_script: 26 | - if [ "$TRAVIS_PHP_VERSION" == "5.6" ]; then wget https://scrutinizer-ci.com/ocular.phar && php ocular.phar code-coverage:upload --format=php-clover coverage.clover; fi; 27 | -------------------------------------------------------------------------------- /src/Url/Matcher/PathRegexUrlMatcher.php: -------------------------------------------------------------------------------- 1 | pattern = $pattern; 20 | } 21 | 22 | /** 23 | * @param Url $url 24 | * @return bool 25 | */ 26 | public function matches(Url $url) 27 | { 28 | return (bool)preg_match($this->pattern, $url->getPath()); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/Url/Matcher/CallbackUrlMatcher.php: -------------------------------------------------------------------------------- 1 | callback = $callback; 20 | } 21 | 22 | /** 23 | * @param Url $url 24 | * @return bool 25 | */ 26 | public function matches(Url $url) 27 | { 28 | return call_user_func($this->callback, $url); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/Url/Normalizer/CallbackUrlNormalizer.php: 
-------------------------------------------------------------------------------- 1 | callback = $callback; 20 | } 21 | 22 | /** 23 | * @param Url $url 24 | * @return Url 25 | */ 26 | public function normalize(Url $url) 27 | { 28 | return call_user_func($this->callback, $url); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/src/PageTest.php: -------------------------------------------------------------------------------- 1 | assertEquals($url, $page->getUrl()); 22 | $this->assertEquals($domCrawler, $page->getCrawler()); 23 | $this->assertEquals($response, $page->getResponse()); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/src/Url/Matcher/CallbackUrlMatcherTest.php: -------------------------------------------------------------------------------- 1 | __toString(); 15 | }; 16 | 17 | $url = m::mock(Url::class); 18 | $url->shouldReceive('__toString')->once(); 19 | 20 | $callbackUrlMatcher = new CallbackUrlMatcher($callback); 21 | $callbackUrlMatcher->matches($url); 22 | } 23 | 24 | protected function tearDown() 25 | { 26 | parent::tearDown(); 27 | 28 | m::close(); 29 | } 30 | } -------------------------------------------------------------------------------- /tests/src/Url/Normalizer/CallbackUrlNormalizerTest.php: -------------------------------------------------------------------------------- 1 | __toString(); 15 | }; 16 | 17 | $url = m::mock(Url::class); 18 | $url->shouldReceive('__toString')->once(); 19 | 20 | $callbackUrlMatcher = new CallbackUrlNormalizer($callback); 21 | $callbackUrlMatcher->normalize($url); 22 | } 23 | 24 | protected function tearDown() 25 | { 26 | parent::tearDown(); 27 | 28 | m::close(); 29 | } 30 | } -------------------------------------------------------------------------------- /src/Url/Normalizer/RemoveQueryParameterUrlNormalizer.php: -------------------------------------------------------------------------------- 1 | keys = $keys; 24 | } 25 | 26 | /** 27 | * @param Url $url 28 | * @return Url 29 | */ 30 | public function normalize(Url $url) 31 | { 32 | $query = $url->query; 33 | $query = $query->without($this->keys); 34 | 35 | return $url->withQuery((string)$query); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/src/Url/Matcher/PathRegexUrlMatcherTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($callbackUrlMatcher->matches(Url::createFromString('http://my-project/foo'))); 16 | $this->assertTrue($callbackUrlMatcher->matches(Url::createFromString('http://my-project/foo/bat'))); 17 | $this->assertFalse($callbackUrlMatcher->matches(Url::createFromString('http://my-project/bar'))); 18 | $this->assertFalse($callbackUrlMatcher->matches(Url::createFromString('http://my-project/bar/foo'))); 19 | } 20 | } -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 14 | 15 | 16 | ./tests/src/ 17 | 18 | 19 | 20 | 21 | 22 | ./ 23 | 24 | ./tests 25 | ./vendor 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | This project adheres to [Semantic Versioning](http://semver.org/). 
4 | 5 | ## [2.0.0](https://github.com/mediamonks/crawler/compare/1.1.0...2.0.0) - 2017-10-19 6 | ### Changed 7 | - Use interface for crawler client to allow more flexible (decorated) clients 8 | - Redirects are returned as redirected URL instead of the original URL 9 | - Prerender.io uses https instead of http transport 10 | 11 | ## [1.1.0](https://github.com/mediamonks/crawler/compare/v1.0.1...1.1.0) - 2017-08-11 12 | ### Added 13 | - Make Response available in Page 14 | 15 | ### Removed 16 | - Support for hhvm 17 | 18 | ## [1.0.1](https://github.com/mediamonks/crawler/compare/v1.0.0...v1.0.1) - 2017-03-31 19 | ### Added 20 | - Tests 21 | 22 | ### Fixed 23 | - Add "psr-log" to required packages 24 | 25 | ## [1.0.0](https://github.com/mediamonks/crawler/tree/v1.0.0) - 2016-11-28 26 | ### Added 27 | - Initial version 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 MediaMonks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/src/Client/PrerenderIoClientTest.php: -------------------------------------------------------------------------------- 1 | invokeArgs($client, [$websiteUrl]); 17 | 18 | $this->assertEquals(PrerenderIoClient::URL.$websiteUrl, $result); 19 | $this->assertEquals( 20 | PrerenderIoClient::USER_AGENT, 21 | $client->getServerParameter(PrerenderIoClient::HEADER_USER_AGENT) 22 | ); 23 | $this->assertEquals($token, $client->getServerParameter(PrerenderIoClient::HEADER_TOKEN)); 24 | } 25 | 26 | public function test_getRequest() 27 | { 28 | $token = 'my-prerender.io-token'; 29 | $client = new PrerenderIoClient($token); 30 | $this->assertNull($client->getRequest()); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Page.php: -------------------------------------------------------------------------------- 1 | url = $url; 33 | $this->crawler = $crawler; 34 | $this->response = $response; 35 | } 36 | 37 | /** 38 | * @return Url 39 | */ 40 | public function getUrl() 41 | { 42 | return $this->url; 43 | } 44 | 45 | /** 46 | * @return DomCrawler 47 | */ 48 | public function getCrawler() 49 | { 50 | return $this->crawler; 51 | } 52 | 53 | /** 54 | * @return Response 55 | */ 56 | public function getResponse() 57 | { 58 | return $this->response; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mediamonks/crawler", 3 | "type": "library", 4 | "description": "Crawl your own website with various clients for SEO and indexing purposes. ", 5 | "keywords": [ 6 | "crawler", 7 | "seo", 8 | "dom", 9 | "spider", 10 | "robot", 11 | "prerender", 12 | "prerender.io", 13 | "search", 14 | "index", 15 | "goutte", 16 | "domcrawler" 17 | ], 18 | "homepage": "https://www.mediamonks.com/", 19 | "license": "MIT", 20 | "authors": [ 21 | { 22 | "name": "Robert Slootjes", 23 | "email": "robert@mediamonks.com", 24 | "homepage": "https://github.com/slootjes" 25 | } 26 | ], 27 | "require": { 28 | "php": "^5.5|^7.0", 29 | "symfony/dom-crawler": "^2.8|^3.0|^4.0", 30 | "fabpot/goutte": "^3.0", 31 | "league/uri": "^4.2", 32 | "psr/log": "^1.0" 33 | }, 34 | "require-dev": { 35 | "codeclimate/php-test-reporter": "dev-master@dev", 36 | "phpunit/phpunit": "^4.8", 37 | "mockery/mockery": "^0.9.4", 38 | "monolog/monolog": "^1.21" 39 | }, 40 | "autoload": { 41 | "psr-4": { 42 | "MediaMonks\\Crawler\\": "src/" 43 | } 44 | }, 45 | "extra": { 46 | "branch-alias": { 47 | "dev-master": "1.0-dev" 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/Client/PrerenderIoClient.php: -------------------------------------------------------------------------------- 1 | token = $token; 31 | 32 | $server[self::HEADER_TOKEN] = $token; 33 | $server[self::HEADER_USER_AGENT] = self::USER_AGENT; 34 | 35 | parent::__construct(self::URL, $server, $history, $cookieJar); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /tests/src/Client/PrerenderClientTest.php: -------------------------------------------------------------------------------- 1 | invokeArgs($client, [$websiteUrl]); 18 | 19 | $this->assertEquals($prerenderUrl.$websiteUrl, $result); 20 | } 21 | 22 | public function test_url_is_corrected() 23 | { 24 | $prerenderUrl = 
'http://my-prerender-server/'; 25 | $websiteUrl = 'http://my-website/'; 26 | 27 | $request = new Request($prerenderUrl.$websiteUrl, 'GET'); 28 | 29 | $rp = new \ReflectionProperty(PrerenderClient::class, 'request'); 30 | $rp->setAccessible(true); 31 | 32 | $client = new PrerenderClient($prerenderUrl); 33 | $rp->setValue($client, $request); 34 | 35 | $this->assertEquals($client->getRequest()->getUri(), $websiteUrl); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/Client/CrawlerClientInterface.php: -------------------------------------------------------------------------------- 1 | applyNormalization('http://my-project/?foo=bar', 'foo', 'http://my-project/'); 14 | } 15 | 16 | public function test_query_parameters_are_removed() 17 | { 18 | $this->applyNormalization('http://my-project/?foo=bar&bar=baz', ['foo', 'bar'], 'http://my-project/'); 19 | } 20 | 21 | public function test_query_parameters_are_not_removed() 22 | { 23 | $this->applyNormalization('http://my-project/?foo=bar&bar=baz', ['foo2'], 'http://my-project/?foo=bar&bar=baz'); 24 | } 25 | 26 | /** 27 | * @param $urlInput 28 | * @param array $removeKeys 29 | * @param $urlExpectedOutput 30 | */ 31 | protected function applyNormalization($urlInput, $removeKeys, $urlExpectedOutput) 32 | { 33 | $url = Url::createFromString($urlInput); 34 | $callbackUrlMatcher = new RemoveQueryParameterUrlNormalizer($removeKeys); 35 | $url = $callbackUrlMatcher->normalize($url); 36 | 37 | $this->assertEquals($urlExpectedOutput, $url->__toString()); 38 | } 39 | } -------------------------------------------------------------------------------- /src/Url/UrlCollection.php: -------------------------------------------------------------------------------- 1 | contains($url)) { 22 | $this->urls[$url->__toString()] = $url; 23 | } 24 | } 25 | 26 | /** 27 | * @return Url|false 28 | * @throws EmptyCollectionException 29 | */ 30 | public function pop() 31 | { 32 | $url = array_pop($this->urls); 33 | if (empty($url)) { 34 | return false; 35 | } 36 | 37 | return $url; 38 | } 39 | 40 | /** 41 | * @param Url $url 42 | * 43 | * @return bool 44 | */ 45 | public function contains(Url $url) 46 | { 47 | return isset($this->urls[$url->__toString()]); 48 | } 49 | 50 | /** 51 | * @return int 52 | */ 53 | public function count() 54 | { 55 | return count($this->urls); 56 | } 57 | 58 | /** 59 | * @return void 60 | */ 61 | public function reset() 62 | { 63 | $this->urls = []; 64 | } 65 | 66 | /** 67 | * @return array 68 | */ 69 | public function toArray() 70 | { 71 | $stringified = []; 72 | foreach ($this->urls as $url) { 73 | $stringified[] = $url->__toString(); 74 | } 75 | 76 | return $stringified; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/Client/PrerenderClient.php: -------------------------------------------------------------------------------- 1 | prerenderUrl = $prerenderUrl; 25 | 26 | parent::__construct($server, $history, $cookieJar); 27 | } 28 | 29 | /** 30 | * @param string $uri 31 | * @return string 32 | */ 33 | protected function getAbsoluteUri($uri) 34 | { 35 | return $this->prerenderUrl.parent::getAbsoluteUri($uri); 36 | } 37 | 38 | /** 39 | * @inheritdoc 40 | */ 41 | public function getRequest() 42 | { 43 | $request = parent::getRequest(); 44 | if (!empty($request)) { 45 | return new Request($this->correctUrl($request->getUri()), 46 | $request->getMethod(), $request->getParameters(), 47 | $request->getFiles(), $request->getCookies(), 
$request->getServer(), 48 | $request->getContent()); 49 | } 50 | } 51 | 52 | /** 53 | * @param string $url 54 | * 55 | * @return string 56 | */ 57 | protected function correctUrl($url) 58 | { 59 | return str_replace($this->prerenderUrl, '', $url); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/mediamonks/crawler.svg?branch=master)](https://travis-ci.org/mediamonks/crawler) 2 | [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/mediamonks/crawler/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/mediamonks/crawler/?branch=master) 3 | [![Code Coverage](https://scrutinizer-ci.com/g/mediamonks/crawler/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/mediamonks/crawler/?branch=master) 4 | [![Total Downloads](https://poser.pugx.org/mediamonks/crawler/downloads)](https://packagist.org/packages/mediamonks/crawler) 5 | [![Latest Stable Version](https://poser.pugx.org/mediamonks/crawler/v/stable)](https://packagist.org/packages/mediamonks/crawler) 6 | [![Latest Unstable Version](https://poser.pugx.org/mediamonks/crawler/v/unstable)](https://packagist.org/packages/mediamonks/crawler) 7 | [![SensioLabs Insight](https://img.shields.io/sensiolabs/i/2fd407ee-3228-46c1-9ebb-40745787d454.svg)](https://insight.sensiolabs.com/projects/2fd407ee-3228-46c1-9ebb-40745787d454) 8 | [![License](https://poser.pugx.org/mediamonks/crawler/license)](https://packagist.org/packages/mediamonks/crawler) 9 | 10 | # MediaMonks Crawler 11 | 12 | This tool allows you to easily crawl a website and get a DOM object for every url that is found. 13 | We use it to crawl our own site pages, regardless of whether they are generated with server- and/or client-side content, by using the Prerender.io client. 14 | The resulting data can be used to build a full site search and/or to improve SEO for single-page applications. 15 | 16 | ## Highlights 17 | 18 | - Ships with Prerender & Prerender.io clients, uses Goutte by default 19 | - Supports any Symfony BrowserKit client 20 | - Supports both whitelisting and blacklisting of urls 21 | - Supports url normalization, which allows you to prevent duplicates caused by minor url differences 22 | - Implements the [PSR-3 Logger Interface](http://www.php-fig.org/psr/psr-3/) 23 | 24 | ## Documentation 25 | 26 | Documentation and examples can be found in the [/doc](/doc) folder; a minimal usage sketch is also included at the end of this README. 27 | 28 | ## System Requirements 29 | 30 | To use the library you need: 31 | 32 | - **PHP >= 5.5.0** 33 | 34 | 35 | 36 | ## Install 37 | 38 | Install this package using Composer. 39 | 40 | ``` 41 | $ composer require mediamonks/crawler 42 | ``` 43 | 44 | ## Security 45 | 46 | If you discover any security-related issues, please email devmonk@mediamonks.com instead of using the issue tracker. 47 | 48 | ## License 49 | 50 | The MIT License (MIT). Please see [License File](LICENSE) for more information. 
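## Usage sketch

The following is a minimal, illustrative sketch of the API described above, not a maintained example: it assumes a Composer autoloader at `vendor/autoload.php`, and the `~^/blog~` path pattern, the `ref` query parameter and the hostname are made-up values. The maintained example scripts live in [/doc/example](/doc/example).

```php
<?php

require __DIR__ . '/vendor/autoload.php';

use MediaMonks\Crawler\Crawler;
use MediaMonks\Crawler\Url\Matcher\PathRegexUrlMatcher;
use MediaMonks\Crawler\Url\Normalizer\RemoveQueryParameterUrlNormalizer;

$crawler = new Crawler(); // uses the Goutte client by default
$crawler->setLimit(50);   // stop after 50 pages have been returned

// Only return pages whose path matches this (illustrative) pattern.
$crawler->addWhitelistUrlMatcher(new PathRegexUrlMatcher('~^/blog~'));

// Treat urls that only differ in the (hypothetical) "ref" query parameter as the same page.
$crawler->addUrlNormalizer(new RemoveQueryParameterUrlNormalizer('ref'));

foreach ($crawler->crawl('https://www.yourwebsite.com') as $page) {
    // Each Page exposes the resolved url, a Symfony DomCrawler and the raw response.
    echo $page->getUrl() . ' - ' . $page->getCrawler()->filter('title')->text() . PHP_EOL;
}
```

The bundled Prerender.io client can be plugged in the same way, e.g. `$crawler->setClient(new \MediaMonks\Crawler\Client\PrerenderIoClient('your-token'));`, where the token value is a placeholder for your own Prerender.io token.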
51 | -------------------------------------------------------------------------------- /tests/src/CrawlerTest.php: -------------------------------------------------------------------------------- 1 | assertInstanceOf(Client::class, $crawler->getClient()); 26 | $this->assertEquals(0, $crawler->getLimit()); 27 | $this->assertInstanceOf(NullLogger::class, $crawler->getLogger()); 28 | $this->assertFalse($crawler->getStopOnError()); 29 | $this->assertCount(0, $crawler->getUrlsCrawled()); 30 | $this->assertCount(0, $crawler->getUrlsQueued()); 31 | $this->assertCount(0, $crawler->getUrlsRejected()); 32 | $this->assertCount(0, $crawler->getUrlsReturned()); 33 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers()); 34 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers()); 35 | } 36 | 37 | public function test_getters_setters() 38 | { 39 | $crawler = new Crawler(); 40 | 41 | $client = new GoutteClient(); 42 | $crawler->setClient($client); 43 | $this->assertEquals($client, $crawler->getClient()); 44 | 45 | $limit = 1; 46 | $crawler->setLimit($limit); 47 | $this->assertEquals($limit, $crawler->getLimit()); 48 | 49 | $crawler->setStopOnError(true); 50 | $this->assertTrue($crawler->getStopOnError()); 51 | 52 | $logger = m::mock(NullLogger::class); 53 | $crawler->setLogger($logger); 54 | $this->assertEquals($logger, $crawler->getLogger()); 55 | } 56 | 57 | public function test_clear_matchers() 58 | { 59 | $matcher = m::mock(UrlMatcherInterface::class); 60 | 61 | $crawler = new Crawler(); 62 | $crawler->addBlacklistUrlMatcher($matcher); 63 | $crawler->addWhitelistUrlMatcher($matcher); 64 | 65 | $this->assertCount(1, $crawler->getBlacklistUrlMatchers()); 66 | $this->assertCount(1, $crawler->getWhitelistUrlMatchers()); 67 | 68 | $crawler->clearBlacklistUrlMatchers(); 69 | $crawler->clearWhitelistUrlMatchers(); 70 | 71 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers()); 72 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers()); 73 | 74 | $crawler->setBlacklistUrlMatchers([$matcher]); 75 | $crawler->setWhitelistUrlMatchers([$matcher]); 76 | 77 | $this->assertCount(1, $crawler->getBlacklistUrlMatchers()); 78 | $this->assertCount(1, $crawler->getWhitelistUrlMatchers()); 79 | 80 | $crawler->clearBlacklistUrlMatchers(); 81 | $crawler->clearWhitelistUrlMatchers(); 82 | 83 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers()); 84 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers()); 85 | } 86 | 87 | public function test_clear_normalizers() 88 | { 89 | $normalizer = m::mock(Url\Normalizer\UrlNormalizerInterface::class); 90 | 91 | $crawler = new Crawler(); 92 | 93 | $crawler->addUrlNormalizer($normalizer); 94 | $this->assertCount(1, $crawler->getUrlNormalizers()); 95 | 96 | $crawler->clearUrlNormalizers(); 97 | $this->assertCount(0, $crawler->getUrlNormalizers()); 98 | 99 | $crawler->setUrlNormalizers([$normalizer]); 100 | $this->assertCount(1, $crawler->getUrlNormalizers()); 101 | 102 | $crawler->clearUrlNormalizers(); 103 | $this->assertCount(0, $crawler->getUrlNormalizers()); 104 | } 105 | 106 | public function test_crawl_single_page() 107 | { 108 | $domCrawler = new DomCrawler(''); 109 | 110 | $client = $this->getClient(); 111 | $client->shouldReceive('request')->once()->andReturn($domCrawler); 112 | $client->shouldReceive('getRequest')->once()->andReturn(null); 113 | 114 | $crawler = new Crawler($client); 115 | 116 | foreach ($crawler->crawl('http://my-test') as $page) { 117 | } 118 | 119 | $this->assertCount(1, $crawler->getUrlsCrawled()); 120 | } 121 | 122 | public 
function test_crawl_multiple_pages() 123 | { 124 | $crawler = new Crawler($this->getDummyClient()); 125 | 126 | foreach ($crawler->crawl('http://my-test') as $page) { 127 | } 128 | 129 | $this->assertCount(5, $crawler->getUrlsCrawled()); 130 | $this->assertCount(5, $crawler->getUrlsReturned()); 131 | $this->assertCount(2, $crawler->getUrlsRejected()); 132 | } 133 | 134 | public function test_crawl_with_limit() 135 | { 136 | $crawler = new Crawler($this->getDummyClient()); 137 | $crawler->setLimit(3); 138 | 139 | foreach ($crawler->crawl('http://my-test') as $page) { 140 | } 141 | 142 | $this->assertCount(3, $crawler->getUrlsCrawled()); 143 | $this->assertCount(2, $crawler->getUrlsQueued()); 144 | } 145 | 146 | public function test_crawl_with_whitelist() 147 | { 148 | $crawler = new Crawler($this->getDummyClient()); 149 | $crawler->addWhitelistUrlMatcher(new PathRegexUrlMatcher('~^/page_1.html~')); 150 | 151 | foreach ($crawler->crawl('http://my-test') as $page) { 152 | } 153 | 154 | $this->assertCount(5, $crawler->getUrlsCrawled()); 155 | $this->assertCount(1, $crawler->getUrlsReturned()); 156 | } 157 | 158 | public function test_crawl_with_blacklist() 159 | { 160 | $crawler = new Crawler($this->getDummyClient()); 161 | $crawler->addBlacklistUrlMatcher(new PathRegexUrlMatcher('~^/page_1.html~')); 162 | 163 | foreach ($crawler->crawl('http://my-test') as $page) { 164 | } 165 | 166 | $this->assertCount(5, $crawler->getUrlsCrawled()); 167 | $this->assertCount(4, $crawler->getUrlsReturned()); 168 | } 169 | 170 | public function test_crawl_with_normalizer() 171 | { 172 | $crawler = new Crawler($this->getDummyClient()); 173 | $crawler->addUrlNormalizer( 174 | new CallbackUrlNormalizer( 175 | function (Url $url) { 176 | if ($url->getPath() === '/page_4.html') { 177 | $url = $url->withPath('/page_3.html'); 178 | } 179 | 180 | return $url; 181 | } 182 | ) 183 | ); 184 | 185 | foreach ($crawler->crawl('http://my-test') as $page) { 186 | } 187 | 188 | $this->assertCount(4, $crawler->getUrlsCrawled()); 189 | } 190 | 191 | public function test_crawler_stop_on_error() 192 | { 193 | $client = $this->getClient(); 194 | 195 | $i = 0; 196 | $client->shouldReceive('request')->andReturnUsing( 197 | function () use (&$i) { 198 | $i++; 199 | switch ($i) { 200 | case 1: 201 | $html = 'Page 1Page 2'; 202 | break; 203 | case 2: 204 | throw new \Exception('foo'); 205 | case 3: 206 | $html = 'Page 4Invalid'; 207 | break; 208 | default: 209 | $html = 'Page 1External'; 210 | break; 211 | } 212 | 213 | return new DomCrawler($html, 'http://my-test'); 214 | } 215 | ); 216 | $client->shouldReceive('getRequest')->once()->andReturn(null); 217 | 218 | $crawler = new Crawler($client); 219 | $crawler->setStopOnError(true); 220 | 221 | foreach ($crawler->crawl('http://my-test') as $page) { 222 | } 223 | 224 | $this->assertCount(1, $crawler->getUrlsCrawled()); 225 | } 226 | 227 | public function test_crawler_exception_on_error() 228 | { 229 | $this->setExpectedException(RequestException::class); 230 | $client = $this->getClient(); 231 | 232 | $i = 0; 233 | $client->shouldReceive('request')->andReturnUsing( 234 | function () use (&$i) { 235 | $i++; 236 | switch ($i) { 237 | case 1: 238 | $html = 'Page 1Page 2'; 239 | break; 240 | case 2: 241 | throw new \Exception('foo'); 242 | case 3: 243 | $html = 'Page 4Invalid'; 244 | break; 245 | default: 246 | $html = 'Page 1External'; 247 | break; 248 | } 249 | 250 | return new DomCrawler($html, 'http://my-test'); 251 | } 252 | ); 253 | 254 | $crawler = new Crawler($client); 255 | 
$crawler->setExceptionOnError(true); 256 | 257 | foreach ($crawler->crawl('http://my-test') as $page) { 258 | } 259 | 260 | $this->assertCount(1, $crawler->getUrlsCrawled()); 261 | } 262 | 263 | public function test_crawler_does_not_stop_on_error() 264 | { 265 | $client = $this->getClient(); 266 | 267 | $i = 0; 268 | $client->shouldReceive('request')->andReturnUsing( 269 | function () use (&$i) { 270 | $i++; 271 | switch ($i) { 272 | case 1: 273 | $html = 'Page 1Page 2'; 274 | break; 275 | case 2: 276 | throw new \Exception('foo'); 277 | case 3: 278 | $html = 'Page 4Invalid'; 279 | break; 280 | default: 281 | $html = 'Page 1External'; 282 | break; 283 | } 284 | 285 | return new DomCrawler($html, 'http://my-test'); 286 | } 287 | ); 288 | $client->shouldReceive('getRequest')->andReturn(null); 289 | 290 | $crawler = new Crawler($client); 291 | 292 | foreach ($crawler->crawl('http://my-test') as $page) { 293 | } 294 | 295 | $this->assertCount(3, $crawler->getUrlsCrawled()); 296 | } 297 | 298 | /** 299 | * @return m\MockInterface 300 | */ 301 | protected function getDummyClient() 302 | { 303 | $client = $this->getClient(); 304 | 305 | $i = 0; 306 | $client->shouldReceive('request')->andReturnUsing( 307 | function () use (&$i) { 308 | $i++; 309 | switch ($i) { 310 | case 1: 311 | $html = 'Page 1Page 2'; 312 | break; 313 | case 2: 314 | $html = 'Page 3External'; 315 | break; 316 | case 3: 317 | $html = 'Page 4Invalid'; 318 | break; 319 | default: 320 | $html = 'Page 1External'; 321 | break; 322 | } 323 | 324 | return new DomCrawler($html, 'http://my-test'); 325 | } 326 | ); 327 | $client->shouldReceive('getRequest')->andReturn(null); 328 | 329 | return $client; 330 | } 331 | 332 | public function test_should_crawl_url() 333 | { 334 | $reset = get_non_public_method(Crawler::class, 'reset'); 335 | $shouldCrawlUrl = get_non_public_method(Crawler::class, 'shouldCrawlUrl'); 336 | 337 | $client = new Crawler(); 338 | 339 | $reset->invokeArgs($client, [Url::createFromString('http://my-website')]); 340 | 341 | // already in queue as it is the base url 342 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://my-website')])); 343 | 344 | // new page, should be crawled 345 | $this->assertTrue($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://my-website/foo')])); 346 | 347 | // different host, should not be crawled 348 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://other-host')])); 349 | 350 | // already rejected, should not be crawled 351 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://other-host')])); 352 | } 353 | 354 | public function test_add_rejected_url() 355 | { 356 | $addRejectedUrl = get_non_public_method(Crawler::class, 'addRejectedUrl'); 357 | 358 | $client = new Crawler(); 359 | $addRejectedUrl->invokeArgs($client, [Url::createFromString('http://my-website')]); 360 | $this->assertEquals(1, count($client->getUrlsRejected())); 361 | 362 | $addRejectedUrl->invokeArgs($client, ['http://my-website/foo']); 363 | $this->assertEquals(2, count($client->getUrlsRejected())); 364 | } 365 | 366 | public function test_add_rejected_url_invalid() 367 | { 368 | $this->setExpectedException(\InvalidArgumentException::class); 369 | 370 | $addRejectedUrl = get_non_public_method(Crawler::class, 'addRejectedUrl'); 371 | 372 | $client = new Crawler(); 373 | $addRejectedUrl->invokeArgs($client, [new \stdClass()]); 374 | $this->assertEquals(0, count($client->getUrlsRejected())); 375 | } 
376 | 377 | public function test_update_url() 378 | { 379 | $request = m::mock(Request::class); 380 | $request->shouldReceive('getUri')->andReturn('http://redirected-url'); 381 | 382 | $domCrawler = new DomCrawler(''); 383 | 384 | $client = $this->getClient(); 385 | $client->shouldReceive('request')->once()->andReturn($domCrawler); 386 | $client->shouldReceive('getRequest')->andReturn($request); 387 | 388 | $client = new Crawler($client); 389 | foreach ($client->crawl('http://original-url') as $page) { 390 | $this->assertEquals('http://redirected-url', $page->getUrl()->__toString()); 391 | } 392 | } 393 | 394 | protected function tearDown() 395 | { 396 | parent::tearDown(); 397 | 398 | m::close(); 399 | } 400 | 401 | /** 402 | * @return \Mockery\MockInterface 403 | */ 404 | protected function getClient() 405 | { 406 | $client = m::mock(CrawlerClientInterface::class); 407 | $client->shouldReceive('getResponse')->andReturnNull(); 408 | 409 | return $client; 410 | } 411 | } 412 | -------------------------------------------------------------------------------- /src/Crawler.php: -------------------------------------------------------------------------------- 1 | setClient($client); 95 | 96 | $this->urlsCrawled = new UrlCollection(); 97 | $this->urlsQueued = new UrlCollection(); 98 | $this->urlsReturned = new UrlCollection(); 99 | 100 | return $this; 101 | } 102 | 103 | /** 104 | * @param CrawlerClientInterface $client 105 | */ 106 | public function setClient(CrawlerClientInterface $client) 107 | { 108 | $this->client = $client; 109 | } 110 | 111 | /** 112 | * @return Client 113 | */ 114 | public function getClient() 115 | { 116 | return $this->client; 117 | } 118 | 119 | /** 120 | * @return int 121 | */ 122 | public function getLimit() 123 | { 124 | return $this->limit; 125 | } 126 | 127 | /** 128 | * @param int $limit 129 | * @return $this 130 | */ 131 | public function setLimit($limit) 132 | { 133 | $this->limit = $limit; 134 | 135 | return $this; 136 | } 137 | 138 | /** 139 | * @return boolean 140 | */ 141 | public function getStopOnError() 142 | { 143 | return $this->stopOnError; 144 | } 145 | 146 | /** 147 | * @param boolean $stopOnError 148 | * @return $this 149 | */ 150 | public function setStopOnError($stopOnError) 151 | { 152 | $this->stopOnError = $stopOnError; 153 | 154 | return $this; 155 | } 156 | 157 | /** 158 | * @return boolean 159 | */ 160 | public function getExceptionOnError() 161 | { 162 | return $this->exceptionOnError; 163 | } 164 | 165 | /** 166 | * @param boolean $exceptionOnError 167 | * @return $this 168 | */ 169 | public function setExceptionOnError($exceptionOnError) 170 | { 171 | $this->exceptionOnError = $exceptionOnError; 172 | 173 | return $this; 174 | } 175 | 176 | /** 177 | * @return array 178 | */ 179 | public function getUrlsCrawled() 180 | { 181 | return $this->urlsCrawled->toArray(); 182 | } 183 | 184 | /** 185 | * @return array 186 | */ 187 | public function getUrlsQueued() 188 | { 189 | return $this->urlsQueued->toArray(); 190 | } 191 | 192 | /** 193 | * @return array 194 | */ 195 | public function getUrlsReturned() 196 | { 197 | return $this->urlsReturned->toArray(); 198 | } 199 | 200 | /** 201 | * @return array 202 | */ 203 | public function getUrlsRejected() 204 | { 205 | return $this->urlsRejected; 206 | } 207 | 208 | /** 209 | * @param $urlMatchers 210 | * @return $this 211 | */ 212 | public function setWhitelistUrlMatchers(array $urlMatchers) 213 | { 214 | $this->clearWhitelistUrlMatchers(); 215 | foreach ($urlMatchers as $matcher) { 216 | 
$this->addWhitelistUrlMatcher($matcher); 217 | } 218 | 219 | return $this; 220 | } 221 | 222 | /** 223 | * @return Url\Matcher\UrlMatcherInterface[] 224 | */ 225 | public function getWhitelistUrlMatchers() 226 | { 227 | return $this->whitelistUrlMatchers; 228 | } 229 | 230 | /** 231 | * @param UrlMatcherInterface $urlMatcher 232 | * @return $this 233 | */ 234 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) 235 | { 236 | $this->whitelistUrlMatchers[] = $urlMatcher; 237 | 238 | return $this; 239 | } 240 | 241 | /** 242 | * @return $this 243 | */ 244 | public function clearWhitelistUrlMatchers() 245 | { 246 | $this->whitelistUrlMatchers = []; 247 | 248 | return $this; 249 | } 250 | 251 | /** 252 | * @param array $urlMatchers 253 | * @return $this 254 | */ 255 | public function setBlacklistUrlMatchers(array $urlMatchers) 256 | { 257 | $this->clearBlacklistUrlMatchers(); 258 | foreach ($urlMatchers as $matcher) { 259 | $this->addBlacklistUrlMatcher($matcher); 260 | } 261 | 262 | return $this; 263 | } 264 | 265 | /** 266 | * @return UrlMatcherInterface[] 267 | */ 268 | public function getBlacklistUrlMatchers() 269 | { 270 | return $this->blacklistUrlMatchers; 271 | } 272 | 273 | /** 274 | * @param UrlMatcherInterface $urlMatcher 275 | * @return $this 276 | */ 277 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) 278 | { 279 | $this->blacklistUrlMatchers[] = $urlMatcher; 280 | 281 | return $this; 282 | } 283 | 284 | /** 285 | * @return $this 286 | */ 287 | public function clearBlacklistUrlMatchers() 288 | { 289 | $this->blacklistUrlMatchers = []; 290 | 291 | return $this; 292 | } 293 | 294 | /** 295 | * @param array $normalizers 296 | * @return $this 297 | */ 298 | public function setUrlNormalizers(array $normalizers) 299 | { 300 | $this->clearUrlNormalizers(); 301 | 302 | foreach ($normalizers as $normalizer) { 303 | $this->addUrlNormalizer($normalizer); 304 | } 305 | 306 | return $this; 307 | } 308 | 309 | /** 310 | * @return UrlNormalizerInterface[] 311 | */ 312 | public function getUrlNormalizers() 313 | { 314 | return $this->urlNormalizers; 315 | } 316 | 317 | /** 318 | * @param UrlNormalizerInterface $normalizer 319 | * @return $this 320 | */ 321 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) 322 | { 323 | $this->urlNormalizers[] = $normalizer; 324 | 325 | return $this; 326 | } 327 | 328 | /** 329 | * @return $this 330 | */ 331 | public function clearUrlNormalizers() 332 | { 333 | $this->urlNormalizers = []; 334 | 335 | return $this; 336 | } 337 | 338 | /** 339 | * @return LoggerInterface 340 | */ 341 | public function getLogger() 342 | { 343 | if (is_null($this->logger)) { 344 | $this->logger = new NullLogger(); 345 | } 346 | 347 | return $this->logger; 348 | } 349 | 350 | /** 351 | * @param LoggerInterface $logger 352 | * @return $this 353 | */ 354 | public function setLogger(LoggerInterface $logger) 355 | { 356 | $this->logger = $logger; 357 | 358 | return $this; 359 | } 360 | 361 | /** 362 | * @param $url 363 | * @return Url 364 | * @throws \Exception 365 | */ 366 | protected function createHttpUrlString($url) 367 | { 368 | try { 369 | return $this->normalizeUrl(Url::createFromString($url)); 370 | } 371 | catch (\Exception $e) { 372 | $this->getLogger()->warning( 373 | sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage()) 374 | ); 375 | 376 | throw new UnsupportedUrlException($url); 377 | } 378 | } 379 | 380 | /** 381 | * @param Url $url 382 | */ 383 | protected function reset(Url $url) 
384 | { 385 | $this->baseUrl = $url; 386 | 387 | $this->urlsCrawled->reset(); 388 | $this->urlsQueued->reset(); 389 | $this->urlsReturned->reset(); 390 | $this->urlsRejected = []; 391 | 392 | $this->urlsQueued->push($url); 393 | } 394 | 395 | /** 396 | * @param string $url 397 | * @return \Generator|Page[] 398 | * @throws RequestException 399 | */ 400 | public function crawl($url) 401 | { 402 | $this->reset($this->createHttpUrlString($url)); 403 | 404 | while ($url = $this->urlsQueued->pop()) { 405 | 406 | try { 407 | $crawler = $this->requestPage($url); 408 | $url = $this->updateResolvedUrl($url); 409 | } catch (\Exception $e) { 410 | $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage())); 411 | 412 | if ($this->getStopOnError()) { 413 | return; 414 | } 415 | if ($this->getExceptionOnError()) { 416 | throw new RequestException($e->getMessage(), $e->getCode(), $e); 417 | } 418 | 419 | continue; 420 | } 421 | 422 | $this->urlsCrawled->push($url); 423 | $this->updateQueue($crawler); 424 | 425 | if ($this->shouldReturnUrl($url)) { 426 | $this->getLogger()->debug(sprintf('Returning url "%s"', $url)); 427 | $this->urlsReturned->push($url); 428 | 429 | yield new Page($url, $crawler, $this->client->getResponse()); 430 | } 431 | 432 | if ($this->isLimitReached()) { 433 | $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit)); 434 | 435 | return; 436 | } 437 | } 438 | } 439 | 440 | /** 441 | * @param Url $url 442 | * @return Url 443 | */ 444 | protected function updateResolvedUrl(Url $url) 445 | { 446 | $request = $this->client->getRequest(); 447 | if (!empty($request)) { 448 | $url = $this->createHttpUrlString($request->getUri()); 449 | } 450 | 451 | return $url; 452 | } 453 | 454 | /** 455 | * @param DomCrawler $crawler 456 | */ 457 | protected function updateQueue(DomCrawler $crawler) 458 | { 459 | foreach ($this->extractUrlsFromCrawler($crawler) as $url) { 460 | $this->getLogger()->debug(sprintf('Found url %s in page', $url)); 461 | try { 462 | $url = $this->createHttpUrlString($url); 463 | 464 | if ($this->shouldCrawlUrl($url)) { 465 | $this->urlsQueued->push($url); 466 | } 467 | } catch (\Exception $e) { 468 | $this->addRejectedUrl($url); 469 | } 470 | } 471 | } 472 | 473 | /** 474 | * @param Url $url 475 | * @return Url 476 | */ 477 | protected function normalizeUrl(Url $url) 478 | { 479 | foreach ($this->urlNormalizers as $normalizer) { 480 | $url = $normalizer->normalize($url); 481 | } 482 | 483 | return $url; 484 | } 485 | 486 | /** 487 | * @param Url $url 488 | * @return bool 489 | */ 490 | protected function shouldReturnUrl(Url $url) 491 | { 492 | if (!empty($this->whitelistUrlMatchers)) { 493 | if (!$this->isUrlWhitelisted($url)) { 494 | $this->getLogger()->info(sprintf('Skipping "%s" because it is not whitelisted', $url)); 495 | 496 | return false; 497 | } 498 | } 499 | 500 | if ($this->isUrlBlacklisted($url)) { 501 | $this->getLogger()->info(sprintf('Skipping "%s" because it is blacklisted', $url)); 502 | 503 | return false; 504 | } 505 | 506 | return true; 507 | } 508 | 509 | /** 510 | * @param Url $url 511 | * @return bool 512 | */ 513 | protected function isUrlWhitelisted(Url $url) 514 | { 515 | foreach ($this->whitelistUrlMatchers as $matcher) { 516 | if ($matcher->matches($url)) { 517 | return true; 518 | } 519 | } 520 | 521 | return false; 522 | } 523 | 524 | /** 525 | * @param Url $url 526 | * @return bool 527 | */ 528 | protected function isUrlBlacklisted(Url $url) 529 | { 530 | foreach ($this->blacklistUrlMatchers as 
$matcher) { 531 | if ($matcher->matches($url)) { 532 | return true; 533 | } 534 | } 535 | 536 | return false; 537 | } 538 | 539 | /** 540 | * @param Url $url 541 | * @return bool 542 | */ 543 | protected function shouldCrawlUrl(Url $url) 544 | { 545 | if ($this->urlsCrawled->contains($url) || $this->urlsQueued->contains($url)) { 546 | return false; 547 | } 548 | 549 | if (!$this->isUrlPartOfBaseUrl($url)) { 550 | $this->addRejectedUrl($url); 551 | 552 | return false; 553 | } 554 | 555 | return true; 556 | } 557 | 558 | /** 559 | * @param $url 560 | */ 561 | protected function addRejectedUrl($url) 562 | { 563 | if ($url instanceof Url) { 564 | $url = $url->__toString(); 565 | } 566 | if (!is_string($url)) { 567 | throw new \InvalidArgumentException('Url should be a string or an instance of '.Url::class); 568 | } 569 | 570 | $this->urlsRejected[$url] = $url; 571 | } 572 | 573 | /** 574 | * @param Url $url 575 | * @return bool 576 | */ 577 | protected function isUrlPartOfBaseUrl(Url $url) 578 | { 579 | $baseUrlString = (string)$this->baseUrl; 580 | $this->getLogger()->debug($baseUrlString.' - '.$url); 581 | if (strpos((string)$url, $baseUrlString) === false) { 582 | return false; 583 | } 584 | 585 | return true; 586 | } 587 | 588 | /** 589 | * @return bool 590 | */ 591 | protected function isLimitReached() 592 | { 593 | return (!empty($this->limit) && count($this->urlsReturned) === $this->limit); 594 | } 595 | 596 | /** 597 | * @param DomCrawler $crawler 598 | * @return array 599 | */ 600 | protected function extractUrlsFromCrawler(DomCrawler $crawler) 601 | { 602 | return $crawler->filter('a')->each( 603 | function (DomCrawler $node) { 604 | return $node->link()->getUri(); 605 | } 606 | ); 607 | } 608 | 609 | /** 610 | * @param Url $url 611 | * @return DomCrawler 612 | */ 613 | protected function requestPage(Url $url) 614 | { 615 | $this->getLogger()->info(sprintf('Crawling page %s', $url)); 616 | $crawler = $this->client->request('GET', (string)$url); 617 | $this->getLogger()->info(sprintf('Crawled page %s', $url)); 618 | 619 | return $crawler; 620 | } 621 | } 622 | --------------------------------------------------------------------------------