├── .gitignore
├── doc
├── usage.rst
└── example
│ ├── basic.php
│ ├── limit.php
│ ├── logger.php
│ ├── stop_on_error.php
│ ├── stop_on_exception.php
│ ├── whitelist.php
│ ├── blacklist.php
│ └── normalizer.php
├── src
├── Url.php
├── Exception
│ ├── CrawlerException.php
│ ├── RequestException.php
│ ├── EmptyCollectionException.php
│ └── UnsupportedUrlException.php
├── Client
│ ├── GoutteClient.php
│ ├── PrerenderIoClient.php
│ ├── CrawlerClientInterface.php
│ └── PrerenderClient.php
├── Url
│ ├── Matcher
│ │ ├── UrlMatcherInterface.php
│ │ ├── PathRegexUrlMatcher.php
│ │ └── CallbackUrlMatcher.php
│ ├── Normalizer
│ │ ├── UrlNormalizerInterface.php
│ │ ├── CallbackUrlNormalizer.php
│ │ └── RemoveQueryParameterUrlNormalizer.php
│ └── UrlCollection.php
├── Page.php
└── Crawler.php
├── .scrutinizer.yml
├── tests
├── src
│ ├── UrlTest.php
│ ├── PageTest.php
│ ├── Url
│ │ ├── Matcher
│ │ │ ├── CallbackUrlMatcherTest.php
│ │ │ └── PathRegexUrlMatcherTest.php
│ │ └── Normalizer
│ │ │ ├── CallbackUrlNormalizerTest.php
│ │ │ └── RemoveQueryParameterUrlNormalizerTest.php
│ ├── Client
│ │ ├── PrerenderIoClientTest.php
│ │ └── PrerenderClientTest.php
│ └── CrawlerTest.php
└── bootstrap.php
├── .travis.yml
├── phpunit.xml
├── CHANGELOG.md
├── LICENSE
├── composer.json
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /vendor/
2 | /composer.lock
3 |
--------------------------------------------------------------------------------
/doc/usage.rst:
--------------------------------------------------------------------------------
1 | Usage
2 | =====
3 |
4 | The examples in the ``example`` directory should be self-explanatory.
5 |
--------------------------------------------------------------------------------
/src/Url.php:
--------------------------------------------------------------------------------
1 | crawl('https://www.yourwebsite.com') as $page) {
9 | echo $page->getUrl() . PHP_EOL;
10 | }
11 |
--------------------------------------------------------------------------------
/doc/example/limit.php:
--------------------------------------------------------------------------------
1 | setLimit(10);
11 |
12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
13 | echo $page->getUrl() . PHP_EOL;
14 | }
15 |
--------------------------------------------------------------------------------
/doc/example/logger.php:
--------------------------------------------------------------------------------
1 | setLogger(new \Psr\Log\NullLogger());
11 |
12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
13 | echo $page->getUrl() . PHP_EOL;
14 | }
15 |
--------------------------------------------------------------------------------
/doc/example/stop_on_error.php:
--------------------------------------------------------------------------------
1 | setStopOnError(true);
11 |
12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
13 | echo $page->getUrl() . PHP_EOL;
14 | }
15 |
--------------------------------------------------------------------------------
/doc/example/stop_on_exception.php:
--------------------------------------------------------------------------------
1 | setExceptionOnError(true);
11 |
12 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
13 | echo $page->getUrl() . PHP_EOL;
14 | }
15 |
--------------------------------------------------------------------------------
/doc/example/whitelist.php:
--------------------------------------------------------------------------------
1 | addWhitelistUrlMatcher(new Matcher\PathRegexUrlMatcher('~^/foo~'));
12 |
13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
14 | echo $page->getUrl() . PHP_EOL;
15 | }
16 |
--------------------------------------------------------------------------------
/tests/src/UrlTest.php:
--------------------------------------------------------------------------------
1 | assertInstanceOf(Url::class, $url);
15 | $this->assertInstanceOf(Http::class, $url);
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/doc/example/blacklist.php:
--------------------------------------------------------------------------------
1 | addBlacklistUrlMatcher(new Matcher\PathRegexUrlMatcher('~^/foo~'));
12 |
13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
14 | echo $page->getUrl() . PHP_EOL;
15 | }
16 |
--------------------------------------------------------------------------------
/doc/example/normalizer.php:
--------------------------------------------------------------------------------
1 | addUrlNormalizer(new Normalizer\RemoveQueryParameterUrlNormalizer('q'));
12 |
13 | foreach($crawler->crawl('https://www.yourwebsite.com') as $page) {
14 | echo $page->getUrl() . PHP_EOL;
15 | }
16 |
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 | getMethod($methodName);
17 | $method->setAccessible(true);
18 |
19 | return $method;
20 | }
21 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: php
2 |
3 | php:
4 | - 5.5
5 | - 5.6
6 | - 7.0
7 | - 7.1
8 | - 7.2
9 |
10 | sudo: false
11 |
12 | cache:
13 | directories:
14 | - $HOME/.composer/cache/files
15 |
16 | before_install:
17 | - composer self-update
18 |
19 | install:
20 | - composer update --prefer-source $COMPOSER_FLAGS
21 |
22 | script:
23 | - if [ "$TRAVIS_PHP_VERSION" == "5.6" ]; then phpunit --coverage-clover=coverage.clover; else phpunit; fi;
24 |
25 | after_script:
26 | - if [ "$TRAVIS_PHP_VERSION" == "5.6" ]; then wget https://scrutinizer-ci.com/ocular.phar && php ocular.phar code-coverage:upload --format=php-clover coverage.clover; fi;
27 |
--------------------------------------------------------------------------------
/src/Url/Matcher/PathRegexUrlMatcher.php:
--------------------------------------------------------------------------------
1 | pattern = $pattern;
20 | }
21 |
22 | /**
23 | * @param Url $url URL whose path is tested against the configured pattern
24 | * @return bool true when the pattern matches that path
25 | */
26 | public function matches(Url $url)
27 | {
28 | return preg_match($this->pattern, $url->getPath()) === 1;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/Url/Matcher/CallbackUrlMatcher.php:
--------------------------------------------------------------------------------
1 | callback = $callback;
20 | }
21 |
22 | /**
23 | * @param Url $url URL handed to the configured callback
24 | * @return bool the callback's result, coerced to a boolean
25 | */
26 | public function matches(Url $url)
27 | {
28 | return (bool)call_user_func($this->callback, $url);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/Url/Normalizer/CallbackUrlNormalizer.php:
--------------------------------------------------------------------------------
1 | callback = $callback;
20 | }
21 |
22 | /**
23 | * @param Url $url URL handed to the configured callback
24 | * @return Url whatever the callback returns (not validated here)
25 | */
26 | public function normalize(Url $url)
27 | {
28 | return call_user_func_array($this->callback, [$url]);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/tests/src/PageTest.php:
--------------------------------------------------------------------------------
1 | assertEquals($url, $page->getUrl());
22 | $this->assertEquals($domCrawler, $page->getCrawler());
23 | $this->assertEquals($response, $page->getResponse());
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/tests/src/Url/Matcher/CallbackUrlMatcherTest.php:
--------------------------------------------------------------------------------
1 | __toString();
15 | };
16 |
17 | $url = m::mock(Url::class);
18 | $url->shouldReceive('__toString')->once();
19 |
20 | $callbackUrlMatcher = new CallbackUrlMatcher($callback);
21 | $callbackUrlMatcher->matches($url);
22 | }
23 |
24 | protected function tearDown()
25 | {
26 | parent::tearDown();
27 |
28 | m::close();
29 | }
30 | }
--------------------------------------------------------------------------------
/tests/src/Url/Normalizer/CallbackUrlNormalizerTest.php:
--------------------------------------------------------------------------------
1 | __toString();
15 | };
16 |
17 | $url = m::mock(Url::class);
18 | $url->shouldReceive('__toString')->once();
19 |
20 | $callbackUrlMatcher = new CallbackUrlNormalizer($callback);
21 | $callbackUrlMatcher->normalize($url);
22 | }
23 |
24 | protected function tearDown()
25 | {
26 | parent::tearDown();
27 |
28 | m::close();
29 | }
30 | }
--------------------------------------------------------------------------------
/src/Url/Normalizer/RemoveQueryParameterUrlNormalizer.php:
--------------------------------------------------------------------------------
1 | keys = $keys;
24 | }
25 |
26 | /**
27 | * @param Url $url URL whose query component is filtered
28 | * @return Url the URL returned by withQuery() for the filtered query
29 | */
30 | public function normalize(Url $url)
31 | {
32 | // without() drops the keys configured on this normalizer
33 | $remaining = $url->query->without($this->keys);
34 |
35 | return $url->withQuery((string)$remaining);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/tests/src/Url/Matcher/PathRegexUrlMatcherTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($callbackUrlMatcher->matches(Url::createFromString('http://my-project/foo')));
16 | $this->assertTrue($callbackUrlMatcher->matches(Url::createFromString('http://my-project/foo/bat')));
17 | $this->assertFalse($callbackUrlMatcher->matches(Url::createFromString('http://my-project/bar')));
18 | $this->assertFalse($callbackUrlMatcher->matches(Url::createFromString('http://my-project/bar/foo')));
19 | }
20 | }
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
14 |
15 |
16 | ./tests/src/
17 |
18 |
19 |
20 |
21 |
22 | ./
23 |
24 | ./tests
25 | ./vendor
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file.
3 | This project adheres to [Semantic Versioning](http://semver.org/).
4 |
5 | ## [2.0.0](https://github.com/mediamonks/crawler/compare/1.1.0...2.0.0) - 2017-10-19
6 | ### Changed
7 | - Use interface for crawler client to allow more flexible (decorated) clients
8 | - Redirects are returned as redirected URL instead of the original URL
9 | - Prerender.io uses https instead of http transport
10 |
11 | ## [1.1.0](https://github.com/mediamonks/crawler/compare/v1.0.1...1.1.0) - 2017-08-11
12 | ### Added
13 | - Make Response available in Page
14 |
15 | ### Removed
16 | - Support for hhvm
17 |
18 | ## [1.0.1](https://github.com/mediamonks/crawler/compare/v1.0.0...v1.0.1) - 2017-03-31
19 | ### Added
20 | - Tests
21 |
22 | ### Fixed
23 | - Add "psr/log" to required packages
24 |
25 | ## [1.0.0](https://github.com/mediamonks/crawler/tree/v1.0.0) - 2016-11-28
26 | ### Added
27 | - Initial version
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 MediaMonks
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/src/Client/PrerenderIoClientTest.php:
--------------------------------------------------------------------------------
1 | invokeArgs($client, [$websiteUrl]);
17 |
18 | $this->assertEquals(PrerenderIoClient::URL.$websiteUrl, $result);
19 | $this->assertEquals(
20 | PrerenderIoClient::USER_AGENT,
21 | $client->getServerParameter(PrerenderIoClient::HEADER_USER_AGENT)
22 | );
23 | $this->assertEquals($token, $client->getServerParameter(PrerenderIoClient::HEADER_TOKEN));
24 | }
25 |
26 | public function test_getRequest()
27 | {
28 | // no request has been made through this freshly constructed client
29 | $prerenderIoClient = new PrerenderIoClient('my-prerender.io-token');
30 | $this->assertNull($prerenderIoClient->getRequest());
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/Page.php:
--------------------------------------------------------------------------------
1 | url = $url;
33 | $this->crawler = $crawler;
34 | $this->response = $response;
35 | }
36 |
37 | /**
38 | * @return Url the URL assigned to this page
39 | */
40 | public function getUrl()
41 | {
42 | return $this->url;
43 | }
44 |
45 | /**
46 | * @return DomCrawler the DOM crawler assigned to this page
47 | */
48 | public function getCrawler()
49 | {
50 | return $this->crawler;
51 | }
52 |
53 | /**
54 | * @return Response the response assigned to this page
55 | */
56 | public function getResponse()
57 | {
58 | return $this->response;
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "mediamonks/crawler",
3 | "type": "library",
4 | "description": "Crawl your own website with various clients for SEO and indexing purposes. ",
5 | "keywords": [
6 | "crawler",
7 | "seo",
8 | "dom",
9 | "spider",
10 | "robot",
11 | "prerender",
12 | "prerender.io",
13 | "search",
14 | "index",
15 | "goutte",
16 | "domcrawler"
17 | ],
18 | "homepage": "https://www.mediamonks.com/",
19 | "license": "MIT",
20 | "authors": [
21 | {
22 | "name": "Robert Slootjes",
23 | "email": "robert@mediamonks.com",
24 | "homepage": "https://github.com/slootjes"
25 | }
26 | ],
27 | "require": {
28 | "php": "^5.5|^7.0",
29 | "symfony/dom-crawler": "^2.8|^3.0|^4.0",
30 | "fabpot/goutte": "^3.0",
31 | "league/uri": "^4.2",
32 | "psr/log": "^1.0"
33 | },
34 | "require-dev": {
35 | "codeclimate/php-test-reporter": "dev-master@dev",
36 | "phpunit/phpunit": "^4.8",
37 | "mockery/mockery": "^0.9.4",
38 | "monolog/monolog": "^1.21"
39 | },
40 | "autoload": {
41 | "psr-4": {
42 | "MediaMonks\\Crawler\\": "src/"
43 | }
44 | },
45 | "extra": {
46 | "branch-alias": {
47 | "dev-master": "1.0-dev"
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/Client/PrerenderIoClient.php:
--------------------------------------------------------------------------------
1 | token = $token;
31 |
32 | $server[self::HEADER_TOKEN] = $token;
33 | $server[self::HEADER_USER_AGENT] = self::USER_AGENT;
34 |
35 | parent::__construct(self::URL, $server, $history, $cookieJar);
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/tests/src/Client/PrerenderClientTest.php:
--------------------------------------------------------------------------------
1 | invokeArgs($client, [$websiteUrl]);
18 |
19 | $this->assertEquals($prerenderUrl.$websiteUrl, $result);
20 | }
21 |
22 | public function test_url_is_corrected()
23 | {
24 | $prerenderUrl = 'http://my-prerender-server/';
25 | $websiteUrl = 'http://my-website/';
26 |
27 | $request = new Request($prerenderUrl.$websiteUrl, 'GET');
28 | // the request property is not public, so it is injected through reflection
29 | $rp = new \ReflectionProperty(PrerenderClient::class, 'request');
30 | $rp->setAccessible(true);
31 |
32 | $client = new PrerenderClient($prerenderUrl);
33 | $rp->setValue($client, $request);
34 | // expected value first, so a failure message reports the two values the right way round
35 | $this->assertEquals($websiteUrl, $client->getRequest()->getUri());
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/Client/CrawlerClientInterface.php:
--------------------------------------------------------------------------------
1 | applyNormalization('http://my-project/?foo=bar', 'foo', 'http://my-project/');
14 | }
15 |
16 | public function test_query_parameters_are_removed()
17 | {
18 | $this->applyNormalization('http://my-project/?foo=bar&bar=baz', ['foo', 'bar'], 'http://my-project/');
19 | }
20 |
21 | public function test_query_parameters_are_not_removed()
22 | {
23 | $this->applyNormalization('http://my-project/?foo=bar&bar=baz', ['foo2'], 'http://my-project/?foo=bar&bar=baz');
24 | }
25 |
26 | /**
27 | * @param string $urlInput URL to normalize
28 | * @param string|array $removeKeys query parameter key(s) handed to the normalizer
29 | * @param string $urlExpectedOutput expected string form of the normalized URL
30 | */
31 | protected function applyNormalization($urlInput, $removeKeys, $urlExpectedOutput)
32 | {
33 | $normalizer = new RemoveQueryParameterUrlNormalizer($removeKeys);
34 | $normalized = $normalizer->normalize(Url::createFromString($urlInput));
35 |
36 | // the expectation is expressed on the URL's string form
37 | $this->assertEquals($urlExpectedOutput, $normalized->__toString());
38 | }
39 | }
--------------------------------------------------------------------------------
/src/Url/UrlCollection.php:
--------------------------------------------------------------------------------
1 | contains($url)) {
22 | $this->urls[$url->__toString()] = $url;
23 | }
24 | }
25 |
26 | /**
27 | * Removes the last URL from the collection and returns it.
28 | * @return Url|false the removed URL, or false when the collection is empty (nothing is thrown)
29 | */
30 | public function pop()
31 | {
32 | $url = array_pop($this->urls);
33 | if (empty($url)) {
34 | return false;
35 | }
36 |
37 | return $url;
38 | }
39 |
40 | /**
41 | * @param Url $url URL looked up by its string form
42 | *
43 | * @return bool true when an entry is stored under that string
44 | */
45 | public function contains(Url $url)
46 | {
47 | return isset($this->urls[$url->__toString()]);
48 | }
49 |
50 | /**
51 | * @return int number of URLs currently held in the collection
52 | */
53 | public function count()
54 | {
55 | return count($this->urls);
56 | }
57 |
58 | /**
59 | * @return void (the collection is emptied)
60 | */
61 | public function reset()
62 | {
63 | $this->urls = [];
64 | }
65 |
66 | /**
67 | * @return array list of the collected URLs converted to strings
68 | */
69 | public function toArray()
70 | {
71 | $toString = function ($url) {
72 | return $url->__toString();
73 | };
74 |
75 | // array_values() discards the keys that array_map() preserves
76 | return array_values(array_map($toString, $this->urls));
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/Client/PrerenderClient.php:
--------------------------------------------------------------------------------
1 | prerenderUrl = $prerenderUrl;
25 |
26 | parent::__construct($server, $history, $cookieJar);
27 | }
28 |
29 | /**
30 | * @param string $uri URI passed on to the parent implementation
31 | * @return string the parent's absolute URI, prefixed with the prerender URL
32 | */
33 | protected function getAbsoluteUri($uri)
34 | {
35 | return $this->prerenderUrl.parent::getAbsoluteUri($uri);
36 | }
37 |
38 | /**
39 | * @inheritdoc
40 | */
41 | public function getRequest()
42 | {
43 | $original = parent::getRequest();
44 | if (empty($original)) {
45 | return null; // the parent client has no request to expose
46 | }
47 | $uri = $this->correctUrl($original->getUri()); // URI with the prerender URL removed
48 | return new Request($uri, $original->getMethod(), $original->getParameters(), $original->getFiles(),
49 | $original->getCookies(), $original->getServer(), $original->getContent());
50 | }
51 |
52 | /**
53 | * @param string $url URI that may start with the prerender URL
54 | * @return string $url without that leading prefix; occurrences elsewhere in the URI are left intact
55 | */
56 | protected function correctUrl($url)
57 | {
58 | $prefixLength = strlen($this->prerenderUrl);
59 | return strncmp($url, $this->prerenderUrl, $prefixLength) === 0 ? (string)substr($url, $prefixLength) : $url;
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Build Status](https://travis-ci.org/mediamonks/crawler)
2 | [Scrutinizer Code Quality](https://scrutinizer-ci.com/g/mediamonks/crawler/?branch=master)
3 | [Code Coverage](https://scrutinizer-ci.com/g/mediamonks/crawler/?branch=master)
4 | [Total Downloads](https://packagist.org/packages/mediamonks/crawler)
5 | [Latest Stable Version](https://packagist.org/packages/mediamonks/crawler)
6 | [Latest Unstable Version](https://packagist.org/packages/mediamonks/crawler)
7 | [SensioLabs Insight](https://insight.sensiolabs.com/projects/2fd407ee-3228-46c1-9ebb-40745787d454)
8 | [License](https://packagist.org/packages/mediamonks/crawler)
9 |
10 | # MediaMonks Crawler
11 |
12 | This tool allows you to easily crawl a website and get a DOM object for every url that was found.
13 | We use this to crawl our own site pages, regardless of whether their content is rendered server-side and/or client-side, by using the Prerender.io client.
14 | The resulting data can be used for creating a full site search and/or improving SEO for single-page applications.
15 |
16 | ## Highlights
17 |
18 | - Ships with Prerender & Prerender.io clients, uses Goutte by default
19 | - Supports any Symfony BrowserKit client
20 | - Supports both whitelisting and blacklisting of urls
21 | - Supports url normalization, which allows you to prevent duplicates based on minor url differences
22 | - Implements the [PSR-3 Logger Interface](http://www.php-fig.org/psr/psr-3/)
23 |
24 | ## Documentation
25 |
26 | Documentation and examples can be found in the [/doc](/doc) folder.
27 |
28 | ## System Requirements
29 |
30 | To use this library you need:
31 |
32 | - **PHP >= 5.5.0**
33 |
34 |
35 |
36 | ## Install
37 |
38 | Install this package by using Composer.
39 |
40 | ```
41 | $ composer require mediamonks/crawler
42 | ```
43 |
44 | ## Security
45 |
46 | If you discover any security related issues, please email devmonk@mediamonks.com instead of using the issue tracker.
47 |
48 | ## License
49 |
50 | The MIT License (MIT). Please see [License File](LICENSE) for more information.
51 |
--------------------------------------------------------------------------------
/tests/src/CrawlerTest.php:
--------------------------------------------------------------------------------
1 | assertInstanceOf(Client::class, $crawler->getClient());
26 | $this->assertEquals(0, $crawler->getLimit());
27 | $this->assertInstanceOf(NullLogger::class, $crawler->getLogger());
28 | $this->assertFalse($crawler->getStopOnError());
29 | $this->assertCount(0, $crawler->getUrlsCrawled());
30 | $this->assertCount(0, $crawler->getUrlsQueued());
31 | $this->assertCount(0, $crawler->getUrlsRejected());
32 | $this->assertCount(0, $crawler->getUrlsReturned());
33 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers());
34 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers());
35 | }
36 |
37 | public function test_getters_setters()
38 | {
39 | $crawler = new Crawler();
40 |
41 | $client = new GoutteClient();
42 | $crawler->setClient($client);
43 | $this->assertEquals($client, $crawler->getClient());
44 |
45 | $limit = 1;
46 | $crawler->setLimit($limit);
47 | $this->assertEquals($limit, $crawler->getLimit());
48 |
49 | $crawler->setStopOnError(true);
50 | $this->assertTrue($crawler->getStopOnError());
51 |
52 | $logger = m::mock(NullLogger::class);
53 | $crawler->setLogger($logger);
54 | $this->assertEquals($logger, $crawler->getLogger());
55 | }
56 |
57 | public function test_clear_matchers()
58 | {
59 | $matcher = m::mock(UrlMatcherInterface::class);
60 |
61 | $crawler = new Crawler();
62 | $crawler->addBlacklistUrlMatcher($matcher);
63 | $crawler->addWhitelistUrlMatcher($matcher);
64 |
65 | $this->assertCount(1, $crawler->getBlacklistUrlMatchers());
66 | $this->assertCount(1, $crawler->getWhitelistUrlMatchers());
67 |
68 | $crawler->clearBlacklistUrlMatchers();
69 | $crawler->clearWhitelistUrlMatchers();
70 |
71 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers());
72 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers());
73 |
74 | $crawler->setBlacklistUrlMatchers([$matcher]);
75 | $crawler->setWhitelistUrlMatchers([$matcher]);
76 |
77 | $this->assertCount(1, $crawler->getBlacklistUrlMatchers());
78 | $this->assertCount(1, $crawler->getWhitelistUrlMatchers());
79 |
80 | $crawler->clearBlacklistUrlMatchers();
81 | $crawler->clearWhitelistUrlMatchers();
82 |
83 | $this->assertCount(0, $crawler->getBlacklistUrlMatchers());
84 | $this->assertCount(0, $crawler->getWhitelistUrlMatchers());
85 | }
86 |
87 | public function test_clear_normalizers()
88 | {
89 | $normalizer = m::mock(Url\Normalizer\UrlNormalizerInterface::class);
90 |
91 | $crawler = new Crawler();
92 |
93 | $crawler->addUrlNormalizer($normalizer);
94 | $this->assertCount(1, $crawler->getUrlNormalizers());
95 |
96 | $crawler->clearUrlNormalizers();
97 | $this->assertCount(0, $crawler->getUrlNormalizers());
98 |
99 | $crawler->setUrlNormalizers([$normalizer]);
100 | $this->assertCount(1, $crawler->getUrlNormalizers());
101 |
102 | $crawler->clearUrlNormalizers();
103 | $this->assertCount(0, $crawler->getUrlNormalizers());
104 | }
105 |
106 | public function test_crawl_single_page()
107 | {
108 | $domCrawler = new DomCrawler('');
109 |
110 | $client = $this->getClient();
111 | $client->shouldReceive('request')->once()->andReturn($domCrawler);
112 | $client->shouldReceive('getRequest')->once()->andReturn(null);
113 |
114 | $crawler = new Crawler($client);
115 |
116 | foreach ($crawler->crawl('http://my-test') as $page) {
117 | }
118 |
119 | $this->assertCount(1, $crawler->getUrlsCrawled());
120 | }
121 |
122 | public function test_crawl_multiple_pages()
123 | {
124 | $crawler = new Crawler($this->getDummyClient());
125 |
126 | foreach ($crawler->crawl('http://my-test') as $page) {
127 | }
128 |
129 | $this->assertCount(5, $crawler->getUrlsCrawled());
130 | $this->assertCount(5, $crawler->getUrlsReturned());
131 | $this->assertCount(2, $crawler->getUrlsRejected());
132 | }
133 |
134 | public function test_crawl_with_limit()
135 | {
136 | $crawler = new Crawler($this->getDummyClient());
137 | $crawler->setLimit(3);
138 |
139 | foreach ($crawler->crawl('http://my-test') as $page) {
140 | }
141 |
142 | $this->assertCount(3, $crawler->getUrlsCrawled());
143 | $this->assertCount(2, $crawler->getUrlsQueued());
144 | }
145 |
146 | public function test_crawl_with_whitelist()
147 | {
148 | $crawler = new Crawler($this->getDummyClient());
149 | $crawler->addWhitelistUrlMatcher(new PathRegexUrlMatcher('~^/page_1.html~'));
150 |
151 | foreach ($crawler->crawl('http://my-test') as $page) {
152 | }
153 |
154 | $this->assertCount(5, $crawler->getUrlsCrawled());
155 | $this->assertCount(1, $crawler->getUrlsReturned());
156 | }
157 |
158 | public function test_crawl_with_blacklist()
159 | {
160 | $crawler = new Crawler($this->getDummyClient());
161 | $crawler->addBlacklistUrlMatcher(new PathRegexUrlMatcher('~^/page_1.html~'));
162 |
163 | foreach ($crawler->crawl('http://my-test') as $page) {
164 | }
165 |
166 | $this->assertCount(5, $crawler->getUrlsCrawled());
167 | $this->assertCount(4, $crawler->getUrlsReturned());
168 | }
169 |
170 | public function test_crawl_with_normalizer()
171 | {
172 | $crawler = new Crawler($this->getDummyClient());
173 | $crawler->addUrlNormalizer(
174 | new CallbackUrlNormalizer(
175 | function (Url $url) {
176 | if ($url->getPath() === '/page_4.html') {
177 | $url = $url->withPath('/page_3.html');
178 | }
179 |
180 | return $url;
181 | }
182 | )
183 | );
184 |
185 | foreach ($crawler->crawl('http://my-test') as $page) {
186 | }
187 |
188 | $this->assertCount(4, $crawler->getUrlsCrawled());
189 | }
190 |
191 | public function test_crawler_stop_on_error()
192 | {
193 | $client = $this->getClient();
194 |
195 | $i = 0;
196 | $client->shouldReceive('request')->andReturnUsing(
197 | function () use (&$i) {
198 | $i++;
199 | switch ($i) {
200 | case 1:
201 | $html = '
Page 1Page 2';
202 | break;
203 | case 2:
204 | throw new \Exception('foo');
205 | case 3:
206 | $html = 'Page 4Invalid';
207 | break;
208 | default:
209 | $html = 'Page 1External';
210 | break;
211 | }
212 |
213 | return new DomCrawler($html, 'http://my-test');
214 | }
215 | );
216 | $client->shouldReceive('getRequest')->once()->andReturn(null);
217 |
218 | $crawler = new Crawler($client);
219 | $crawler->setStopOnError(true);
220 |
221 | foreach ($crawler->crawl('http://my-test') as $page) {
222 | }
223 |
224 | $this->assertCount(1, $crawler->getUrlsCrawled());
225 | }
226 |
227 | public function test_crawler_exception_on_error()
228 | {
229 | $this->setExpectedException(RequestException::class);
230 | $client = $this->getClient();
231 |
232 | $i = 0;
233 | $client->shouldReceive('request')->andReturnUsing(
234 | function () use (&$i) {
235 | $i++;
236 | switch ($i) {
237 | case 1:
238 | $html = 'Page 1Page 2';
239 | break;
240 | case 2:
241 | throw new \Exception('foo');
242 | case 3:
243 | $html = 'Page 4Invalid';
244 | break;
245 | default:
246 | $html = 'Page 1External';
247 | break;
248 | }
249 |
250 | return new DomCrawler($html, 'http://my-test');
251 | }
252 | );
253 |
254 | $crawler = new Crawler($client);
255 | $crawler->setExceptionOnError(true);
256 |
257 | foreach ($crawler->crawl('http://my-test') as $page) {
258 | }
259 |
260 | $this->assertCount(1, $crawler->getUrlsCrawled());
261 | }
262 |
263 | public function test_crawler_does_not_stop_on_error()
264 | {
265 | $client = $this->getClient();
266 |
267 | $i = 0;
268 | $client->shouldReceive('request')->andReturnUsing(
269 | function () use (&$i) {
270 | $i++;
271 | switch ($i) {
272 | case 1:
273 | $html = 'Page 1Page 2';
274 | break;
275 | case 2:
276 | throw new \Exception('foo');
277 | case 3:
278 | $html = 'Page 4Invalid';
279 | break;
280 | default:
281 | $html = 'Page 1External';
282 | break;
283 | }
284 |
285 | return new DomCrawler($html, 'http://my-test');
286 | }
287 | );
288 | $client->shouldReceive('getRequest')->andReturn(null);
289 |
290 | $crawler = new Crawler($client);
291 |
292 | foreach ($crawler->crawl('http://my-test') as $page) {
293 | }
294 |
295 | $this->assertCount(3, $crawler->getUrlsCrawled());
296 | }
297 |
298 | /**
299 | * @return m\MockInterface
300 | */
301 | protected function getDummyClient()
302 | {
303 | $client = $this->getClient();
304 |
305 | $i = 0;
306 | $client->shouldReceive('request')->andReturnUsing(
307 | function () use (&$i) {
308 | $i++;
309 | switch ($i) {
310 | case 1:
311 | $html = 'Page 1Page 2';
312 | break;
313 | case 2:
314 | $html = 'Page 3External';
315 | break;
316 | case 3:
317 | $html = 'Page 4Invalid';
318 | break;
319 | default:
320 | $html = 'Page 1External';
321 | break;
322 | }
323 |
324 | return new DomCrawler($html, 'http://my-test');
325 | }
326 | );
327 | $client->shouldReceive('getRequest')->andReturn(null);
328 |
329 | return $client;
330 | }
331 |
332 | public function test_should_crawl_url()
333 | {
334 | $reset = get_non_public_method(Crawler::class, 'reset');
335 | $shouldCrawlUrl = get_non_public_method(Crawler::class, 'shouldCrawlUrl');
336 |
337 | $client = new Crawler();
338 |
339 | $reset->invokeArgs($client, [Url::createFromString('http://my-website')]);
340 |
341 | // already in queue as it is the base url
342 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://my-website')]));
343 |
344 | // new page, should be crawled
345 | $this->assertTrue($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://my-website/foo')]));
346 |
347 | // different host, should not be crawled
348 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://other-host')]));
349 |
350 | // already rejected, should not be crawled
351 | $this->assertFalse($shouldCrawlUrl->invokeArgs($client, [Url::createFromString('http://other-host')]));
352 | }
353 |
354 | public function test_add_rejected_url()
355 | {
356 | $addRejectedUrl = get_non_public_method(Crawler::class, 'addRejectedUrl');
357 |
358 | $crawler = new Crawler();
359 | $addRejectedUrl->invokeArgs($crawler, [Url::createFromString('http://my-website')]);
360 | $this->assertCount(1, $crawler->getUrlsRejected());
361 | // a plain string is accepted as well as a Url instance
362 | $addRejectedUrl->invokeArgs($crawler, ['http://my-website/foo']);
363 | $this->assertCount(2, $crawler->getUrlsRejected());
364 | }
365 |
/**
 * addRejectedUrl() must refuse a value that is neither a string nor a Url,
 * and must leave the rejected list empty when it does so.
 */
public function test_add_rejected_url_invalid()
{
    $this->setExpectedException(\InvalidArgumentException::class);

    $addRejectedUrl = get_non_public_method(Crawler::class, 'addRejectedUrl');

    $client = new Crawler();

    try {
        $addRejectedUrl->invokeArgs($client, [new \stdClass()]);
    } catch (\InvalidArgumentException $e) {
        // Check the state here: an assertion placed after the throwing call
        // would never execute. assertEmpty accepts both an empty array and
        // an uninitialised (null) list.
        $this->assertEmpty($client->getUrlsRejected());

        // Rethrow so the expectation declared above is still satisfied.
        throw $e;
    }
}
376 |
/**
 * When the client reports a request whose URI differs from the url that was
 * asked for, the yielded page must carry the reported (redirected) url.
 */
public function test_update_url()
{
    $request = m::mock(Request::class);
    $request->shouldReceive('getUri')->andReturn('http://redirected-url');

    $domCrawler = new DomCrawler('');

    $client = $this->getClient();
    $client->shouldReceive('request')->once()->andReturn($domCrawler);
    $client->shouldReceive('getRequest')->andReturn($request);

    $crawler = new Crawler($client);

    // Materialise the generator so that an empty crawl fails the test instead
    // of silently skipping the assertion inside a loop body.
    $pages = iterator_to_array($crawler->crawl('http://original-url'), false);

    $this->assertCount(1, $pages);
    $this->assertEquals('http://redirected-url', $pages[0]->getUrl()->__toString());
}
393 |
/**
 * Runs the parent teardown, then calls m::close() after every test.
 */
protected function tearDown()
{
    parent::tearDown();
    m::close();
}
400 |
/**
 * Creates a mock of the crawler client interface whose getResponse()
 * returns null; tests add further expectations as needed.
 *
 * @return \Mockery\MockInterface
 */
protected function getClient()
{
    $mock = m::mock(CrawlerClientInterface::class);
    $mock->shouldReceive('getResponse')->andReturnNull();

    return $mock;
}
411 | }
412 |
--------------------------------------------------------------------------------
/src/Crawler.php:
--------------------------------------------------------------------------------
1 | setClient($client);
95 |
96 | $this->urlsCrawled = new UrlCollection();
97 | $this->urlsQueued = new UrlCollection();
98 | $this->urlsReturned = new UrlCollection();
99 |
100 | return $this;
101 | }
102 |
/**
 * Replaces the client used to perform page requests.
 *
 * Returns the crawler itself so the call can be chained, like the other
 * setters of this class.
 *
 * @param CrawlerClientInterface $client
 * @return $this
 */
public function setClient(CrawlerClientInterface $client)
{
    $this->client = $client;

    return $this;
}
110 |
/**
 * Returns the client currently used to perform page requests.
 *
 * @return CrawlerClientInterface
 */
public function getClient()
{
    return $this->client;
}
118 |
/**
 * Returns the configured crawl limit.
 *
 * @return int
 */
public function getLimit()
{
    return $this->limit;
}
126 |
/**
 * Stores the crawl limit. The value is stored as given, without validation.
 *
 * @param int $limit
 * @return $this
 */
public function setLimit($limit)
{
    $this->limit = $limit;

    return $this;
}
137 |
/**
 * Returns the stop-on-error flag.
 *
 * @return boolean
 */
public function getStopOnError()
{
    return $this->stopOnError;
}
145 |
/**
 * Stores the stop-on-error flag.
 *
 * @param boolean $stopOnError
 * @return $this
 */
public function setStopOnError($stopOnError)
{
    $this->stopOnError = $stopOnError;

    return $this;
}
156 |
/**
 * Returns the exception-on-error flag.
 *
 * @return boolean
 */
public function getExceptionOnError()
{
    return $this->exceptionOnError;
}
164 |
/**
 * Stores the exception-on-error flag.
 *
 * @param boolean $exceptionOnError
 * @return $this
 */
public function setExceptionOnError($exceptionOnError)
{
    $this->exceptionOnError = $exceptionOnError;

    return $this;
}
175 |
/**
 * Returns the crawled-url collection converted to an array.
 *
 * @return array
 */
public function getUrlsCrawled()
{
    return $this->urlsCrawled->toArray();
}
183 |
/**
 * Returns the queued-url collection converted to an array.
 *
 * @return array
 */
public function getUrlsQueued()
{
    return $this->urlsQueued->toArray();
}
191 |
/**
 * Returns the returned-url collection converted to an array.
 *
 * @return array
 */
public function getUrlsReturned()
{
    return $this->urlsReturned->toArray();
}
199 |
/**
 * Returns the list of rejected urls as stored on the crawler.
 *
 * @return array
 */
public function getUrlsRejected()
{
    return $this->urlsRejected;
}
207 |
/**
 * Replaces the whole whitelist: clears the current matchers, then adds the
 * given ones one by one in iteration order.
 *
 * @param UrlMatcherInterface[] $urlMatchers
 * @return $this
 */
public function setWhitelistUrlMatchers(array $urlMatchers)
{
    $this->clearWhitelistUrlMatchers();

    foreach ($urlMatchers as $urlMatcher) {
        $this->addWhitelistUrlMatcher($urlMatcher);
    }

    return $this;
}
221 |
/**
 * Returns the registered whitelist matchers.
 *
 * @return UrlMatcherInterface[]
 */
public function getWhitelistUrlMatchers()
{
    return $this->whitelistUrlMatchers;
}
229 |
/**
 * Appends one matcher to the whitelist.
 *
 * @param UrlMatcherInterface $urlMatcher
 * @return $this
 */
public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
{
    $this->whitelistUrlMatchers[] = $urlMatcher;

    return $this;
}
240 |
/**
 * Removes every whitelist matcher.
 *
 * @return $this
 */
public function clearWhitelistUrlMatchers()
{
    $this->whitelistUrlMatchers = [];

    return $this;
}
250 |
/**
 * Replaces the whole blacklist: clears the current matchers, then adds the
 * given ones one by one in iteration order.
 *
 * @param UrlMatcherInterface[] $urlMatchers
 * @return $this
 */
public function setBlacklistUrlMatchers(array $urlMatchers)
{
    $this->clearBlacklistUrlMatchers();

    foreach ($urlMatchers as $urlMatcher) {
        $this->addBlacklistUrlMatcher($urlMatcher);
    }

    return $this;
}
264 |
/**
 * Returns the registered blacklist matchers.
 *
 * @return UrlMatcherInterface[]
 */
public function getBlacklistUrlMatchers()
{
    return $this->blacklistUrlMatchers;
}
272 |
/**
 * Appends one matcher to the blacklist.
 *
 * @param UrlMatcherInterface $urlMatcher
 * @return $this
 */
public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
{
    $this->blacklistUrlMatchers[] = $urlMatcher;

    return $this;
}
283 |
/**
 * Removes every blacklist matcher.
 *
 * @return $this
 */
public function clearBlacklistUrlMatchers()
{
    $this->blacklistUrlMatchers = [];

    return $this;
}
293 |
/**
 * Replaces the whole normalizer chain: clears the current normalizers, then
 * adds the given ones one by one in iteration order.
 *
 * @param UrlNormalizerInterface[] $normalizers
 * @return $this
 */
public function setUrlNormalizers(array $normalizers)
{
    $this->clearUrlNormalizers();

    foreach ($normalizers as $urlNormalizer) {
        $this->addUrlNormalizer($urlNormalizer);
    }

    return $this;
}
308 |
/**
 * Returns the registered url normalizers.
 *
 * @return UrlNormalizerInterface[]
 */
public function getUrlNormalizers()
{
    return $this->urlNormalizers;
}
316 |
/**
 * Appends one normalizer to the end of the chain.
 *
 * @param UrlNormalizerInterface $normalizer
 * @return $this
 */
public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
{
    $this->urlNormalizers[] = $normalizer;

    return $this;
}
327 |
/**
 * Removes every url normalizer.
 *
 * @return $this
 */
public function clearUrlNormalizers()
{
    $this->urlNormalizers = [];

    return $this;
}
337 |
/**
 * Returns the logger, creating and remembering a NullLogger on first use
 * when none has been set.
 *
 * @return LoggerInterface
 */
public function getLogger()
{
    if ($this->logger === null) {
        $this->logger = new NullLogger();
    }

    return $this->logger;
}
349 |
/**
 * Sets the logger that receives the crawler's log messages.
 *
 * @param LoggerInterface $logger
 * @return $this
 */
public function setLogger(LoggerInterface $logger)
{
    $this->logger = $logger;

    return $this;
}
360 |
/**
 * Turns a raw url into a Url object and runs it through the normalizers.
 *
 * Despite its name this method returns a Url object, not a string. Any
 * exception raised while parsing or normalizing is logged as a warning and
 * replaced by an UnsupportedUrlException.
 *
 * @param string $url
 * @return Url
 * @throws UnsupportedUrlException
 */
protected function createHttpUrlString($url)
{
    try {
        $parsed = Url::createFromString($url);

        return $this->normalizeUrl($parsed);
    } catch (\Exception $exception) {
        $message = sprintf(
            'Url %s could not be converted to an object: %s',
            $url,
            $exception->getMessage()
        );
        $this->getLogger()->warning($message);

        throw new UnsupportedUrlException($url);
    }
}
379 |
/**
 * Prepares the crawler for a fresh run: remembers the base url, empties the
 * three url collections and the rejected list, then queues the base url.
 *
 * @param Url $url
 */
protected function reset(Url $url)
{
    $this->baseUrl = $url;

    $collections = [$this->urlsCrawled, $this->urlsQueued, $this->urlsReturned];
    foreach ($collections as $collection) {
        $collection->reset();
    }
    $this->urlsRejected = [];

    // The base url is the first (and so far only) entry in the queue.
    $this->urlsQueued->push($url);
}
394 |
/**
 * Crawls from the given url and lazily yields a Page for every crawled url
 * that passes the whitelist/blacklist filter.
 *
 * For each queued url: the page is requested, the url is replaced by the one
 * the client reports for its request (if any), the url is recorded as
 * crawled and the links of the page are fed into the queue. The generator
 * ends when the queue is empty or the limit is reached.
 *
 * On a request failure the error is logged; the crawl then stops silently if
 * stop-on-error is set, otherwise throws if exception-on-error is set,
 * otherwise moves on to the next queued url. Stop-on-error is checked first.
 *
 * NOTE(review): only the resolved url is recorded as crawled, not the url
 * that was originally requested - confirm that a link pointing back to a
 * redirecting url cannot be queued again.
 *
 * @param string $url
 * @return \Generator|Page[]
 * @throws RequestException
 */
public function crawl($url)
{
    $this->reset($this->createHttpUrlString($url));

    while ($requested = $this->urlsQueued->pop()) {
        try {
            $dom = $this->requestPage($requested);
            $resolved = $this->updateResolvedUrl($requested);
        } catch (\Exception $failure) {
            $this->getLogger()->error(
                sprintf('Error requesting page %s: %s', $requested, $failure->getMessage())
            );

            if ($this->getStopOnError()) {
                return;
            }

            if ($this->getExceptionOnError()) {
                throw new RequestException($failure->getMessage(), $failure->getCode(), $failure);
            }

            continue;
        }

        $this->urlsCrawled->push($resolved);
        $this->updateQueue($dom);

        if ($this->shouldReturnUrl($resolved)) {
            $this->getLogger()->debug(sprintf('Return url "%s"', $resolved));
            $this->urlsReturned->push($resolved);

            yield new Page($resolved, $dom, $this->client->getResponse());
        }

        // Checked on every iteration, whether or not a page was yielded.
        if ($this->isLimitReached()) {
            $this->getLogger()->info(sprintf('Crawl limit of %d was reach', $this->limit));

            return;
        }
    }
}
439 |
/**
 * Returns the url of the request the client reports, or the given url
 * unchanged when the client reports no request.
 *
 * @param Url $url
 * @return Url
 */
protected function updateResolvedUrl(Url $url)
{
    $request = $this->client->getRequest();

    if (empty($request)) {
        return $url;
    }

    return $this->createHttpUrlString($request->getUri());
}
453 |
/**
 * Queues every link of the page that should be crawled.
 *
 * Each extracted link is converted to a Url and pushed onto the queue when
 * shouldCrawlUrl() accepts it. If any step throws, the link is added to the
 * rejected list instead: as the raw string when the conversion failed, as
 * the converted Url otherwise.
 *
 * @param DomCrawler $crawler
 */
protected function updateQueue(DomCrawler $crawler)
{
    foreach ($this->extractUrlsFromCrawler($crawler) as $found) {
        $this->getLogger()->debug(sprintf('Found url %s in page', $found));

        // Holds the raw link until the conversion succeeds.
        $candidate = $found;

        try {
            $candidate = $this->createHttpUrlString($found);

            if ($this->shouldCrawlUrl($candidate)) {
                $this->urlsQueued->push($candidate);
            }
        } catch (\Exception $error) {
            $this->addRejectedUrl($candidate);
        }
    }
}
472 |
/**
 * Passes the url through every registered normalizer in order, each one
 * receiving the result of the previous one.
 *
 * @param Url $url
 * @return Url
 */
protected function normalizeUrl(Url $url)
{
    $normalized = $url;

    foreach ($this->urlNormalizers as $urlNormalizer) {
        $normalized = $urlNormalizer->normalize($normalized);
    }

    return $normalized;
}
485 |
/**
 * Decides whether a crawled url is handed back to the caller.
 *
 * A url is skipped when a whitelist exists and no whitelist matcher accepts
 * it, or when any blacklist matcher accepts it. Each skip is logged.
 *
 * @param Url $url
 * @return bool
 */
protected function shouldReturnUrl(Url $url)
{
    $hasWhitelist = !empty($this->whitelistUrlMatchers);

    if ($hasWhitelist && !$this->isUrlWhitelisted($url)) {
        $this->getLogger()->info(sprintf('Skipping "%s" because it is not whitelisted', $url));

        return false;
    }

    if ($this->isUrlBlacklisted($url)) {
        $this->getLogger()->info(sprintf('Skipping "%s" because it is blacklisted', $url));

        return false;
    }

    return true;
}
508 |
/**
 * Tells whether at least one whitelist matcher matches the url. Matchers are
 * asked in order and the search stops at the first match.
 *
 * @param Url $url
 * @return bool
 */
protected function isUrlWhitelisted(Url $url)
{
    $matched = false;

    foreach ($this->whitelistUrlMatchers as $urlMatcher) {
        if ($urlMatcher->matches($url)) {
            $matched = true;
            break;
        }
    }

    return $matched;
}
523 |
/**
 * Tells whether at least one blacklist matcher matches the url. Matchers are
 * asked in order and the search stops at the first match.
 *
 * @param Url $url
 * @return bool
 */
protected function isUrlBlacklisted(Url $url)
{
    $matched = false;

    foreach ($this->blacklistUrlMatchers as $urlMatcher) {
        if ($urlMatcher->matches($url)) {
            $matched = true;
            break;
        }
    }

    return $matched;
}
538 |
/**
 * Decides whether a discovered url should be added to the queue.
 *
 * Urls already crawled or already queued are refused without being recorded.
 * Urls outside the base url are refused and added to the rejected list.
 *
 * @param Url $url
 * @return bool
 */
protected function shouldCrawlUrl(Url $url)
{
    if ($this->urlsCrawled->contains($url)) {
        return false;
    }

    if ($this->urlsQueued->contains($url)) {
        return false;
    }

    if ($this->isUrlPartOfBaseUrl($url)) {
        return true;
    }

    $this->addRejectedUrl($url);

    return false;
}
557 |
/**
 * Records a url as rejected. The url string is used as both key and value,
 * so rejecting the same url twice keeps a single entry.
 *
 * @param Url|string $url
 * @throws \InvalidArgumentException when $url is neither a string nor a Url
 */
protected function addRejectedUrl($url)
{
    $key = ($url instanceof Url) ? (string) $url : $url;

    if (!is_string($key)) {
        throw new \InvalidArgumentException('Url should be a string or an instance of '.Url::class);
    }

    $this->urlsRejected[$key] = $key;
}
572 |
/**
 * Tells whether the url lies under the base url of the current crawl, i.e.
 * whether its string form starts with the string form of the base url.
 *
 * The base url must be a prefix: a url that merely contains the base url
 * somewhere else (for instance inside a query string on another host) is
 * not part of it.
 *
 * NOTE(review): this is a plain string-prefix test, so a host that only
 * begins with the base host (base "http://site", url "http://site.other")
 * still passes - confirm whether a host-aware comparison is wanted.
 *
 * @param Url $url
 * @return bool
 */
protected function isUrlPartOfBaseUrl(Url $url)
{
    $baseUrlString = (string) $this->baseUrl;
    $this->getLogger()->debug($baseUrlString.' - '.$url);

    // Position 0 means "starts with"; false or any later position is a miss.
    return strpos((string) $url, $baseUrlString) === 0;
}
587 |
/**
 * Tells whether the number of returned urls has reached the crawl limit.
 *
 * A limit that is not numeric, or not greater than zero, means "no limit".
 * The comparison is numeric and uses >=, so a limit given as a numeric
 * string, or lowered below the current count while the crawl is running,
 * still stops the crawl.
 *
 * NOTE(review): relies on count() working on the returned-url collection,
 * as the previous implementation did.
 *
 * @return bool
 */
protected function isLimitReached()
{
    if (!is_numeric($this->limit) || $this->limit <= 0) {
        return false;
    }

    return count($this->urlsReturned) >= $this->limit;
}
595 |
/**
 * Collects the URI of every "a" element found in the page.
 *
 * @param DomCrawler $crawler
 * @return array one URI per "a" element, as returned by the element's link
 */
protected function extractUrlsFromCrawler(DomCrawler $crawler)
{
    $anchors = $crawler->filter('a');

    $toUri = function (DomCrawler $node) {
        return $node->link()->getUri();
    };

    return $anchors->each($toUri);
}
608 |
/**
 * Sends a GET request for the url through the client and returns the
 * client's result, logging an info message before and after the request.
 *
 * @param Url $url
 * @return DomCrawler
 */
protected function requestPage(Url $url)
{
    $this->getLogger()->info(sprintf('Crawling page %s', $url));

    $dom = $this->client->request('GET', (string) $url);

    $this->getLogger()->info(sprintf('Crawled page %s', $url));

    return $dom;
}
621 | }
622 |
--------------------------------------------------------------------------------