├── src
    ├── Exceptions
    │   ├── UrlNotFoundByIndex.php
    │   ├── InvalidCrawlRequestHandler.php
    │   └── InvalidUrl.php
    ├── CrawlProfiles
    │   ├── CrawlProfile.php
    │   ├── CrawlAllUrls.php
    │   ├── CrawlInternalUrls.php
    │   └── CrawlSubdomains.php
    ├── UrlParsers
    │   ├── UrlParser.php
    │   ├── SitemapUrlParser.php
    │   └── LinkUrlParser.php
    ├── Url.php
    ├── CrawlQueues
    │   ├── CrawlQueue.php
    │   └── ArrayCrawlQueue.php
    ├── ResponseWithCachedBody.php
    ├── Handlers
    │   ├── CrawlRequestFailed.php
    │   └── CrawlRequestFulfilled.php
    ├── CrawlObservers
    │   ├── CrawlObserver.php
    │   └── CrawlObserverCollection.php
    ├── CrawlUrl.php
    ├── CrawlerRobots.php
    └── Crawler.php
├── phpunit.xml
├── LICENSE.md
├── composer.json
└── README.md


/src/Exceptions/UrlNotFoundByIndex.php:
--------------------------------------------------------------------------------
1 | <?php
2 | 
3 | namespace Spatie\Crawler\Exceptions;
4 | 
5 | use RuntimeException;
6 | 
7 | class UrlNotFoundByIndex extends RuntimeException {}
8 | 


--------------------------------------------------------------------------------
/src/CrawlProfiles/CrawlProfile.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlProfiles;
 4 | 
 5 | use Psr\Http\Message\UriInterface;
 6 | 
 7 | abstract class CrawlProfile
 8 | {
 9 |     abstract public function shouldCrawl(UriInterface $url): bool;
10 | }
11 | 


--------------------------------------------------------------------------------
/src/CrawlProfiles/CrawlAllUrls.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlProfiles;
 4 | 
 5 | use Psr\Http\Message\UriInterface;
 6 | 
 7 | class CrawlAllUrls extends CrawlProfile
 8 | {
 9 |     public function shouldCrawl(UriInterface $url): bool
10 |     {
11 |         return true;
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/src/UrlParsers/UrlParser.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\UrlParsers;
 4 | 
 5 | use Psr\Http\Message\UriInterface;
 6 | use Spatie\Crawler\Crawler;
 7 | 
 8 | interface UrlParser
 9 | {
10 |     public function __construct(Crawler $crawler);
11 | 
12 |     public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
13 | }
14 | 


--------------------------------------------------------------------------------
/src/Url.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler;
 4 | 
 5 | use GuzzleHttp\Psr7\Uri;
 6 | 
 7 | class Url extends Uri
 8 | {
 9 |     public function __construct(
10 |         protected string $link,
11 |         protected ?string $linkText,
12 |     ) {
13 |         parent::__construct($link);
14 |     }
15 | 
16 |     public function linkText(): ?string
17 |     {
18 |         return $this->linkText;
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/src/Exceptions/InvalidCrawlRequestHandler.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\Exceptions;
 4 | 
 5 | use RuntimeException;
 6 | 
 7 | class InvalidCrawlRequestHandler extends RuntimeException
 8 | {
 9 |     public static function doesNotExtendBaseClass(string $handlerClass, string $baseClass): static
10 |     {
11 |         return new static("`{$handlerClass} is not a valid handler class. A valid handler class should extend `{$baseClass}`.");
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 |          xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
 4 |          bootstrap="vendor/autoload.php"
 5 |          colors="true"
 6 | >
 7 |     <testsuites>
 8 |         <testsuite name="all">
 9 |             <directory>tests</directory>
10 |         </testsuite>
11 |     </testsuites>
12 |     <source>
13 |         <include>
14 |             <directory>src</directory>
15 |         </include>
16 |     </source>
17 | </phpunit>
18 | 


--------------------------------------------------------------------------------
/src/CrawlProfiles/CrawlInternalUrls.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlProfiles;
 4 | 
 5 | use GuzzleHttp\Psr7\Uri;
 6 | use Psr\Http\Message\UriInterface;
 7 | 
 8 | class CrawlInternalUrls extends CrawlProfile
 9 | {
10 |     protected mixed $baseUrl;
11 | 
12 |     public function __construct($baseUrl)
13 |     {
14 |         if (! $baseUrl instanceof UriInterface) {
15 |             $baseUrl = new Uri($baseUrl);
16 |         }
17 | 
18 |         $this->baseUrl = $baseUrl;
19 |     }
20 | 
21 |     public function shouldCrawl(UriInterface $url): bool
22 |     {
23 |         return $this->baseUrl->getHost() === $url->getHost();
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/src/Exceptions/InvalidUrl.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\Exceptions;
 4 | 
 5 | use Exception;
 6 | use Psr\Http\Message\UriInterface;
 7 | use Spatie\Crawler\CrawlUrl;
 8 | 
 9 | class InvalidUrl extends Exception
10 | {
11 |     public static function unexpectedType(mixed $url): static
12 |     {
13 |         $crawlUrlClass = CrawlUrl::class;
14 |         $uriInterfaceClass = UriInterface::class;
15 |         $givenUrlClass = is_object($url) ? get_class($url) : gettype($url);
16 | 
17 |         return new static("You passed an invalid url of type `{$givenUrlClass}`. This should be either a {$crawlUrlClass} or `{$uriInterfaceClass}`");
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/src/CrawlQueues/CrawlQueue.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlQueues;
 4 | 
 5 | use Psr\Http\Message\UriInterface;
 6 | use Spatie\Crawler\CrawlUrl;
 7 | 
 8 | interface CrawlQueue
 9 | {
10 |     public function add(CrawlUrl $url): self;
11 | 
12 |     public function has(CrawlUrl|UriInterface $crawlUrl): bool;
13 | 
14 |     public function hasPendingUrls(): bool;
15 | 
16 |     public function getUrlById($id): CrawlUrl;
17 | 
18 |     public function getPendingUrl(): ?CrawlUrl;
19 | 
20 |     public function hasAlreadyBeenProcessed(CrawlUrl $url): bool;
21 | 
22 |     public function markAsProcessed(CrawlUrl $crawlUrl): void;
23 | 
24 |     public function getProcessedUrlCount(): int;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/CrawlProfiles/CrawlSubdomains.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlProfiles;
 4 | 
 5 | use GuzzleHttp\Psr7\Uri;
 6 | use Psr\Http\Message\UriInterface;
 7 | 
 8 | class CrawlSubdomains extends CrawlProfile
 9 | {
10 |     protected mixed $baseUrl;
11 | 
12 |     public function __construct($baseUrl)
13 |     {
14 |         if (! $baseUrl instanceof UriInterface) {
15 |             $baseUrl = new Uri($baseUrl);
16 |         }
17 | 
18 |         $this->baseUrl = $baseUrl;
19 |     }
20 | 
21 |     public function shouldCrawl(UriInterface $url): bool
22 |     {
23 |         return $this->isSubdomainOfHost($url);
24 |     }
25 | 
26 |     public function isSubdomainOfHost(UriInterface $url): bool
27 |     {
28 |         return str_ends_with($url->getHost(), $this->baseUrl->getHost());
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/ResponseWithCachedBody.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler;
 4 | 
 5 | use GuzzleHttp\Psr7\Response;
 6 | use Psr\Http\Message\ResponseInterface;
 7 | 
 8 | class ResponseWithCachedBody extends Response
 9 | {
10 |     protected ?string $cachedBody = null;
11 | 
12 |     public static function fromGuzzlePsr7Response(ResponseInterface $response): static
13 |     {
14 |         return new static(
15 |             $response->getStatusCode(),
16 |             $response->getHeaders(),
17 |             $response->getBody(),
18 |             $response->getProtocolVersion(),
19 |             $response->getReasonPhrase()
20 |         );
21 |     }
22 | 
23 |     public function setCachedBody(?string $body = null): void
24 |     {
25 |         $this->cachedBody = $body;
26 |     }
27 | 
28 |     public function getCachedBody(): ?string
29 |     {
30 |         return $this->cachedBody;
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/Handlers/CrawlRequestFailed.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\Handlers;
 4 | 
 5 | use Exception;
 6 | use GuzzleHttp\Exception\ConnectException;
 7 | use GuzzleHttp\Exception\RequestException;
 8 | use Spatie\Crawler\Crawler;
 9 | 
10 | class CrawlRequestFailed
11 | {
12 |     public function __construct(protected Crawler $crawler)
13 |     {
14 |         //
15 |     }
16 | 
17 |     public function __invoke(Exception $exception, $index)
18 |     {
19 |         if ($exception instanceof ConnectException) {
20 |             $exception = new RequestException($exception->getMessage(), $exception->getRequest());
21 |         }
22 | 
23 |         if ($exception instanceof RequestException) {
24 |             $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);
25 | 
26 |             $this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception);
27 |         }
28 | 
29 |         usleep($this->crawler->getDelayBetweenRequests());
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) Spatie bvba <info@spatie.be>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/CrawlObservers/CrawlObserver.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlObservers;
 4 | 
 5 | use GuzzleHttp\Exception\RequestException;
 6 | use Psr\Http\Message\ResponseInterface;
 7 | use Psr\Http\Message\UriInterface;
 8 | 
 9 | abstract class CrawlObserver
10 | {
11 |     /*
12 |      * Called when the crawler will crawl the url.
13 |      */
14 |     public function willCrawl(UriInterface $url, ?string $linkText): void {}
15 | 
16 |     /*
17 |      * Called when the crawler has crawled the given url successfully.
18 |      */
19 |     public function crawled(
20 |         UriInterface $url,
21 |         ResponseInterface $response,
22 |         ?UriInterface $foundOnUrl = null,
23 |         ?string $linkText = null,
24 |     ): void {}
25 | 
26 |     /*
27 |      * Called when the crawler had a problem crawling the given url.
28 |      */
29 |     public function crawlFailed(
30 |         UriInterface $url,
31 |         RequestException $requestException,
32 |         ?UriInterface $foundOnUrl = null,
33 |         ?string $linkText = null,
34 |     ): void {}
35 | 
36 |     /*
37 |      * Called when the crawl has ended.
38 |      */
39 |     public function finishedCrawling(): void {}
40 | }
41 | 


--------------------------------------------------------------------------------
/src/CrawlUrl.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler;
 4 | 
 5 | use Psr\Http\Message\UriInterface;
 6 | 
 7 | class CrawlUrl
 8 | {
 9 |     public UriInterface $url;
10 | 
11 |     public ?UriInterface $foundOnUrl = null;
12 | 
13 |     public ?string $linkText = null;
14 | 
15 |     protected mixed $id = null;
16 | 
17 |     public static function create(
18 |         UriInterface $url,
19 |         ?UriInterface $foundOnUrl = null,
20 |         $id = null,
21 |         ?string $linkText = null,
22 |     ): static {
23 |         $static = new static($url, $foundOnUrl, linkText: $linkText);
24 | 
25 |         if ($id !== null) {
26 |             $static->setId($id);
27 |         }
28 | 
29 |         return $static;
30 |     }
31 | 
32 |     protected function __construct(UriInterface $url, $foundOnUrl = null, $linkText = null)
33 |     {
34 |         $this->url = $url;
35 | 
36 |         $this->foundOnUrl = $foundOnUrl;
37 | 
38 |         $this->linkText = $linkText;
39 | 
40 |         $this->id = null;
41 |     }
42 | 
43 |     public function getId(): mixed
44 |     {
45 |         return $this->id;
46 |     }
47 | 
48 |     public function setId($id): void
49 |     {
50 |         $this->id = $id;
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/CrawlerRobots.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler;
 4 | 
 5 | use Spatie\Robots\RobotsHeaders;
 6 | use Spatie\Robots\RobotsMeta;
 7 | 
 8 | class CrawlerRobots
 9 | {
10 |     protected RobotsHeaders $robotsHeaders;
11 | 
12 |     protected RobotsMeta $robotsMeta;
13 | 
14 |     protected bool $mustRespectRobots;
15 | 
16 |     public function __construct(array $headers, string $body, bool $mustRespectRobots)
17 |     {
18 |         $this->robotsHeaders = RobotsHeaders::create($headers);
19 | 
20 |         $this->robotsMeta = RobotsMeta::create($body);
21 | 
22 |         $this->mustRespectRobots = $mustRespectRobots;
23 |     }
24 | 
25 |     public function mayIndex(): bool
26 |     {
27 |         if (! $this->mustRespectRobots) {
28 |             return true;
29 |         }
30 | 
31 |         if (! $this->robotsHeaders->mayIndex()) {
32 |             return false;
33 |         }
34 | 
35 |         if (! $this->robotsMeta->mayIndex()) {
36 |             return false;
37 |         }
38 | 
39 |         return true;
40 |     }
41 | 
42 |     public function mayFollow(): bool
43 |     {
44 |         if (! $this->mustRespectRobots) {
45 |             return true;
46 |         }
47 | 
48 |         if (! $this->robotsHeaders->mayFollow()) {
49 |             return false;
50 |         }
51 | 
52 |         if (! $this->robotsMeta->mayFollow()) {
53 |             return false;
54 |         }
55 | 
56 |         return true;
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "spatie/crawler",
 3 |     "description": "Crawl all internal links found on a website",
 4 |     "keywords": [
 5 |         "spatie",
 6 |         "crawler",
 7 |         "link",
 8 |         "website"
 9 |     ],
10 |     "homepage": "https://github.com/spatie/crawler",
11 |     "license": "MIT",
12 |     "authors": [
13 |         {
14 |             "name": "Freek Van der Herten",
15 |             "email": "freek@spatie.be"
16 |         }
17 |     ],
18 |     "require": {
19 |         "php": "^8.2",
20 |         "guzzlehttp/guzzle": "^7.3",
21 |         "guzzlehttp/psr7": "^2.0",
22 |         "illuminate/collections": "^10.0|^11.0|^12.0",
23 |         "nicmart/tree": "^0.10",
24 |         "spatie/browsershot": "^5.0.5",
25 |         "spatie/robots-txt": "^2.0",
26 |         "symfony/dom-crawler": "^6.0|^7.0|^8.0"
27 |     },
28 |     "require-dev": {
29 |         "pestphp/pest": "^2.0|^3.0|^4.0",
30 |         "spatie/ray": "^1.37"
31 |     },
32 |     "config": {
33 |         "sort-packages": true,
34 |         "allow-plugins": {
35 |             "pestphp/pest-plugin": true,
36 |             "phpstan/extension-installer": true
37 |         }
38 |     },
39 |     "autoload": {
40 |         "psr-4": {
41 |             "Spatie\\Crawler\\": "src"
42 |         }
43 |     },
44 |     "autoload-dev": {
45 |         "psr-4": {
46 |             "Spatie\\Crawler\\Test\\": "tests"
47 |         }
48 |     },
49 |     "minimum-stability": "dev",
50 |     "prefer-stable": true
51 | }
52 | 


--------------------------------------------------------------------------------
/src/CrawlObservers/CrawlObserverCollection.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\CrawlObservers;
 4 | 
 5 | use ArrayAccess;
 6 | use GuzzleHttp\Exception\RequestException;
 7 | use Iterator;
 8 | use Psr\Http\Message\ResponseInterface;
 9 | use Spatie\Crawler\CrawlUrl;
10 | 
11 | class CrawlObserverCollection implements ArrayAccess, Iterator
12 | {
13 |     protected int $position;
14 | 
15 |     public function __construct(protected array $observers = [])
16 |     {
17 |         $this->position = 0;
18 |     }
19 | 
20 |     public function addObserver(CrawlObserver $observer): void
21 |     {
22 |         $this->observers[] = $observer;
23 |     }
24 | 
25 |     public function crawled(CrawlUrl $crawlUrl, ResponseInterface $response): void
26 |     {
27 |         foreach ($this->observers as $crawlObserver) {
28 |             $crawlObserver->crawled(
29 |                 $crawlUrl->url,
30 |                 $response,
31 |                 $crawlUrl->foundOnUrl,
32 |                 $crawlUrl->linkText,
33 |             );
34 |         }
35 |     }
36 | 
37 |     public function crawlFailed(CrawlUrl $crawlUrl, RequestException $exception): void
38 |     {
39 |         foreach ($this->observers as $crawlObserver) {
40 |             $crawlObserver->crawlFailed(
41 |                 $crawlUrl->url,
42 |                 $exception,
43 |                 $crawlUrl->foundOnUrl,
44 |                 $crawlUrl->linkText,
45 |             );
46 |         }
47 |     }
48 | 
49 |     public function current(): mixed
50 |     {
51 |         return $this->observers[$this->position];
52 |     }
53 | 
54 |     public function offsetGet(mixed $offset): mixed
55 |     {
56 |         return $this->observers[$offset] ?? null;
57 |     }
58 | 
59 |     public function offsetSet(mixed $offset, mixed $value): void
60 |     {
61 |         if (is_null($offset)) {
62 |             $this->observers[] = $value;
63 |         } else {
64 |             $this->observers[$offset] = $value;
65 |         }
66 |     }
67 | 
68 |     public function offsetExists(mixed $offset): bool
69 |     {
70 |         return isset($this->observers[$offset]);
71 |     }
72 | 
73 |     public function offsetUnset(mixed $offset): void
74 |     {
75 |         unset($this->observers[$offset]);
76 |     }
77 | 
78 |     public function next(): void
79 |     {
80 |         $this->position++;
81 |     }
82 | 
83 |     public function key(): mixed
84 |     {
85 |         return $this->position;
86 |     }
87 | 
88 |     public function valid(): bool
89 |     {
90 |         return isset($this->observers[$this->position]);
91 |     }
92 | 
93 |     public function rewind(): void
94 |     {
95 |         $this->position = 0;
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/CrawlQueues/ArrayCrawlQueue.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spatie\Crawler\CrawlQueues;
  4 | 
  5 | use Psr\Http\Message\UriInterface;
  6 | use Spatie\Crawler\CrawlUrl;
  7 | use Spatie\Crawler\Exceptions\InvalidUrl;
  8 | use Spatie\Crawler\Exceptions\UrlNotFoundByIndex;
  9 | 
 10 | class ArrayCrawlQueue implements CrawlQueue
 11 | {
 12 |     /**
 13 |      * All known URLs, indexed by URL string.
 14 |      *
 15 |      * @var CrawlUrl[]
 16 |      */
 17 |     protected array $urls = [];
 18 | 
 19 |     /**
 20 |      * Pending URLs, indexed by URL string.
 21 |      *
 22 |      * @var CrawlUrl[]
 23 |      */
 24 |     protected array $pendingUrls = [];
 25 | 
 26 |     public function add(CrawlUrl $crawlUrl): CrawlQueue
 27 |     {
 28 |         $urlString = (string) $crawlUrl->url;
 29 | 
 30 |         if (! isset($this->urls[$urlString])) {
 31 |             $crawlUrl->setId($urlString);
 32 | 
 33 |             $this->urls[$urlString] = $crawlUrl;
 34 |             $this->pendingUrls[$urlString] = $crawlUrl;
 35 |         }
 36 | 
 37 |         return $this;
 38 |     }
 39 | 
 40 |     public function hasPendingUrls(): bool
 41 |     {
 42 |         return (bool) $this->pendingUrls;
 43 |     }
 44 | 
 45 |     public function getUrlById($id): CrawlUrl
 46 |     {
 47 |         if (! isset($this->urls[$id])) {
 48 |             throw new UrlNotFoundByIndex("Crawl url {$id} not found in collection.");
 49 |         }
 50 | 
 51 |         return $this->urls[$id];
 52 |     }
 53 | 
 54 |     public function hasAlreadyBeenProcessed(CrawlUrl $crawlUrl): bool
 55 |     {
 56 |         $urlString = (string) $crawlUrl->url;
 57 | 
 58 |         if (isset($this->pendingUrls[$urlString])) {
 59 |             return false;
 60 |         }
 61 | 
 62 |         if (isset($this->urls[$urlString])) {
 63 |             return true;
 64 |         }
 65 | 
 66 |         return false;
 67 |     }
 68 | 
 69 |     public function markAsProcessed(CrawlUrl $crawlUrl): void
 70 |     {
 71 |         $urlString = (string) $crawlUrl->url;
 72 | 
 73 |         unset($this->pendingUrls[$urlString]);
 74 |     }
 75 | 
 76 |     public function getProcessedUrlCount(): int
 77 |     {
 78 |         return count($this->urls) - count($this->pendingUrls);
 79 |     }
 80 | 
 81 |     public function has(CrawlUrl|UriInterface $crawlUrl): bool
 82 |     {
 83 |         if ($crawlUrl instanceof CrawlUrl) {
 84 |             $urlString = (string) $crawlUrl->url;
 85 |         } elseif ($crawlUrl instanceof UriInterface) {
 86 |             $urlString = (string) $crawlUrl;
 87 |         } else {
 88 |             throw InvalidUrl::unexpectedType($crawlUrl);
 89 |         }
 90 | 
 91 |         return isset($this->urls[$urlString]);
 92 |     }
 93 | 
 94 |     public function getPendingUrl(): ?CrawlUrl
 95 |     {
 96 |         foreach ($this->pendingUrls as $pendingUrl) {
 97 |             return $pendingUrl;
 98 |         }
 99 | 
100 |         return null;
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/src/UrlParsers/SitemapUrlParser.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spatie\Crawler\UrlParsers;
 4 | 
 5 | use Illuminate\Support\Collection;
 6 | use InvalidArgumentException;
 7 | use Psr\Http\Message\UriInterface;
 8 | use Spatie\Crawler\Crawler;
 9 | use Spatie\Crawler\CrawlUrl;
10 | use Spatie\Crawler\Url;
11 | use Symfony\Component\DomCrawler\Crawler as DomCrawler;
12 | use Tree\Node\Node;
13 | 
14 | class SitemapUrlParser implements UrlParser
15 | {
16 |     protected Crawler $crawler;
17 | 
18 |     public function __construct(Crawler $crawler)
19 |     {
20 |         $this->crawler = $crawler;
21 |     }
22 | 
23 |     public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
24 |     {
25 |         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
26 | 
27 |         collect($allLinks)
28 |             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
29 |             ->map(fn (Url $url) => $this->normalizeUrl($url))
30 |             ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
31 |                 if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
32 |                     return false;
33 |                 }
34 | 
35 |                 return $this->shouldCrawl($node);
36 |             })
37 |             ->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
38 |             ->each(function (Url $url) use ($foundOnUrl) {
39 |                 $crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());
40 | 
41 |                 $this->crawler->addToCrawlQueue($crawlUrl);
42 |             });
43 |     }
44 | 
45 |     protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
46 |     {
47 |         $domCrawler = new DomCrawler($html, $foundOnUrl);
48 | 
49 |         return collect($domCrawler->filterXPath('//loc')
50 |             ->each(function (DomCrawler $node) {
51 |                 try {
52 |                     $linkText = $node->text();
53 | 
54 |                     if ($linkText) {
55 |                         $linkText = substr($linkText, 0, 4000);
56 |                     }
57 | 
58 |                     return new Url($linkText, $linkText);
59 |                 } catch (InvalidArgumentException $exception) {
60 |                     return null;
61 |                 }
62 |             }));
63 |     }
64 | 
65 |     protected function hasCrawlableScheme(UriInterface $uri): bool
66 |     {
67 |         return in_array($uri->getScheme(), ['http', 'https']);
68 |     }
69 | 
70 |     protected function normalizeUrl(UriInterface $url): UriInterface
71 |     {
72 |         return $url->withFragment('');
73 |     }
74 | 
75 |     protected function shouldCrawl(Node $node): bool
76 |     {
77 |         $mustRespectRobots = $this->crawler->mustRespectRobots();
78 |         $robotsTxt = $this->crawler->getRobotsTxt();
79 | 
80 |         if ($mustRespectRobots && $robotsTxt !== null) {
81 |             $isAllowed = $robotsTxt->allows($node->getValue(), $this->crawler->getUserAgent());
82 |             if (! $isAllowed) {
83 |                 return false;
84 |             }
85 |         }
86 | 
87 |         $maximumDepth = $this->crawler->getMaximumDepth();
88 | 
89 |         if (is_null($maximumDepth)) {
90 |             return true;
91 |         }
92 | 
93 |         return $node->getDepth() <= $maximumDepth;
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/src/UrlParsers/LinkUrlParser.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spatie\Crawler\UrlParsers;
  4 | 
  5 | use Illuminate\Support\Collection;
  6 | use InvalidArgumentException;
  7 | use Psr\Http\Message\UriInterface;
  8 | use Spatie\Crawler\Crawler;
  9 | use Spatie\Crawler\CrawlUrl;
 10 | use Spatie\Crawler\Url;
 11 | use Symfony\Component\DomCrawler\Crawler as DomCrawler;
 12 | use Symfony\Component\DomCrawler\Link;
 13 | use Tree\Node\Node;
 14 | 
 15 | class LinkUrlParser implements UrlParser
 16 | {
 17 |     protected Crawler $crawler;
 18 | 
 19 |     public function __construct(Crawler $crawler)
 20 |     {
 21 |         $this->crawler = $crawler;
 22 |     }
 23 | 
 24 |     public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
 25 |     {
 26 |         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
 27 | 
 28 |         collect($allLinks)
 29 |             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
 30 |             ->map(fn (Url $url) => $this->normalizeUrl($url))
 31 |             ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
 32 |                 if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
 33 |                     return false;
 34 |                 }
 35 | 
 36 |                 return $this->shouldCrawl($node);
 37 |             })
 38 |             ->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
 39 |             ->each(function (Url $url) use ($foundOnUrl) {
 40 |                 $crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());
 41 | 
 42 |                 $this->crawler->addToCrawlQueue($crawlUrl);
 43 |             });
 44 |     }
 45 | 
 46 |     protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
 47 |     {
 48 |         $domCrawler = new DomCrawler($html, $foundOnUrl);
 49 | 
 50 |         return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links())
 51 |             ->reject(function (Link $link) {
 52 |                 if ($this->isInvalidHrefNode($link)) {
 53 |                     return true;
 54 |                 }
 55 | 
 56 |                 if ($this->crawler->mustRejectNofollowLinks() && str_contains($link->getNode()->getAttribute('rel'), 'nofollow')) {
 57 |                     return true;
 58 |                 }
 59 | 
 60 |                 return false;
 61 |             })
 62 |             ->map(function (Link $link) {
 63 |                 try {
 64 |                     $linkText = $link->getNode()->textContent;
 65 | 
 66 |                     if ($linkText) {
 67 |                         $linkText = substr($linkText, 0, 4000);
 68 |                     }
 69 | 
 70 |                     return new Url($link->getUri(), $linkText);
 71 |                 } catch (InvalidArgumentException $exception) {
 72 |                     return null;
 73 |                 }
 74 |             })
 75 |             ->filter();
 76 |     }
 77 | 
 78 |     protected function hasCrawlableScheme(UriInterface $uri): bool
 79 |     {
 80 |         return in_array($uri->getScheme(), ['http', 'https']);
 81 |     }
 82 | 
 83 |     protected function normalizeUrl(UriInterface $url): UriInterface
 84 |     {
 85 |         return $url->withFragment('');
 86 |     }
 87 | 
 88 |     protected function shouldCrawl(Node $node): bool
 89 |     {
 90 |         $mustRespectRobots = $this->crawler->mustRespectRobots();
 91 |         $robotsTxt = $this->crawler->getRobotsTxt();
 92 | 
 93 |         if ($mustRespectRobots && $robotsTxt !== null) {
 94 |             $isAllowed = $robotsTxt->allows($node->getValue(), $this->crawler->getUserAgent());
 95 |             if (! $isAllowed) {
 96 |                 return false;
 97 |             }
 98 |         }
 99 | 
100 |         $maximumDepth = $this->crawler->getMaximumDepth();
101 | 
102 |         if (is_null($maximumDepth)) {
103 |             return true;
104 |         }
105 | 
106 |         return $node->getDepth() <= $maximumDepth;
107 |     }
108 | 
109 |     protected function isInvalidHrefNode(Link $link): bool
110 |     {
111 |         if ($link->getNode()->nodeName !== 'a') {
112 |             return false;
113 |         }
114 | 
115 |         if ($link->getNode()->nextSibling !== null) {
116 |             return false;
117 |         }
118 | 
119 |         if ($link->getNode()->childNodes->length !== 0) {
120 |             return false;
121 |         }
122 | 
123 |         return true;
124 |     }
125 | }
126 | 


--------------------------------------------------------------------------------
/src/Handlers/CrawlRequestFulfilled.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spatie\Crawler\Handlers;
  4 | 
  5 | use Exception;
  6 | use GuzzleHttp\Exception\RequestException;
  7 | use GuzzleHttp\Psr7\Request;
  8 | use GuzzleHttp\Psr7\Uri;
  9 | use GuzzleHttp\Psr7\Utils;
 10 | use GuzzleHttp\RedirectMiddleware;
 11 | use Psr\Http\Message\ResponseInterface;
 12 | use Psr\Http\Message\StreamInterface;
 13 | use Psr\Http\Message\UriInterface;
 14 | use Spatie\Crawler\Crawler;
 15 | use Spatie\Crawler\CrawlerRobots;
 16 | use Spatie\Crawler\CrawlProfiles\CrawlSubdomains;
 17 | use Spatie\Crawler\CrawlUrl;
 18 | use Spatie\Crawler\ResponseWithCachedBody;
 19 | use Spatie\Crawler\UrlParsers\UrlParser;
 20 | use Symfony\Component\Process\Exception\ProcessFailedException;
 21 | 
 22 | class CrawlRequestFulfilled
 23 | {
 24 |     protected UrlParser $urlParser;
 25 | 
 26 |     public function __construct(protected Crawler $crawler)
 27 |     {
 28 |         $urlParserClass = $this->crawler->getUrlParserClass();
 29 |         $this->urlParser = new $urlParserClass($this->crawler);
 30 |     }
 31 | 
 32 |     public function __invoke(ResponseInterface $response, $index)
 33 |     {
 34 |         $body = $this->getBody($response);
 35 |         if (empty($body)) {
 36 |             usleep($this->crawler->getDelayBetweenRequests());
 37 | 
 38 |             return;
 39 |         }
 40 | 
 41 |         $robots = new CrawlerRobots(
 42 |             $response->getHeaders(),
 43 |             $body,
 44 |             $this->crawler->mustRespectRobots()
 45 |         );
 46 | 
 47 |         $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);
 48 | 
 49 |         if ($this->crawler->mayExecuteJavaScript()) {
 50 |             try {
 51 |                 $body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);
 52 |             } catch (ProcessFailedException $exception) {
 53 |                 $request = new Request('GET', $crawlUrl->url);
 54 |                 $exception = new RequestException($exception->getMessage(), $request);
 55 |                 $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);
 56 | 
 57 |                 $this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception);
 58 | 
 59 |                 usleep($this->crawler->getDelayBetweenRequests());
 60 | 
 61 |                 return;
 62 |             }
 63 | 
 64 |             $response = $response->withBody(Utils::streamFor($body));
 65 |         }
 66 | 
 67 |         $responseWithCachedBody = ResponseWithCachedBody::fromGuzzlePsr7Response($response);
 68 |         $responseWithCachedBody->setCachedBody($body);
 69 | 
 70 |         if ($robots->mayIndex()) {
 71 |             $this->handleCrawled($responseWithCachedBody, $crawlUrl);
 72 |         }
 73 | 
 74 |         if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
 75 |             if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
 76 |                 return;
 77 |             }
 78 |         }
 79 | 
 80 |         if (! $robots->mayFollow()) {
 81 |             return;
 82 |         }
 83 | 
 84 |         $baseUrl = $this->getBaseUrl($response, $crawlUrl);
 85 |         $originalUrl = $crawlUrl->url;
 86 | 
 87 |         $this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);
 88 | 
 89 |         usleep($this->crawler->getDelayBetweenRequests());
 90 |     }
 91 | 
 92 |     protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl): UriInterface
 93 |     {
 94 |         $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);
 95 | 
 96 |         if (empty($redirectHistory)) {
 97 |             return $crawlUrl->url;
 98 |         }
 99 | 
100 |         return new Uri(end($redirectHistory));
101 |     }
102 | 
103 |     protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl): void
104 |     {
105 |         $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
106 |     }
107 | 
108 |     protected function getBody(ResponseInterface $response): string
109 |     {
110 |         $contentType = $response->getHeaderLine('Content-Type');
111 | 
112 |         if (! $this->isMimetypeAllowedToParse($contentType)) {
113 |             return '';
114 |         }
115 | 
116 |         return $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
117 |     }
118 | 
119 |     protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
120 |     {
121 |         if ($bodyStream->isSeekable()) {
122 |             $bodyStream->rewind();
123 |         }
124 | 
125 |         $body = '';
126 | 
127 |         $chunksToRead = $readMaximumBytes < 512 ? $readMaximumBytes : 512;
128 | 
129 |         for ($bytesRead = 0; $bytesRead < $readMaximumBytes; $bytesRead += $chunksToRead) {
130 |             try {
131 |                 $newDataRead = $bodyStream->read($chunksToRead);
132 |             } catch (Exception $exception) {
133 |                 $newDataRead = null;
134 |             }
135 | 
136 |             if (! $newDataRead) {
137 |                 break;
138 |             }
139 | 
140 |             $body .= $newDataRead;
141 |         }
142 | 
143 |         return $body;
144 |     }
145 | 
146 |     protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
147 |     {
148 |         $browsershot = $this->crawler->getBrowsershot();
149 | 
150 |         $html = $browsershot->setUrl((string) $url)->bodyHtml();
151 | 
152 |         return html_entity_decode($html);
153 |     }
154 | 
155 |     protected function isMimetypeAllowedToParse($contentType): bool
156 |     {
157 |         if (empty($contentType)) {
158 |             return true;
159 |         }
160 | 
161 |         if (! count($this->crawler->getParseableMimeTypes())) {
162 |             return true;
163 |         }
164 | 
165 |         foreach ($this->crawler->getParseableMimeTypes() as $allowedType) {
166 |             if (stristr($contentType, $allowedType)) {
167 |                 return true;
168 |             }
169 |         }
170 | 
171 |         return false;
172 |     }
173 | }
174 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="left">
  2 |     <a href="https://spatie.be/open-source?utm_source=github&utm_medium=banner&utm_campaign=crawler">
  3 |       <picture>
  4 |         <source media="(prefers-color-scheme: dark)" srcset="https://spatie.be/packages/header/crawler/html/dark.webp?">
  5 |         <img alt="Logo for crawler" src="https://spatie.be/packages/header/crawler/html/light.webp">
  6 |       </picture>
  7 |     </a>
  8 | 
  9 | <h1>🕸 Crawl the web using PHP 🕷</h1>
 10 | 
 11 | [![Latest Version on Packagist](https://img.shields.io/packagist/v/spatie/crawler.svg?style=flat-square)](https://packagist.org/packages/spatie/crawler)
 12 | [![MIT Licensed](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square)](LICENSE.md)
 13 | ![Tests](https://github.com/spatie/crawler/workflows/Tests/badge.svg)
 14 | [![Total Downloads](https://img.shields.io/packagist/dt/spatie/crawler.svg?style=flat-square)](https://packagist.org/packages/spatie/crawler)
 15 |     
 16 | </div>
 17 | 
 18 | This package provides a class to crawl links on a website. Under the hood Guzzle promises are used to [crawl multiple urls concurrently](http://docs.guzzlephp.org/en/latest/quickstart.html?highlight=pool#concurrent-requests).
 19 | 
 20 | Because the crawler can execute JavaScript, it can crawl JavaScript rendered sites. Under the hood [Chrome and Puppeteer](https://github.com/spatie/browsershot) are used to power this feature.
 21 | 
 22 | ## Support us
 23 | 
 24 | [<img src="https://github-ads.s3.eu-central-1.amazonaws.com/crawler.jpg?t=1" width="419px" />](https://spatie.be/github-ad-click/crawler)
 25 | 
 26 | We invest a lot of resources into creating [best in class open source packages](https://spatie.be/open-source). You can support us by [buying one of our paid products](https://spatie.be/open-source/support-us).
 27 | 
 28 | We highly appreciate you sending us a postcard from your hometown, mentioning which of our package(s) you are using. You'll find our address on [our contact page](https://spatie.be/about-us). We publish all received postcards on [our virtual postcard wall](https://spatie.be/open-source/postcards).
 29 | 
 30 | ## Installation
 31 | 
 32 | This package can be installed via Composer:
 33 | 
 34 | ``` bash
 35 | composer require spatie/crawler
 36 | ```
 37 | 
 38 | ## Usage
 39 | 
 40 | The crawler can be instantiated like this
 41 | 
 42 | ```php
 43 | use Spatie\Crawler\Crawler;
 44 | 
 45 | Crawler::create()
 46 |     ->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
 47 |     ->startCrawling($url);
 48 | ```
 49 | 
 50 | The argument passed to `setCrawlObserver` must be an object that extends the `\Spatie\Crawler\CrawlObservers\CrawlObserver` abstract class:
 51 | 
 52 | ```php
 53 | namespace Spatie\Crawler\CrawlObservers;
 54 | 
 55 | use GuzzleHttp\Exception\RequestException;
 56 | use Psr\Http\Message\ResponseInterface;
 57 | use Psr\Http\Message\UriInterface;
 58 | 
 59 | abstract class CrawlObserver
 60 | {
 61 |     /*
 62 |      * Called when the crawler will crawl the url.
 63 |      */
 64 |     public function willCrawl(UriInterface $url, ?string $linkText): void
 65 |     {
 66 |     }
 67 | 
 68 |     /*
 69 |      * Called when the crawler has crawled the given url successfully.
 70 |      */
 71 |     abstract public function crawled(
 72 |         UriInterface $url,
 73 |         ResponseInterface $response,
 74 |         ?UriInterface $foundOnUrl = null,
 75 |         ?string $linkText,
 76 |     ): void;
 77 | 
 78 |     /*
 79 |      * Called when the crawler had a problem crawling the given url.
 80 |      */
 81 |     abstract public function crawlFailed(
 82 |         UriInterface $url,
 83 |         RequestException $requestException,
 84 |         ?UriInterface $foundOnUrl = null,
 85 |         ?string $linkText = null,
 86 |     ): void;
 87 | 
 88 |     /**
 89 |      * Called when the crawl has ended.
 90 |      */
 91 |     public function finishedCrawling(): void
 92 |     {
 93 |     }
 94 | }
 95 | ```
 96 | 
 97 | ### Using multiple observers
 98 | 
 99 | You can set multiple observers with `setCrawlObservers`:
100 | 
101 | ```php
102 | Crawler::create()
103 |     ->setCrawlObservers([
104 |         <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
105 |         <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
106 |         ...
107 |      ])
108 |     ->startCrawling($url);
109 | ```
110 | 
111 | Alternatively you can set multiple observers one by one with `addCrawlObserver`:
112 | 
113 | ```php
114 | Crawler::create()
115 |     ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
116 |     ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
117 |     ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
118 |     ->startCrawling($url);
119 | ```
120 | 
121 | ### Executing JavaScript
122 | 
123 | By default, the crawler will not execute JavaScript. This is how you can enable the execution of JavaScript:
124 | 
125 | ```php
126 | Crawler::create()
127 |     ->executeJavaScript()
128 |     ...
129 | ```
130 | 
131 | In order to make it possible to get the body html after the javascript has been executed, this package depends on
132 | our [Browsershot](https://github.com/spatie/browsershot) package.
133 | This package uses [Puppeteer](https://github.com/puppeteer/puppeteer) under the hood. Here are some pointers on [how to install it on your system](https://spatie.be/docs/browsershot/v2/requirements).
134 | 
135 | Browsershot will make an educated guess as to where its dependencies are installed on your system.
136 | By default, the Crawler will instantiate a new Browsershot instance. You may find the need to set a custom created instance using the `setBrowsershot(Browsershot $browsershot)` method.
137 | 
138 | ```php
139 | Crawler::create()
140 |     ->setBrowsershot($browsershot)
141 |     ->executeJavaScript()
142 |     ...
143 | ```
144 | 
145 | Note that the crawler will still work even if you don't have the system dependencies required by Browsershot.
146 | These system dependencies are only required if you're calling `executeJavaScript()`.
147 | 
148 | ### Filtering certain urls
149 | 
150 | You can tell the crawler not to visit certain urls by using the `setCrawlProfile`-function. That function expects
151 | an object that extends `Spatie\Crawler\CrawlProfiles\CrawlProfile`:
152 | 
153 | ```php
154 | /*
155 |  * Determine if the given url should be crawled.
156 |  */
157 | public function shouldCrawl(UriInterface $url): bool;
158 | ```
159 | 
160 | This package comes with three `CrawlProfiles` out of the box:
161 | 
162 | - `CrawlAllUrls`: this profile will crawl all urls on all pages including urls to an external site.
163 | - `CrawlInternalUrls`: this profile will only crawl the internal urls on the pages of a host.
164 | - `CrawlSubdomains`: this profile will only crawl the internal urls and its subdomains on the pages of a host.
165 | 
166 | ### Custom link extraction
167 | 
168 | You can customize how links are extracted from a page by passing a custom `UrlParser` to the crawler.
169 | 
170 | ```php
171 | Crawler::create()
172 |     ->setUrlParserClass(<class that implements \Spatie\Crawler\UrlParsers\UrlParser>::class)
173 |     ...
174 | ```
175 | 
176 | By default, the `LinkUrlParser` is used. This parser will extract all links from the `href` attribute of `a` tags.
177 | 
178 | There is also a built-in `SitemapUrlParser` that will extract & crawl all links from a sitemap. It does support sitemap index files.
179 | 
180 | ```php
181 | Crawler::create()
182 |     ->setUrlParserClass(SitemapUrlParser::class)
183 |     ...
184 | ```
185 | 
186 | ### Ignoring robots.txt and robots meta
187 | 
188 | By default, the crawler will respect robots data. It is possible to disable these checks like so:
189 | 
190 | ```php
191 | Crawler::create()
192 |     ->ignoreRobots()
193 |     ...
194 | ```
195 | 
196 | Robots data can come from either a `robots.txt` file, meta tags or response headers.
197 | More information on the spec can be found here: [http://www.robotstxt.org/](http://www.robotstxt.org/).
198 | 
199 | Parsing robots data is done by our package [spatie/robots-txt](https://github.com/spatie/robots-txt).
200 | 
201 | ### Accept links with rel="nofollow" attribute
202 | 
203 | By default, the crawler will reject all links containing attribute rel="nofollow". It is possible to disable these checks like so:
204 | 
205 | ```php
206 | Crawler::create()
207 |     ->acceptNofollowLinks()
208 |     ...
209 | ```
210 | 
211 | ### Using a custom User Agent ###
212 | 
213 | In order to respect robots.txt rules for a custom User Agent you can specify your own custom User Agent.
214 | 
215 | ```php
216 | Crawler::create()
217 |     ->setUserAgent('my-agent')
218 | ```
219 | 
220 | You can add your specific crawl rule group for 'my-agent' in robots.txt. This example disallows crawling the entire site for crawlers identified by 'my-agent'.
221 | 
222 | ```txt
223 | // Disallow crawling for my-agent
224 | User-agent: my-agent
225 | Disallow: /
226 | ```
227 | 
228 | ## Setting the number of concurrent requests
229 | 
230 | To improve the speed of the crawl the package concurrently crawls 10 urls by default. If you want to change that number you can use the `setConcurrency` method.
231 | 
232 | ```php
233 | Crawler::create()
234 |     ->setConcurrency(1) // now all urls will be crawled one by one
235 | ```
236 | 
237 | ## Defining Crawl and Time Limits
238 | 
239 | By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues if you are working in an environment with limitations such as a serverless environment.
240 | 
241 | The crawl behavior can be controlled with the following two options:
242 | 
243 |  - **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximal count of URLs to crawl.
244 |  - **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl.
245 |  - **Total Execution Time Limit** (`setTotalExecutionTimeLimit`): This limit defines the maximal execution time of the crawl.
246 |  - **Current Execution Time Limit** (`setCurrentExecutionTimeLimit`): This limits the execution time of the current crawl.
247 | 
248 | Let's take a look at some examples to clarify the difference between `setTotalCrawlLimit` and `setCurrentCrawlLimit`.
249 | The difference between `setTotalExecutionTimeLimit` and `setCurrentExecutionTimeLimit` will be the same.
250 | 
251 | ### Example 1: Using the total crawl limit
252 | 
253 | The `setTotalCrawlLimit` method allows you to limit the total number of URLs to crawl, no matter how often you call the crawler.
254 | 
255 | ```php
256 | $queue = <your selection/implementation of a queue>;
257 | 
258 | // Crawls 5 URLs and ends.
259 | Crawler::create()
260 |     ->setCrawlQueue($queue)
261 |     ->setTotalCrawlLimit(5)
262 |     ->startCrawling($url);
263 | 
264 | // Doesn't crawl further as the total limit is reached.
265 | Crawler::create()
266 |     ->setCrawlQueue($queue)
267 |     ->setTotalCrawlLimit(5)
268 |     ->startCrawling($url);
269 | ```
270 | 
271 | ### Example 2: Using the current crawl limit
272 | 
273 | The `setCurrentCrawlLimit` will set a limit on how many URls will be crawled per execution. This piece of code will process 5 pages with each execution, without a total limit of pages to crawl.
274 | 
275 | ```php
276 | $queue = <your selection/implementation of a queue>;
277 | 
278 | // Crawls 5 URLs and ends.
279 | Crawler::create()
280 |     ->setCrawlQueue($queue)
281 |     ->setCurrentCrawlLimit(5)
282 |     ->startCrawling($url);
283 | 
284 | // Crawls the next 5 URLs and ends.
285 | Crawler::create()
286 |     ->setCrawlQueue($queue)
287 |     ->setCurrentCrawlLimit(5)
288 |     ->startCrawling($url);
289 | ```
290 | 
291 | ### Example 3: Combining the total and crawl limit
292 | 
293 | Both limits can be combined to control the crawler:
294 | 
295 | ```php
296 | $queue = <your selection/implementation of a queue>;
297 | 
298 | // Crawls 5 URLs and ends.
299 | Crawler::create()
300 |     ->setCrawlQueue($queue)
301 |     ->setTotalCrawlLimit(10)
302 |     ->setCurrentCrawlLimit(5)
303 |     ->startCrawling($url);
304 | 
305 | // Crawls the next 5 URLs and ends.
306 | Crawler::create()
307 |     ->setCrawlQueue($queue)
308 |     ->setTotalCrawlLimit(10)
309 |     ->setCurrentCrawlLimit(5)
310 |     ->startCrawling($url);
311 | 
312 | // Doesn't crawl further as the total limit is reached.
313 | Crawler::create()
314 |     ->setCrawlQueue($queue)
315 |     ->setTotalCrawlLimit(10)
316 |     ->setCurrentCrawlLimit(5)
317 |     ->startCrawling($url);
318 | ```
319 | 
320 | ### Example 4: Crawling across requests
321 | 
322 | You can use the `setCurrentCrawlLimit` to break up long running crawls. The following example demonstrates a (simplified) approach. It's made up of an initial request and any number of follow-up requests continuing the crawl.
323 | 
324 | #### Initial Request
325 | 
326 | To start crawling across different requests, you will need to create a new queue of your selected queue-driver. Start by passing the queue-instance to the crawler. The crawler will start filling the queue as pages are processed and new URLs are discovered. Serialize and store the queue reference after the crawler has finished (using the current crawl limit).
327 | 
328 | ```php
329 | // Create a queue using your queue-driver.
330 | $queue = <your selection/implementation of a queue>;
331 | 
332 | // Crawl the first set of URLs
333 | Crawler::create()
334 |     ->setCrawlQueue($queue)
335 |     ->setCurrentCrawlLimit(10)
336 |     ->startCrawling($url);
337 | 
338 | // Serialize and store your queue
339 | $serializedQueue = serialize($queue);
340 | ```
341 | 
342 | #### Subsequent Requests
343 | 
344 | For any following requests you will need to unserialize your original queue and pass it to the crawler:
345 | 
346 | ```php
347 | // Unserialize queue
348 | $queue = unserialize($serializedQueue);
349 | 
350 | // Crawls the next set of URLs
351 | Crawler::create()
352 |     ->setCrawlQueue($queue)
353 |     ->setCurrentCrawlLimit(10)
354 |     ->startCrawling($url);
355 | 
356 | // Serialize and store your queue
357 | $serialized_queue = serialize($queue);
358 | ```
359 | 
360 | The behavior is based on the information in the queue. Only if the same queue-instance is passed in the behavior works as described. When a completely new queue is passed in, the limits of previous crawls -even for the same website- won't apply.
361 | 
362 | An example with more details can be found [here](https://github.com/spekulatius/spatie-crawler-cached-queue-example).
363 | 
364 | ## Setting the maximum crawl depth
365 | 
366 | By default, the crawler continues until it has crawled every page of the supplied URL. If you want to limit the depth of the crawler you can use the `setMaximumDepth` method.
367 | 
368 | ```php
369 | Crawler::create()
370 |     ->setMaximumDepth(2)
371 | ```
372 | 
373 | ## Setting the maximum response size
374 | 
375 | Most html pages are quite small. But the crawler could accidentally pick up on large files such as PDFs and MP3s. To keep memory usage low in such cases the crawler will only use the responses that are smaller than 2 MB. If, when streaming a response, it becomes larger than 2 MB, the crawler will stop streaming the response. An empty response body will be assumed.
376 | 
377 | You can change the maximum response size.
378 | 
379 | ```php
380 | // let's use a 3 MB maximum.
381 | Crawler::create()
382 |     ->setMaximumResponseSize(1024 * 1024 * 3)
383 | ```
384 | 
385 | ## Add a delay between requests
386 | 
387 | In some cases you might get rate-limited when crawling too aggressively. To circumvent this, you can use the `setDelayBetweenRequests()` method to add a pause between every request. This value is expressed in milliseconds.
388 | 
389 | ```php
390 | Crawler::create()
391 |     ->setDelayBetweenRequests(150) // After every page crawled, the crawler will wait for 150ms
392 | ```
393 | 
394 | ## Limiting which content-types to parse
395 | 
396 | By default, every found page will be downloaded (up to `setMaximumResponseSize()` in size) and parsed for additional links. You can limit which content-types should be downloaded and parsed by setting the `setParseableMimeTypes()` with an array of allowed types.
397 | 
398 | ```php
399 | Crawler::create()
400 |     ->setParseableMimeTypes(['text/html', 'text/plain'])
401 | ```
402 | 
403 | This will prevent downloading the body of pages that have different mime types, like binary files, audio/video, ... that are unlikely to have links embedded in them. This feature mostly saves bandwidth.
404 | 
405 | ## Using a custom crawl queue
406 | 
407 | When crawling a site the crawler will put urls to be crawled in a queue. By default, this queue is stored in memory using the built-in `ArrayCrawlQueue`.
408 | 
409 | When a site is very large you may want to store that queue elsewhere, maybe a database. In such cases, you can write your own crawl queue.
410 | 
411 | A valid crawl queue is any class that implements the `Spatie\Crawler\CrawlQueues\CrawlQueue`-interface. You can pass your custom crawl queue via the `setCrawlQueue` method on the crawler.
412 | 
413 | ```php
414 | Crawler::create()
415 |     ->setCrawlQueue(<implementation of \Spatie\Crawler\CrawlQueues\CrawlQueue>)
416 | ```
417 | 
418 | Here
419 | 
420 | - [ArrayCrawlQueue](https://github.com/spatie/crawler/blob/master/src/CrawlQueues/ArrayCrawlQueue.php)
421 | - [RedisCrawlQueue (third-party package)](https://github.com/repat/spatie-crawler-redis)
422 | - [CacheCrawlQueue for Laravel (third-party package)](https://github.com/spekulatius/spatie-crawler-toolkit-for-laravel)
423 | - [Laravel Model as Queue (third-party example app)](https://github.com/insign/spatie-crawler-queue-with-laravel-model)
424 | 
425 | ## Change the default base url scheme
426 | 
427 | By default, the crawler will set the base url scheme to `http` if none. You have the ability to change that with `setDefaultScheme`.
428 | 
429 | ```php
430 | Crawler::create()
431 |     ->setDefaultScheme('https')
432 | ```
433 | 
434 | ## Changelog
435 | 
436 | Please see [CHANGELOG](CHANGELOG.md) for more information what has changed recently.
437 | 
438 | ## Contributing
439 | 
440 | Please see [CONTRIBUTING](https://github.com/spatie/.github/blob/main/CONTRIBUTING.md) for details.
441 | 
442 | ## Testing
443 | 
444 | First, install the Puppeteer dependency, or your tests will fail.
445 | 
446 | ```
447 | npm install puppeteer
448 | ```
449 | 
450 | To run the tests you'll have to start the included node based server first in a separate terminal window.
451 | 
452 | ```bash
453 | cd tests/server
454 | npm install
455 | node server.js
456 | ```
457 | 
458 | With the server running, you can start testing.
459 | ```bash
460 | composer test
461 | ```
462 | 
463 | ## Security
464 | 
465 | If you've found a bug regarding security please mail [security@spatie.be](mailto:security@spatie.be) instead of using the issue tracker.
466 | 
467 | ## Postcardware
468 | 
469 | You're free to use this package, but if it makes it to your production environment we highly appreciate you sending us a postcard from your hometown, mentioning which of our package(s) you are using.
470 | 
471 | Our address is: Spatie, Kruikstraat 22, 2018 Antwerp, Belgium.
472 | 
473 | We publish all received postcards [on our company website](https://spatie.be/en/opensource/postcards).
474 | 
475 | ## Credits
476 | 
477 | - [Freek Van der Herten](https://github.com/freekmurze)
478 | - [All Contributors](../../contributors)
479 | 
480 | ## License
481 | 
482 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information.
483 | 


--------------------------------------------------------------------------------
/src/Crawler.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spatie\Crawler;
  4 | 
  5 | use Generator;
  6 | use GuzzleHttp\Client;
  7 | use GuzzleHttp\Pool;
  8 | use GuzzleHttp\Psr7\Request;
  9 | use GuzzleHttp\Psr7\Uri;
 10 | use GuzzleHttp\RequestOptions;
 11 | use Psr\Http\Message\UriInterface;
 12 | use Spatie\Browsershot\Browsershot;
 13 | use Spatie\Crawler\CrawlObservers\CrawlObserver;
 14 | use Spatie\Crawler\CrawlObservers\CrawlObserverCollection;
 15 | use Spatie\Crawler\CrawlProfiles\CrawlAllUrls;
 16 | use Spatie\Crawler\CrawlProfiles\CrawlProfile;
 17 | use Spatie\Crawler\CrawlQueues\ArrayCrawlQueue;
 18 | use Spatie\Crawler\CrawlQueues\CrawlQueue;
 19 | use Spatie\Crawler\Exceptions\InvalidCrawlRequestHandler;
 20 | use Spatie\Crawler\Handlers\CrawlRequestFailed;
 21 | use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
 22 | use Spatie\Crawler\UrlParsers\LinkUrlParser;
 23 | use Spatie\Robots\RobotsTxt;
 24 | use Tree\Node\Node;
 25 | 
 26 | class Crawler
 27 | {
 28 |     public const DEFAULT_USER_AGENT = '*';
 29 | 
 30 |     protected UriInterface $baseUrl;
 31 | 
 32 |     protected CrawlObserverCollection $crawlObservers;
 33 | 
 34 |     protected CrawlProfile $crawlProfile;
 35 | 
 36 |     protected CrawlQueue $crawlQueue;
 37 | 
 38 |     protected int $totalUrlCount = 0;
 39 | 
 40 |     protected int $currentUrlCount = 0;
 41 | 
 42 |     protected ?int $totalCrawlLimit = null;
 43 | 
 44 |     protected ?int $currentCrawlLimit = null;
 45 | 
 46 |     protected ?int $startedAt = null;
 47 | 
 48 |     protected int $executionTime = 0;
 49 | 
 50 |     protected ?int $totalExecutionTimeLimit = null;
 51 | 
 52 |     protected ?int $currentExecutionTimeLimit = null;
 53 | 
 54 |     protected int $maximumResponseSize = 1024 * 1024 * 2;
 55 | 
 56 |     protected ?int $maximumDepth = null;
 57 | 
 58 |     protected bool $respectRobots = true;
 59 | 
 60 |     protected bool $rejectNofollowLinks = true;
 61 | 
 62 |     protected Node $depthTree;
 63 | 
 64 |     protected bool $executeJavaScript = false;
 65 | 
 66 |     protected ?Browsershot $browsershot = null;
 67 | 
 68 |     protected ?RobotsTxt $robotsTxt = null;
 69 | 
 70 |     protected string $crawlRequestFulfilledClass;
 71 | 
 72 |     protected string $crawlRequestFailedClass;
 73 | 
 74 |     protected string $urlParserClass;
 75 | 
 76 |     protected int $delayBetweenRequests = 0;
 77 | 
 78 |     protected array $allowedMimeTypes = [];
 79 | 
 80 |     protected string $defaultScheme = 'http';
 81 | 
 82 |     protected static array $defaultClientOptions = [
 83 |         RequestOptions::COOKIES => true,
 84 |         RequestOptions::CONNECT_TIMEOUT => 10,
 85 |         RequestOptions::TIMEOUT => 10,
 86 |         RequestOptions::ALLOW_REDIRECTS => false,
 87 |         RequestOptions::HEADERS => [
 88 |             'User-Agent' => self::DEFAULT_USER_AGENT,
 89 |         ],
 90 |     ];
 91 | 
 92 |     public static function create(array $clientOptions = []): static
 93 |     {
 94 |         $clientOptions = (count($clientOptions))
 95 |             ? $clientOptions
 96 |             : static::$defaultClientOptions;
 97 | 
 98 |         $client = new Client($clientOptions);
 99 | 
100 |         return new static($client);
101 |     }
102 | 
103 |     public function __construct(
104 |         protected Client $client,
105 |         protected int $concurrency = 10,
106 |     ) {
107 |         $this->crawlProfile = new CrawlAllUrls;
108 | 
109 |         $this->crawlQueue = new ArrayCrawlQueue;
110 | 
111 |         $this->crawlObservers = new CrawlObserverCollection;
112 | 
113 |         $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;
114 | 
115 |         $this->crawlRequestFailedClass = CrawlRequestFailed::class;
116 | 
117 |         $this->urlParserClass = LinkUrlParser::class;
118 |     }
119 | 
120 |     public function getDefaultScheme(): string
121 |     {
122 |         return $this->defaultScheme;
123 |     }
124 | 
125 |     public function setDefaultScheme(string $defaultScheme): self
126 |     {
127 |         $this->defaultScheme = $defaultScheme;
128 | 
129 |         return $this;
130 |     }
131 | 
132 |     public function setConcurrency(int $concurrency): self
133 |     {
134 |         $this->concurrency = $concurrency;
135 | 
136 |         return $this;
137 |     }
138 | 
139 |     public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self
140 |     {
141 |         $this->maximumResponseSize = $maximumResponseSizeInBytes;
142 | 
143 |         return $this;
144 |     }
145 | 
146 |     public function getMaximumResponseSize(): ?int
147 |     {
148 |         return $this->maximumResponseSize;
149 |     }
150 | 
151 |     public function setTotalCrawlLimit(int $totalCrawlLimit): self
152 |     {
153 |         $this->totalCrawlLimit = $totalCrawlLimit;
154 | 
155 |         return $this;
156 |     }
157 | 
158 |     public function getTotalCrawlLimit(): ?int
159 |     {
160 |         return $this->totalCrawlLimit;
161 |     }
162 | 
163 |     public function getTotalCrawlCount(): int
164 |     {
165 |         return $this->totalUrlCount;
166 |     }
167 | 
168 |     public function setCurrentCrawlLimit(int $currentCrawlLimit): self
169 |     {
170 |         $this->currentCrawlLimit = $currentCrawlLimit;
171 | 
172 |         return $this;
173 |     }
174 | 
175 |     public function getCurrentCrawlLimit(): ?int
176 |     {
177 |         return $this->currentCrawlLimit;
178 |     }
179 | 
180 |     public function getCurrentCrawlCount(): int
181 |     {
182 |         return $this->currentUrlCount;
183 |     }
184 | 
185 |     public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
186 |     {
187 |         $this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;
188 | 
189 |         return $this;
190 |     }
191 | 
192 |     public function getTotalExecutionTimeLimit(): ?int
193 |     {
194 |         return $this->totalExecutionTimeLimit;
195 |     }
196 | 
197 |     public function getTotalExecutionTime(): int
198 |     {
199 |         return $this->executionTime + $this->getCurrentExecutionTime();
200 |     }
201 | 
202 |     public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
203 |     {
204 |         $this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;
205 | 
206 |         return $this;
207 |     }
208 | 
209 |     public function getCurrentExecutionTimeLimit(): ?int
210 |     {
211 |         return $this->currentExecutionTimeLimit;
212 |     }
213 | 
214 |     public function getCurrentExecutionTime(): int
215 |     {
216 |         if (is_null($this->startedAt)) {
217 |             return 0;
218 |         }
219 | 
220 |         return time() - $this->startedAt;
221 |     }
222 | 
223 |     public function setMaximumDepth(int $maximumDepth): self
224 |     {
225 |         $this->maximumDepth = $maximumDepth;
226 | 
227 |         return $this;
228 |     }
229 | 
230 |     public function getMaximumDepth(): ?int
231 |     {
232 |         return $this->maximumDepth;
233 |     }
234 | 
235 |     public function setDelayBetweenRequests(int $delayInMilliseconds): self
236 |     {
237 |         $this->delayBetweenRequests = ($delayInMilliseconds * 1000);
238 | 
239 |         return $this;
240 |     }
241 | 
242 |     public function getDelayBetweenRequests(): int
243 |     {
244 |         return $this->delayBetweenRequests;
245 |     }
246 | 
247 |     public function setParseableMimeTypes(array $types): self
248 |     {
249 |         $this->allowedMimeTypes = $types;
250 | 
251 |         return $this;
252 |     }
253 | 
254 |     public function getParseableMimeTypes(): array
255 |     {
256 |         return $this->allowedMimeTypes;
257 |     }
258 | 
259 |     public function ignoreRobots(): self
260 |     {
261 |         $this->respectRobots = false;
262 | 
263 |         return $this;
264 |     }
265 | 
266 |     public function respectRobots(): self
267 |     {
268 |         $this->respectRobots = true;
269 | 
270 |         return $this;
271 |     }
272 | 
273 |     public function mustRespectRobots(): bool
274 |     {
275 |         return $this->respectRobots;
276 |     }
277 | 
278 |     public function acceptNofollowLinks(): self
279 |     {
280 |         $this->rejectNofollowLinks = false;
281 | 
282 |         return $this;
283 |     }
284 | 
285 |     public function rejectNofollowLinks(): self
286 |     {
287 |         $this->rejectNofollowLinks = true;
288 | 
289 |         return $this;
290 |     }
291 | 
292 |     public function mustRejectNofollowLinks(): bool
293 |     {
294 |         return $this->rejectNofollowLinks;
295 |     }
296 | 
297 |     public function getRobotsTxt(): ?RobotsTxt
298 |     {
299 |         return $this->robotsTxt;
300 |     }
301 | 
302 |     public function setCrawlQueue(CrawlQueue $crawlQueue): self
303 |     {
304 |         $this->crawlQueue = $crawlQueue;
305 | 
306 |         return $this;
307 |     }
308 | 
309 |     public function getCrawlQueue(): CrawlQueue
310 |     {
311 |         return $this->crawlQueue;
312 |     }
313 | 
314 |     public function executeJavaScript(): self
315 |     {
316 |         $this->executeJavaScript = true;
317 | 
318 |         return $this;
319 |     }
320 | 
321 |     public function doNotExecuteJavaScript(): self
322 |     {
323 |         $this->executeJavaScript = false;
324 | 
325 |         return $this;
326 |     }
327 | 
328 |     public function mayExecuteJavascript(): bool
329 |     {
330 |         return $this->executeJavaScript;
331 |     }
332 | 
333 |     public function setCrawlObserver(CrawlObserver|array $crawlObservers): self
334 |     {
335 |         if (! is_array($crawlObservers)) {
336 |             $crawlObservers = [$crawlObservers];
337 |         }
338 | 
339 |         return $this->setCrawlObservers($crawlObservers);
340 |     }
341 | 
342 |     public function setCrawlObservers(array $crawlObservers): self
343 |     {
344 |         $this->crawlObservers = new CrawlObserverCollection($crawlObservers);
345 | 
346 |         return $this;
347 |     }
348 | 
349 |     public function addCrawlObserver(CrawlObserver $crawlObserver): self
350 |     {
351 |         $this->crawlObservers->addObserver($crawlObserver);
352 | 
353 |         return $this;
354 |     }
355 | 
356 |     public function getCrawlObservers(): CrawlObserverCollection
357 |     {
358 |         return $this->crawlObservers;
359 |     }
360 | 
361 |     public function setCrawlProfile(CrawlProfile $crawlProfile): self
362 |     {
363 |         $this->crawlProfile = $crawlProfile;
364 | 
365 |         return $this;
366 |     }
367 | 
368 |     public function getCrawlProfile(): CrawlProfile
369 |     {
370 |         return $this->crawlProfile;
371 |     }
372 | 
373 |     public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): self
374 |     {
375 |         $baseClass = CrawlRequestFulfilled::class;
376 | 
377 |         if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
378 |             throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
379 |         }
380 | 
381 |         $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;
382 | 
383 |         return $this;
384 |     }
385 | 
386 |     public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): self
387 |     {
388 |         $baseClass = CrawlRequestFailed::class;
389 | 
390 |         if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
391 |             throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
392 |         }
393 | 
394 |         $this->crawlRequestFailedClass = $crawlRequestFailedClass;
395 | 
396 |         return $this;
397 |     }
398 | 
399 |     public function setUrlParserClass(string $urlParserClass): self
400 |     {
401 |         $this->urlParserClass = $urlParserClass;
402 | 
403 |         return $this;
404 |     }
405 | 
406 |     public function getUrlParserClass(): string
407 |     {
408 |         return $this->urlParserClass;
409 |     }
410 | 
411 |     public function setBrowsershot(Browsershot $browsershot)
412 |     {
413 |         $this->browsershot = $browsershot;
414 | 
415 |         return $this;
416 |     }
417 | 
418 |     public function setUserAgent(string $userAgent): self
419 |     {
420 |         $clientOptions = $this->client->getConfig();
421 | 
422 |         $headers = array_change_key_case($clientOptions['headers']);
423 |         $headers['user-agent'] = $userAgent;
424 | 
425 |         $clientOptions['headers'] = $headers;
426 | 
427 |         $this->client = new Client($clientOptions);
428 | 
429 |         return $this;
430 |     }
431 | 
432 |     public function getUserAgent(): string
433 |     {
434 |         $headers = $this->client->getConfig('headers');
435 | 
436 |         foreach (array_keys($headers) as $name) {
437 |             if (strtolower($name) === 'user-agent') {
438 |                 return (string) $headers[$name];
439 |             }
440 |         }
441 | 
442 |         return static::DEFAULT_USER_AGENT;
443 |     }
444 | 
445 |     public function getBrowsershot(): Browsershot
446 |     {
447 |         if (! $this->browsershot) {
448 |             $this->browsershot = new Browsershot;
449 |         }
450 | 
451 |         return $this->browsershot;
452 |     }
453 | 
454 |     public function getBaseUrl(): UriInterface
455 |     {
456 |         return $this->baseUrl;
457 |     }
458 | 
459 |     public function startCrawling(UriInterface|string $baseUrl)
460 |     {
461 |         $this->startedAt = time();
462 | 
463 |         if (! $baseUrl instanceof UriInterface) {
464 |             $baseUrl = new Uri($baseUrl);
465 |         }
466 | 
467 |         if ($baseUrl->getScheme() === '') {
468 |             $baseUrl = $baseUrl->withScheme($this->defaultScheme);
469 |         }
470 | 
471 |         if ($baseUrl->getPath() === '') {
472 |             $baseUrl = $baseUrl->withPath('/');
473 |         }
474 | 
475 |         $this->totalUrlCount = $this->crawlQueue->getProcessedUrlCount();
476 | 
477 |         $this->baseUrl = $baseUrl;
478 | 
479 |         $crawlUrl = CrawlUrl::create($this->baseUrl);
480 | 
481 |         if ($this->respectRobots) {
482 |             $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);
483 |         }
484 | 
485 |         if ($this->shouldAddToCrawlQueue($crawlUrl)) {
486 |             $this->addToCrawlQueue($crawlUrl);
487 |         }
488 | 
489 |         $this->depthTree = new Node((string) $this->baseUrl);
490 | 
491 |         $this->startCrawlingQueue();
492 | 
493 |         foreach ($this->crawlObservers as $crawlObserver) {
494 |             $crawlObserver->finishedCrawling();
495 |         }
496 | 
497 |         $this->executionTime += time() - $this->startedAt;
498 |         $this->startedAt = null; // To reset currentExecutionTime
499 |     }
500 | 
501 |     public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
502 |     {
503 |         if (is_null($this->maximumDepth)) {
504 |             return new Node((string) $url);
505 |         }
506 | 
507 |         $node = $node ?? $this->depthTree;
508 | 
509 |         $returnNode = null;
510 | 
511 |         if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
512 |             $newNode = new Node((string) $url);
513 | 
514 |             $node->addChild($newNode);
515 | 
516 |             return $newNode;
517 |         }
518 | 
519 |         foreach ($node->getChildren() as $currentNode) {
520 |             $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);
521 | 
522 |             if (! is_null($returnNode)) {
523 |                 break;
524 |             }
525 |         }
526 | 
527 |         return $returnNode;
528 |     }
529 | 
530 |     protected function shouldAddToCrawlQueue($crawlUrl): bool
531 |     {
532 |         if (! $this->respectRobots) {
533 |             return true;
534 |         }
535 | 
536 |         if ($this->robotsTxt === null) {
537 |             return false;
538 |         }
539 | 
540 |         if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent())) {
541 |             return true;
542 |         }
543 | 
544 |         return false;
545 |     }
546 | 
547 |     protected function startCrawlingQueue(): void
548 |     {
549 |         while (
550 |             $this->reachedCrawlLimits() === false &&
551 |             $this->reachedTimeLimits() === false &&
552 |             $this->crawlQueue->hasPendingUrls()
553 |         ) {
554 |             $pool = new Pool($this->client, $this->getCrawlRequests(), [
555 |                 'concurrency' => $this->concurrency,
556 |                 'options' => $this->client->getConfig(),
557 |                 'fulfilled' => new $this->crawlRequestFulfilledClass($this),
558 |                 'rejected' => new $this->crawlRequestFailedClass($this),
559 |             ]);
560 | 
561 |             $promise = $pool->promise();
562 | 
563 |             $promise->wait();
564 |         }
565 |     }
566 | 
567 |     protected function createRobotsTxt(UriInterface $uri): RobotsTxt
568 |     {
569 |         try {
570 |             $robotsUrl = (string) $uri->withPath('/robots.txt');
571 |             $response = $this->client->get($robotsUrl);
572 |             $content = (string) $response->getBody();
573 | 
574 |             return new RobotsTxt($content);
575 |         } catch (\Exception $exception) {
576 |             return new RobotsTxt('');
577 |         }
578 |     }
579 | 
580 |     protected function getCrawlRequests(): Generator
581 |     {
582 |         while (
583 |             $this->reachedCrawlLimits() === false &&
584 |             $this->reachedTimeLimits() === false &&
585 |             $crawlUrl = $this->crawlQueue->getPendingUrl()
586 |         ) {
587 |             if (
588 |                 $this->crawlProfile->shouldCrawl($crawlUrl->url) === false ||
589 |                 $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
590 |             ) {
591 |                 $this->crawlQueue->markAsProcessed($crawlUrl);
592 | 
593 |                 continue;
594 |             }
595 | 
596 |             foreach ($this->crawlObservers as $crawlObserver) {
597 |                 $crawlObserver->willCrawl($crawlUrl->url, $crawlUrl->linkText);
598 |             }
599 | 
600 |             $this->totalUrlCount++;
601 |             $this->currentUrlCount++;
602 |             $this->crawlQueue->markAsProcessed($crawlUrl);
603 | 
604 |             yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
605 |         }
606 |     }
607 | 
608 |     public function addToCrawlQueue(CrawlUrl $crawlUrl): self
609 |     {
610 |         if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
611 |             return $this;
612 |         }
613 | 
614 |         if ($this->getCrawlQueue()->has($crawlUrl->url)) {
615 |             return $this;
616 |         }
617 | 
618 |         $this->crawlQueue->add($crawlUrl);
619 | 
620 |         return $this;
621 |     }
622 | 
623 |     public function reachedCrawlLimits(): bool
624 |     {
625 |         $totalCrawlLimit = $this->getTotalCrawlLimit();
626 |         if (! is_null($totalCrawlLimit) && $this->getTotalCrawlCount() >= $totalCrawlLimit) {
627 |             return true;
628 |         }
629 | 
630 |         $currentCrawlLimit = $this->getCurrentCrawlLimit();
631 |         if (! is_null($currentCrawlLimit) && $this->getCurrentCrawlCount() >= $currentCrawlLimit) {
632 |             return true;
633 |         }
634 | 
635 |         return false;
636 |     }
637 | 
638 |     public function reachedTimeLimits(): bool
639 |     {
640 |         $totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
641 |         if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
642 |             return true;
643 |         }
644 | 
645 |         $currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
646 |         if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
647 |             return true;
648 |         }
649 | 
650 |         return false;
651 |     }
652 | }
653 | 


--------------------------------------------------------------------------------