├── .editorconfig ├── .github └── workflows │ ├── php-cs-fixer.yml │ ├── run-tests.yml │ └── update-changelog.yml ├── .php-cs-fixer.dist.php ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── composer.json ├── http-status-check ├── package-lock.json └── src ├── ConsoleApplication.php ├── CrawlLogger.php └── ScanCommand.php /.editorconfig: -------------------------------------------------------------------------------- 1 | ; This file is for unifying the coding style for different editors and IDEs. 2 | ; More information at http://editorconfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | charset = utf-8 8 | indent_size = 4 9 | indent_style = space 10 | end_of_line = lf 11 | insert_final_newline = true 12 | trim_trailing_whitespace = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | -------------------------------------------------------------------------------- /.github/workflows/php-cs-fixer.yml: -------------------------------------------------------------------------------- 1 | name: Check & fix styling 2 | 3 | on: [push] 4 | 5 | jobs: 6 | php-cs-fixer: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v2 12 | with: 13 | ref: ${{ github.head_ref }} 14 | 15 | - name: Run PHP CS Fixer 16 | uses: docker://oskarstark/php-cs-fixer-ga 17 | with: 18 | args: --config=.php-cs-fixer.dist.php --allow-risky=yes 19 | 20 | - name: Commit changes 21 | uses: stefanzweifel/git-auto-commit-action@v4 22 | with: 23 | commit_message: Fix styling 24 | -------------------------------------------------------------------------------- /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | os: [ubuntu-latest] 12 | php: [8.2, 8.1] 13 | stability: [prefer-lowest, prefer-stable] 14 | 15 | name: P${{ matrix.php }} - ${{ 
matrix.stability }} - ${{ matrix.os }} 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | 21 | - name: Setup PHP 22 | uses: shivammathur/setup-php@v2 23 | with: 24 | php-version: ${{ matrix.php }} 25 | extensions: dom, curl, libxml, mbstring, zip, pcntl, pdo, sqlite, pdo_sqlite, bcmath, soap, intl, gd, exif, iconv, imagick 26 | coverage: none 27 | 28 | - name: Setup problem matchers 29 | run: | 30 | echo "::add-matcher::${{ runner.tool_cache }}/php.json" 31 | echo "::add-matcher::${{ runner.tool_cache }}/phpunit.json" 32 | - name: Install dependencies 33 | run: composer update --${{ matrix.stability }} --prefer-dist --no-interaction 34 | 35 | - name: Start server 36 | run: ./tests/server/start_server.sh 37 | 38 | - name: Execute tests 39 | run: vendor/bin/phpunit 40 | -------------------------------------------------------------------------------- /.github/workflows/update-changelog.yml: -------------------------------------------------------------------------------- 1 | name: "Update Changelog" 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | jobs: 8 | update: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v2 14 | with: 15 | ref: main 16 | 17 | - name: Update Changelog 18 | uses: stefanzweifel/changelog-updater-action@v1 19 | with: 20 | latest-version: ${{ github.event.release.name }} 21 | release-notes: ${{ github.event.release.body }} 22 | 23 | - name: Commit updated CHANGELOG 24 | uses: stefanzweifel/git-auto-commit-action@v4 25 | with: 26 | branch: main 27 | commit_message: Update CHANGELOG 28 | file_pattern: CHANGELOG.md 29 | -------------------------------------------------------------------------------- /.php-cs-fixer.dist.php: -------------------------------------------------------------------------------- 1 | in([ 5 | __DIR__ . '/src', 6 | __DIR__ . 
'/tests', 7 | ]) 8 | ->name('*.php') 9 | ->ignoreDotFiles(true) 10 | ->ignoreVCS(true); 11 | 12 | return (new PhpCsFixer\Config()) 13 | ->setRules([ 14 | '@PSR12' => true, 15 | 'array_syntax' => ['syntax' => 'short'], 16 | 'ordered_imports' => ['sort_algorithm' => 'alpha'], 17 | 'no_unused_imports' => true, 18 | 'not_operator_with_successor_space' => true, 19 | 'trailing_comma_in_multiline' => true, 20 | 'phpdoc_scalar' => true, 21 | 'unary_operator_spaces' => true, 22 | 'binary_operator_spaces' => true, 23 | 'blank_line_before_statement' => [ 24 | 'statements' => ['break', 'continue', 'declare', 'return', 'throw', 'try'], 25 | ], 26 | 'phpdoc_single_line_var_spacing' => true, 27 | 'phpdoc_var_without_name' => true, 28 | 'class_attributes_separation' => [ 29 | 'elements' => [ 30 | 'method' => 'one', 31 | ], 32 | ], 33 | 'method_argument_space' => [ 34 | 'on_multiline' => 'ensure_fully_multiline', 35 | 'keep_multiple_spaces_after_comma' => true, 36 | ], 37 | 'single_trait_insert_per_statement' => true, 38 | ]) 39 | ->setFinder($finder); 40 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to `spatie/http-status-check` will be documented in this file. 
4 | 5 | ## 4.0.0 - 2023-06-14 6 | 7 | ### What's Changed 8 | 9 | - Fix broken screenshot link on readme file by @yongkyali in https://github.com/spatie/http-status-check/pull/88 10 | - Update spatie/http-status-check by @nlemoine in https://github.com/spatie/http-status-check/pull/91 11 | 12 | ### New Contributors 13 | 14 | - @yongkyali made their first contribution in https://github.com/spatie/http-status-check/pull/88 15 | 16 | **Full Changelog**: https://github.com/spatie/http-status-check/compare/3.4.0...4.0.0 17 | 18 | ## 4.0 - 2023-06-14 19 | 20 | - Remove support for PHP 8.0 21 | - Update spatie/crawler 22 | - Bump dependencies 23 | - Fix deprecation notices related to spatie/crawler 24 | 25 | ## 3.3.0 - 2020-12-01 26 | 27 | - Add support for PHP 8 28 | 29 | ## 3.2.0 - 2020-03-18 30 | 31 | - follow and log redirects as multiple responses (#66) 32 | 33 | ## 3.1.4 - 2020-02-18 34 | 35 | - fix for overwriting a file 36 | 37 | ## 3.1.3 - 2020-02-16 38 | 39 | - use response status code if available (#59) 40 | 41 | ## 3.1.2 - 2019-11-19 42 | 43 | - allow symfony 5 components 44 | 45 | ## 3.1.1 - 2018-05-22 46 | 47 | - Add an extra null check when a request fails to determine the message. 48 | 49 | ## 3.1.0 - 2018-05-09 50 | 51 | - Update crawler to `^4.1.0`. 52 | - Add `--ignore-robots` option. 
53 | 54 | ## 3.0.0 - 2017-12-24 55 | 56 | - PHP 7.1 required 57 | - update `spatie/crawler` to `^3.0` 58 | 59 | ## 2.5.0 - 2017-12-22 60 | 61 | - added support for Symfony 4 62 | 63 | ## 2.4.0 - 2017-10-16 64 | 65 | - added some command line arguments 66 | 67 | ## 2.3.0 - 2017-02-01 68 | 69 | - add `timeout` option 70 | 71 | ## 2.2.1 - 2017-02-17 72 | 73 | - fix add `dont-crawl-external-urls` option 74 | 75 | ## 2.2.0 76 | 77 | - add `dont-crawl-external-urls` option 78 | 79 | ## 2.1.1 80 | 81 | - append urls to log file instead of overwriting entire file 82 | 83 | ## 2.1.0 84 | 85 | - added an option to write an output log file 86 | 87 | ## 2.0.0 88 | 89 | - improve speed by crawling links concurrently 90 | - show on which url a broken link was found 91 | 92 | ## 1.0.2 93 | 94 | - add support for Symfony 3 95 | 96 | ## 1.0.1 97 | 98 | - Lower requirements to php 5.5 99 | 100 | ## 1.0.0 101 | 102 | - First release 103 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Spatie bvba 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Check the HTTP status code of all links on a website 2 | 3 | [![Latest Version on Packagist](https://img.shields.io/packagist/v/spatie/http-status-check.svg?style=flat-square)](https://packagist.org/packages/spatie/http-status-check) 4 | [![Software License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square)](LICENSE.md) 5 | ![Tests](https://github.com/spatie/http-status-check/workflows/Tests/badge.svg) 6 | [![Total Downloads](https://img.shields.io/packagist/dt/spatie/http-status-check.svg?style=flat-square)](https://packagist.org/packages/spatie/http-status-check) 7 | 8 | This repository provides a tool to check the HTTP status code of every link on a given website. 9 | 10 | ## Support us 11 | 12 | [](https://spatie.be/github-ad-click/http-status-check) 13 | 14 | We invest a lot of resources into creating [best in class open source packages](https://spatie.be/open-source). You can support us by [buying one of our paid products](https://spatie.be/open-source/support-us). 15 | 16 | We highly appreciate you sending us a postcard from your hometown, mentioning which of our package(s) you are using. You'll find our address on [our contact page](https://spatie.be/about-us). We publish all received postcards on [our virtual postcard wall](https://spatie.be/open-source/postcards). 
17 | 18 | ## Installation 19 | 20 | This package can be installed via Composer: 21 | 22 | ```bash 23 | composer global require spatie/http-status-check 24 | ``` 25 | 26 | ## Usage 27 | 28 | This tool will scan all links on a given website: 29 | 30 | ```bash 31 | http-status-check scan https://example.com 32 | ``` 33 | 34 | It outputs a line per link found. Here's an example of a scan of the Laracasts website: 35 | 36 | ![screenshot](https://freek.dev/uploads/2015/11/screenshot.png) 37 | 38 | When the crawling process is finished a summary will be shown. 39 | 40 | By default the crawler uses 10 concurrent connections to speed up the crawling process. You can change that number by passing a different value to the `--concurrency` option: 41 | 42 | ```bash 43 | http-status-check scan https://example.com --concurrency=20 44 | ``` 45 | 46 | You can also write all urls that gave a response other than 2xx or 3xx to a file: 47 | 48 | ```bash 49 | http-status-check scan https://example.com --output=log.txt 50 | ``` 51 | 52 | When the crawler finds a link to an external website it will by default crawl that link as well. If you don't want the crawler to crawl such external urls use the `--dont-crawl-external-links` option: 53 | 54 | ```bash 55 | http-status-check scan https://example.com --dont-crawl-external-links 56 | ``` 57 | 58 | By default, requests time out after 10 seconds. You can change this by passing the number of seconds to the `--timeout` option: 59 | 60 | ```bash 61 | http-status-check scan https://example.com --timeout=30 62 | ``` 63 | 64 | By default, the crawler will respect robots data. 
You can ignore them though with the `--ignore-robots` option: 65 | 66 | ```bash 67 | http-status-check scan https://example.com --ignore-robots 68 | ``` 69 | 70 | If your site requires basic authentication, you can use the `--auth` option: 71 | 72 | ```bash 73 | http-status-check scan https://example.com --auth=username:password 74 | ``` 75 | 76 | ## Testing 77 | 78 | To run the tests, first make sure you have [Node.js](https://nodejs.org/) installed. Then start the included Node-based server in a separate terminal window: 79 | 80 | ```bash 81 | cd tests/server 82 | npm install 83 | node server.js 84 | ``` 85 | 86 | With the server running, you can start testing: 87 | 88 | ```bash 89 | vendor/bin/phpunit 90 | ``` 91 | 92 | ## Changelog 93 | 94 | Please see [CHANGELOG](CHANGELOG.md) for more information on what has changed recently. 95 | 96 | ## Contributing 97 | 98 | Please see [CONTRIBUTING](https://github.com/spatie/.github/blob/main/CONTRIBUTING.md) for details. 99 | 100 | ## Security 101 | 102 | If you've found a bug regarding security please mail [security@spatie.be](mailto:security@spatie.be) instead of using the issue tracker. 103 | 104 | ## Credits 105 | 106 | - [Freek Van der Herten](https://github.com/freekmurze) 107 | - [Sebastian De Deyne](https://github.com/sebastiandedeyne) 108 | - [All Contributors](../../contributors) 109 | 110 | ## License 111 | 112 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information. 
113 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "spatie/http-status-check", 3 | "description": "CLI tool to crawl a website and check HTTP status code", 4 | "homepage": "https://github.com/spatie/http-status-checker", 5 | "license": "MIT", 6 | "keywords": [ 7 | "spatie", 8 | "http", 9 | "status", 10 | "check", 11 | "crawler" 12 | ], 13 | "authors": [ 14 | { 15 | "name": "Freek Van der Herten", 16 | "email": "freek@spatie.be" 17 | }, 18 | { 19 | "name": "Sebastian De Deyne", 20 | "email": "sebastiandedeyne@gmail.com" 21 | } 22 | ], 23 | "require": { 24 | "php": "^8.1", 25 | "symfony/console": "^5.4|^6.0|^7.0", 26 | "spatie/crawler": "^8.0", 27 | "guzzlehttp/promises": "^2.0" 28 | }, 29 | "require-dev": { 30 | "phpunit/phpunit": "^9.5" 31 | }, 32 | "autoload": { 33 | "psr-4": { 34 | "Spatie\\HttpStatusCheck\\": "src" 35 | } 36 | }, 37 | "autoload-dev": { 38 | "psr-4": { 39 | "Spatie\\HttpStatusCheck\\Test\\": "tests" 40 | } 41 | }, 42 | "bin": [ 43 | "http-status-check" 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /http-status-check: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | 3 | run(); 14 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "lockfileVersion": 1 3 | } 4 | -------------------------------------------------------------------------------- /src/ConsoleApplication.php: -------------------------------------------------------------------------------- 1 | add(new ScanCommand()); 16 | } 17 | 18 | public function getLongVersion() 19 | { 20 | return parent::getLongVersion().' 
by Spatie'; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/CrawlLogger.php: -------------------------------------------------------------------------------- 1 | consoleOutput = $consoleOutput; 37 | } 38 | 39 | /** 40 | * Called when the crawl will crawl the url. 41 | */ 42 | public function willCrawl(UriInterface $url, ?string $linkText): void 43 | { 44 | } 45 | 46 | /** 47 | * Called when the crawl has ended. 48 | */ 49 | public function finishedCrawling(): void 50 | { 51 | $this->consoleOutput->writeln(''); 52 | $this->consoleOutput->writeln('Crawling summary'); 53 | $this->consoleOutput->writeln('----------------'); 54 | 55 | ksort($this->crawledUrls); 56 | 57 | foreach ($this->crawledUrls as $statusCode => $urls) { 58 | $colorTag = $this->getColorTagForStatusCode($statusCode); 59 | 60 | $count = count($urls); 61 | 62 | if (is_numeric($statusCode)) { 63 | $this->consoleOutput->writeln("<{$colorTag}>Crawled {$count} url(s) with statuscode {$statusCode}"); 64 | } 65 | 66 | if ($statusCode == static::UNRESPONSIVE_HOST) { 67 | $this->consoleOutput->writeln("<{$colorTag}>{$count} url(s) did have unresponsive host(s)"); 68 | } 69 | } 70 | 71 | $this->consoleOutput->writeln(''); 72 | } 73 | 74 | protected function getColorTagForStatusCode(string $code): string 75 | { 76 | if ($this->startsWith($code, '2')) { 77 | return 'info'; 78 | } 79 | 80 | if ($this->startsWith($code, '3')) { 81 | return 'comment'; 82 | } 83 | 84 | return 'error'; 85 | } 86 | 87 | /** 88 | * @param string|null $haystack 89 | * @param string|array $needles 90 | * 91 | * @return bool 92 | */ 93 | public function startsWith($haystack, $needles): bool 94 | { 95 | foreach ((array) $needles as $needle) { 96 | if ($needle != '' && substr($haystack, 0, strlen($needle)) === (string) $needle) { 97 | return true; 98 | } 99 | } 100 | 101 | return false; 102 | } 103 | 104 | /** 105 | * Set the filename to write the output log. 
106 | * 107 | * @param string $filename 108 | */ 109 | public function setOutputFile($filename) 110 | { 111 | $this->outputFile = $filename; 112 | } 113 | 114 | public function crawled( 115 | UriInterface $url, 116 | ResponseInterface $response, 117 | ?UriInterface $foundOnUrl = null, 118 | ?string $linkText = null, 119 | ): void { 120 | if ($this->addRedirectedResult($url, $response, $foundOnUrl)) { 121 | return; 122 | } 123 | 124 | // response wasnt a redirect so lets add it as a standard result 125 | $this->addResult( 126 | (string) $url, 127 | (string) $foundOnUrl, 128 | $response->getStatusCode(), 129 | $response->getReasonPhrase() 130 | ); 131 | } 132 | 133 | public function crawlFailed( 134 | UriInterface $url, 135 | RequestException $requestException, 136 | ?UriInterface $foundOnUrl = null, 137 | ?string $linkText = null, 138 | ): void { 139 | if ($response = $requestException->getResponse()) { 140 | $this->crawled($url, $response, $foundOnUrl); 141 | } else { 142 | $this->addResult((string) $url, (string) $foundOnUrl, '---', self::UNRESPONSIVE_HOST); 143 | } 144 | } 145 | 146 | public function addResult($url, $foundOnUrl, $statusCode, $reason) 147 | { 148 | /* 149 | * don't display duplicate results 150 | * this happens if a redirect is followed to an existing page 151 | */ 152 | if (isset($this->crawledUrls[$statusCode]) && in_array($url, $this->crawledUrls[$statusCode])) { 153 | return; 154 | } 155 | 156 | $colorTag = $this->getColorTagForStatusCode($statusCode); 157 | 158 | $timestamp = date('Y-m-d H:i:s'); 159 | 160 | $message = "{$statusCode} {$reason} - ".(string) $url; 161 | 162 | if ($foundOnUrl && $colorTag === 'error') { 163 | $message .= " (found on {$foundOnUrl})"; 164 | } 165 | 166 | if ($this->outputFile && $colorTag === 'error') { 167 | file_put_contents($this->outputFile, $message.PHP_EOL, FILE_APPEND); 168 | } 169 | 170 | $this->consoleOutput->writeln("<{$colorTag}>[{$timestamp}] {$message}"); 171 | 172 | $this->crawledUrls[$statusCode][] 
= $url; 173 | } 174 | 175 | /* 176 | * https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests 177 | */ 178 | public function addRedirectedResult( 179 | UriInterface $url, 180 | ResponseInterface $response, 181 | ?UriInterface $foundOnUrl = null 182 | ) { 183 | // if its not a redirect the return false 184 | if (! $response->getHeader('X-Guzzle-Redirect-History')) { 185 | return false; 186 | } 187 | 188 | // retrieve Redirect URI history 189 | $redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); 190 | 191 | // retrieve Redirect HTTP Status history 192 | $redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); 193 | 194 | // Add the initial URI requested to the (beginning of) URI history 195 | array_unshift($redirectUriHistory, (string) $url); 196 | 197 | // Add the final HTTP status code to the end of HTTP response history 198 | array_push($redirectCodeHistory, $response->getStatusCode()); 199 | 200 | // Combine the items of each array into a single result set 201 | $fullRedirectReport = []; 202 | foreach ($redirectUriHistory as $key => $value) { 203 | $fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]]; 204 | } 205 | 206 | // Add the redirects and final URL as results 207 | foreach ($fullRedirectReport as $k => $redirect) { 208 | $this->addResult( 209 | (string) $redirect['location'], 210 | (string) $foundOnUrl, 211 | $redirect['code'], 212 | $k + 1 == count($fullRedirectReport) ? 
$response->getReasonPhrase() : self::REDIRECT 213 | ); 214 | } 215 | 216 | return true; 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/ScanCommand.php: -------------------------------------------------------------------------------- 1 | setName('scan') 21 | ->setDescription('Check the http status code of all links on a website.') 22 | ->addArgument( 23 | 'url', 24 | InputArgument::REQUIRED, 25 | 'The url to check' 26 | ) 27 | ->addOption( 28 | 'concurrency', 29 | 'c', 30 | InputOption::VALUE_REQUIRED, 31 | 'The amount of concurrent connections to use', 32 | 10 33 | ) 34 | ->addOption( 35 | 'output', 36 | 'o', 37 | InputOption::VALUE_REQUIRED, 38 | 'Log all non-2xx and non-3xx responses in this file' 39 | ) 40 | ->addOption( 41 | 'dont-crawl-external-links', 42 | 'x', 43 | InputOption::VALUE_NONE, 44 | 'Dont crawl external links' 45 | ) 46 | ->addOption( 47 | 'timeout', 48 | 't', 49 | InputOption::VALUE_OPTIONAL, 50 | 'The maximum number of seconds the request can take', 51 | 10 52 | ) 53 | ->addOption( 54 | 'user-agent', 55 | 'u', 56 | InputOption::VALUE_OPTIONAL, 57 | 'The User Agent to pass for the request', 58 | '' 59 | ) 60 | ->addOption( 61 | 'skip-verification', 62 | 's', 63 | InputOption::VALUE_NONE, 64 | 'Skips checking the SSL certificate' 65 | ) 66 | ->addOption( 67 | 'auth', 68 | 'a', 69 | InputOption::VALUE_OPTIONAL, 70 | 'Username and password for basic auth (username:password)' 71 | ) 72 | ->addOption( 73 | 'options', 74 | 'opt', 75 | InputOption::VALUE_IS_ARRAY | InputOption::VALUE_OPTIONAL, 76 | 'Additional options to the request', 77 | [] 78 | ) 79 | ->addOption( 80 | 'ignore-robots', 81 | null, 82 | InputOption::VALUE_NONE, 83 | 'Ignore robots checks' 84 | ); 85 | } 86 | 87 | /** 88 | * @param \Symfony\Component\Console\Input\InputInterface $input 89 | * @param \Symfony\Component\Console\Output\OutputInterface $output 90 | * 91 | * @return int 92 | */ 93 | protected function 
execute(InputInterface $input, OutputInterface $output) 94 | { 95 | $baseUrl = $input->getArgument('url'); 96 | $crawlProfile = $input->getOption('dont-crawl-external-links') ? new CrawlInternalUrls($baseUrl) : new CrawlAllUrls(); 97 | 98 | $output->writeln("Start scanning {$baseUrl}"); 99 | $output->writeln(''); 100 | 101 | $crawlLogger = new CrawlLogger($output); 102 | 103 | if ($input->getOption('output')) { 104 | $outputFile = $input->getOption('output'); 105 | 106 | if (file_exists($outputFile)) { 107 | $helper = $this->getHelper('question'); 108 | $question = new ConfirmationQuestion( 109 | "The output file `{$outputFile}` already exists. Overwrite it? (y/n)", 110 | false 111 | ); 112 | 113 | if (! $helper->ask($input, $output, $question)) { 114 | $output->writeln('Aborting...'); 115 | 116 | return 0; 117 | } 118 | 119 | unlink($outputFile); 120 | } 121 | 122 | $crawlLogger->setOutputFile($input->getOption('output')); 123 | } 124 | 125 | $clientOptions = [ 126 | RequestOptions::TIMEOUT => $input->getOption('timeout'), 127 | RequestOptions::VERIFY => ! 
$input->getOption('skip-verification'), 128 | RequestOptions::ALLOW_REDIRECTS => [ 129 | 'track_redirects' => true, 130 | ], 131 | ]; 132 | 133 | $clientOptions = array_merge($clientOptions, $input->getOption('options')); 134 | 135 | if ($input->getOption('auth') && false !== strpos($input->getOption('auth'), ':')) { 136 | $clientOptions[RequestOptions::AUTH] = explode(':', $input->getOption('auth')); 137 | } 138 | 139 | if ($input->getOption('user-agent')) { 140 | $clientOptions[RequestOptions::HEADERS]['user-agent'] = $input->getOption('user-agent'); 141 | } 142 | 143 | $crawler = Crawler::create($clientOptions) 144 | ->setConcurrency($input->getOption('concurrency')) 145 | ->setCrawlObserver($crawlLogger) 146 | ->setCrawlProfile($crawlProfile); 147 | 148 | if ($input->getOption('ignore-robots')) { 149 | $crawler->ignoreRobots(); 150 | } 151 | 152 | $crawler->startCrawling($baseUrl); 153 | 154 | return 0; 155 | } 156 | } 157 | --------------------------------------------------------------------------------