├── .github ├── dependabot.yml └── workflows │ └── main.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── components-graph.png └── schema.png ├── build ├── .gitkeep └── logs │ └── .gitkeep ├── composer.json ├── composer.lock ├── phpunit.xml ├── source ├── Directive.php ├── LogsIfAvailableTrait.php ├── Parser │ ├── DirectiveProcessors │ │ ├── AbstractAllowanceProcessor.php │ │ ├── AbstractDirectiveProcessor.php │ │ ├── AllowProcessor.php │ │ ├── CacheDelayProcessor.php │ │ ├── CleanParamProcessor.php │ │ ├── CrawlDelayProcessor.php │ │ ├── DirectiveProcessorInterface.php │ │ ├── DisallowProcessor.php │ │ ├── HostProcessor.php │ │ ├── SitemapProcessor.php │ │ └── UserAgentProcessor.php │ ├── DirectiveProcessorsFactory.php │ ├── HostName.php │ ├── TreeBuilder.php │ ├── TreeBuilderInterface.php │ ├── Url.php │ └── UserAgent │ │ ├── UserAgentMatcher.php │ │ └── UserAgentMatcherInterface.php ├── RobotsTxtParser.php ├── Stream │ ├── CustomFilterInterface.php │ ├── Filters │ │ ├── SkipCommentedLinesFilter.php │ │ ├── SkipDirectivesWithInvalidValuesFilter.php │ │ ├── SkipEmptyLinesFilter.php │ │ ├── SkipEndOfCommentedLineFilter.php │ │ ├── SkipUnsupportedDirectivesFilter.php │ │ └── TrimSpacesLeftFilter.php │ ├── GeneratorBasedReader.php │ └── ReaderInterface.php └── WarmingMessages.php └── test ├── AllowTest.php ├── AtSymbolTest.php ├── CommentsTest.php ├── Directives ├── CacheDelayTest.php ├── CleanParamTest.php ├── CrawlDelayTest.php ├── HostTest.php └── SitemapsTest.php ├── DisallowAllTest.php ├── DisallowUppercasePathTest.php ├── EmptyRulesShouldAllowEverythingTest.php ├── EncodingTest.php ├── EndAnchorTest.php ├── Fixtures ├── allow-all.txt ├── allow-spec.txt ├── cache-delay-spec.txt ├── crawl-delay-spec.txt ├── disallow-all.txt ├── expected-skipped-lines-log.php ├── large-commented-lines.txt ├── market-yandex-Windows-1251.txt ├── market-yandex-ru.txt ├── wikipedia-org.txt ├── with-clean-param.txt ├── with-commented-line-endings.txt ├── 
with-commented-lines.txt ├── with-empty-and-whitespace.txt ├── with-empty-lines.txt ├── with-empty-rules.txt ├── with-faulty-directives.txt ├── with-hosts.txt ├── with-invalid-request-rate.txt └── with-sitemaps.txt ├── HttpStatusCodeTest.php ├── InvalidPathTest.php ├── Parser ├── DirectivesProcessors │ ├── CleanParamProcessorTest.php │ ├── CrawlDelayProcessorTest.php │ ├── HostProcessorTest.php │ ├── SitemapProcessorTest.php │ └── UserAgentProcessorTest.php └── UserAgent │ └── UserAgentMatcherTest.php ├── RenderTest.php ├── RobotsTxtParserTest.php ├── Stream ├── Filter │ ├── SkipCommentedLinesFilterTest.php │ ├── SkipDirectivesWithInvalidValuesFilterTest.php │ ├── SkipEmptyLinesFilterTest.php │ ├── SkipEndOfCommentedLineFilterTest.php │ ├── SkipUnsupportedDirectivesTest.php │ └── TrimSpacesLeftAndRightFilterTest.php └── ReaderTest.php ├── UnlistedPathTest.php ├── UserAgentTest.php ├── WhitespacesTest.php └── bootstrap.php /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "composer" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Triggers the workflow on push or pull request events but only for the master branch 5 | push: 6 | branches: [ master ] 7 | pull_request: 8 | branches: [ master ] 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | env: 14 | ACTIONS_ALLOW_UNSECURE_COMMANDS: true 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - name: Cache Composer dependencies 20 | uses: actions/cache@v2 21 | with: 22 | path: /tmp/composer-cache 23 | key: ${{ runner.os }}-${{ hashFiles('**/composer.lock') }} 24 | 25 | # @link https://github.com/php-actions/composer 26 | - uses: php-actions/composer@v5 27 | with: 28 | 
php_version: 7.4 29 | php_extensions: mbstring iconv json xdebug 30 | version: 1 31 | 32 | # @link https://github.com/php-actions/example-phpunit 33 | - name: "phpunit" 34 | uses: php-actions/phpunit@v2 35 | with: 36 | php_version: 7.4 37 | php_extensions: mbstring iconv json xdebug 38 | bootstrap: test/bootstrap.php 39 | configuration: phpunit.xml 40 | args: --coverage-clover clover.xml 41 | env: 42 | XDEBUG_MODE: coverage 43 | 44 | - name: "Send code coverage report to Codecov.io" 45 | env: 46 | CODECOV_TOKEN: "${{ secrets.CODECOV_TOKEN }}" 47 | run: "bash <(curl -s https://codecov.io/bash) || true" 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/* 2 | .idea/ 3 | composer.phar 4 | codeclimate.json 5 | build/logs/* 6 | .phpunit.result.cache 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM php:7.4-cli-alpine 2 | 3 | RUN apk add gnu-libiconv --update-cache --repository http://dl-cdn.alpinelinux.org/alpine/edge/testing/ --allow-untrusted 4 | ENV LD_PRELOAD /usr/lib/preloadable_libiconv.so php 5 | 6 | # install composer 7 | RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2013 Igor Timoshenkov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Robots.txt php parser class 2 | ===================== 3 | 4 | [![Build Status](https://travis-ci.org/t1gor/Robots.txt-Parser-Class.svg?branch=master)](https://travis-ci.org/t1gor/Robots.txt-Parser-Class) [![Code Climate](https://codeclimate.com/github/t1gor/Robots.txt-Parser-Class/badges/gpa.svg)](https://codeclimate.com/github/t1gor/Robots.txt-Parser-Class) [![Test Coverage](https://codeclimate.com/github/t1gor/Robots.txt-Parser-Class/badges/coverage.svg)](https://codeclimate.com/github/t1gor/Robots.txt-Parser-Class) [![License](https://poser.pugx.org/t1gor/robots-txt-parser/license.svg)](https://packagist.org/packages/t1gor/robots-txt-parser) [![Total Downloads](https://poser.pugx.org/t1gor/robots-txt-parser/downloads.svg)](https://packagist.org/packages/t1gor/robots-txt-parser) 5 | 6 | PHP class to parse robots.txt rules according to Google, Yandex, W3C and The Web Robots Pages specifications. 
7 | 8 | The full list of supported specifications (and what's not supported yet) is available in our [Wiki](https://github.com/t1gor/Robots.txt-Parser-Class/wiki/Specifications). 9 | 10 | ### Supported directives: 11 | 12 | - User-agent 13 | - Allow 14 | - Disallow 15 | - Sitemap 16 | - Host 17 | - Cache-delay 18 | - Clean-param 19 | - Crawl-delay 20 | - Request-rate (in progress) 21 | - Visit-time (in progress) 22 | 23 | ### Installation 24 | The library is available as a Composer package. To install it, add the requirement to your `composer.json` file by running: 25 | 26 | ```sh 27 | composer require t1gor/robots-txt-parser 28 | ``` 29 | 30 | You can find out more about Composer here: https://getcomposer.org/ 31 | 32 | ### Usage example 33 | 34 | ###### Creating parser instance 35 | 36 | ```php 37 | use t1gor\RobotsTxtParser\RobotsTxtParser; 38 | 39 | # from string 40 | $parser = new RobotsTxtParser("User-agent: * \nDisallow: /"); 41 | 42 | # from local file 43 | $parser = new RobotsTxtParser(fopen('some/robots.txt', 'r')); 44 | 45 | # or a remote one (make sure it's allowed in your php.ini) 46 | # even FTP should work (but this is not confirmed) 47 | $parser = new RobotsTxtParser(fopen('http://example.com/robots.txt', 'r')); 48 | ``` 49 | 50 | ###### Logging parsing process 51 | 52 | We are implementing `LoggerAwareInterface` from `PSR`, so it should work out of the box with any logger supporting that standard. 
Please see below for a Monolog example with a Telegram bot: 53 | 54 | ```php 55 | use Monolog\Handler\TelegramBotHandler; 56 | use Monolog\Logger; 57 | use PHPUnit\Framework\TestCase; 58 | use Psr\Log\LogLevel; 59 | use t1gor\RobotsTxtParser\RobotsTxtParser; 60 | 61 | $monologLogger = new Logger('robot.txt-parser'); 62 | $monologLogger->pushHandler(new TelegramBotHandler('api-key', 'channel')); 63 | 64 | $parser = new RobotsTxtParser(fopen('some/robots.txt', 'r')); 65 | $parser->setLogger($monologLogger); 66 | ``` 67 | 68 | Most log entries we have are of `LogLevel::DEBUG`, but there might also be some of `LogLevel::WARNING` where it is appropriate. 69 | 70 | ###### Parsing non UTF-8 encoded files 71 | 72 | ```php 73 | use t1gor\RobotsTxtParser\RobotsTxtParser; 74 | 75 | /** @see EncodingTest for more details */ 76 | $parser = new RobotsTxtParser(fopen('market-yandex-Windows-1251.txt', 'r'), 'Windows-1251'); 77 | ``` 78 | 79 | ### Public API 80 | 81 | | Method | Params | Returns | Description | 82 | | ------ | ------ | ------ | ----------- | 83 | | `setLogger` | `Psr\Log\LoggerInterface $logger` | `void` | | 84 | | `getLogger` | `-` | `Psr\Log\LoggerInterface` | | 85 | | `setHttpStatusCode` | `int $code` | `bool` | Set HTTP response code for allowance checks; returns `false` if the code is outside 100-599 | 86 | | `isAllowed` | `string $url, ?string $userAgent` | `bool` | If no `$userAgent` is passed, will return for `*` | 87 | | `isDisallowed` | `string $url, ?string $userAgent` | `bool` | If no `$userAgent` is passed, will return for `*` | 88 | | `getDelay` | `string $userAgent, string $type = 'crawl-delay'` | `float` | Get any of the delays, e.g. `Crawl-delay`, `Cache-delay`, etc. 
| 89 | | `getCleanParam` | `-` | `[ string => string[] ]` | Where key is the path, and values are params | 90 | | `getRules` | `?string $userAgent` | `array` | Get the rules the parser read in a tree-like structure | 91 | | `getHost` | `?string $userAgent` | `string[]` or `string` or `null` | If no `$userAgent` is passed, will return all | 92 | | `getSitemaps` | `?string $userAgent` | `string[]` | If no `$userAgent` is passed, will return all | 93 | | `getContent` | `-` | `string` | The content that was parsed. | 94 | | `getLog` | `-` | `[]` | **Deprecated.** Please use PSR logger as described above. | 95 | | `render` | `-` | `string` | **Deprecated.** Please use `getContent` instead. | 96 | 97 | Even more code samples can be found in the [tests folder](https://github.com/t1gor/Robots.txt-Parser-Class/tree/master/test). 98 | 99 | **Some useful links and materials:** 100 | * [Google: Robots.txt Specifications](https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt) 101 | * [Yandex: Using robots.txt](http://help.yandex.com/webmaster/?id=1113851) 102 | * [The Web Robots Pages](http://www.robotstxt.org/) 103 | * [W3C Recommendation](https://www.w3.org/TR/html4/appendix/notes.html#h-B.4.1.2) 104 | * [Some inspirational code](http://socoder.net/index.php?snippet=23824), and [some more](http://www.the-art-of-web.com/php/parse-robots/) 105 | * [Google Webmaster tools Robots.txt testing tool](https://www.google.com/webmasters/tools/robots-testing-tool) 106 | 107 | ### Contributing 108 | First of all - thank you for your interest and desire to help! If you found an issue and know how to fix it, please submit a pull request to the dev branch. Please do not forget the following: 109 | - Your fixed issue should be covered with tests (we are using PHPUnit) 110 | - Please mind the [code climate](https://codeclimate.com/github/t1gor/Robots.txt-Parser-Class) recommendations. 
It somehow helps to keep things simpler, or at least seems to :) 111 | - Following the coding standard would also be much appreciated (4 tabs as an indent, camelCase, etc.) 112 | 113 | I would really appreciate it if you could share a link to your project that uses the library. 114 | 115 | License 116 | ------- 117 | 118 | The MIT License 119 | 120 | Copyright (c) 2013 Igor Timoshenkov 121 | 122 | Permission is hereby granted, free of charge, to any person obtaining a copy 123 | of this software and associated documentation files (the "Software"), to deal 124 | in the Software without restriction, including without limitation the rights 125 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 126 | copies of the Software, and to permit persons to whom the Software is 127 | furnished to do so, subject to the following conditions: 128 | 129 | The above copyright notice and this permission notice shall be included in 130 | all copies or substantial portions of the Software. 131 | 132 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 133 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 134 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 135 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 136 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 137 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 138 | THE SOFTWARE. 
139 | -------------------------------------------------------------------------------- /assets/components-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t1gor/Robots.txt-Parser-Class/531095ed96e86c0dd11e21bc9fb93a9acf9902d0/assets/components-graph.png -------------------------------------------------------------------------------- /assets/schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t1gor/Robots.txt-Parser-Class/531095ed96e86c0dd11e21bc9fb93a9acf9902d0/assets/schema.png -------------------------------------------------------------------------------- /build/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t1gor/Robots.txt-Parser-Class/531095ed96e86c0dd11e21bc9fb93a9acf9902d0/build/.gitkeep -------------------------------------------------------------------------------- /build/logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t1gor/Robots.txt-Parser-Class/531095ed96e86c0dd11e21bc9fb93a9acf9902d0/build/logs/.gitkeep -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "t1gor/robots-txt-parser", 3 | "description": "PHP class to parse robots.txt rules according to Google, Yandex, W3C and The Web Robots Pages specifications.", 4 | "keywords": [ 5 | "robots.txt", 6 | "parser", 7 | "Google", 8 | "Yandex", 9 | "W3C", 10 | "The Web Robots Pages" 11 | ], 12 | "homepage": "https://github.com/t1gor/Robots.txt-Parser-Class", 13 | "type": "library", 14 | "license": "MIT", 15 | "require": { 16 | "php": ">=7.4", 17 | "ext-mbstring": "*", 18 | "ext-iconv": "*", 19 | "vipnytt/useragentparser": "^1.0", 20 | "psr/log": "^1.1" 21 | }, 
22 | "require-dev": { 23 | "ext-json": "*", 24 | "codeclimate/php-test-reporter": ">=0.2", 25 | "phpunit/phpunit": "^9", 26 | "monolog/monolog": "^2.3" 27 | }, 28 | "authors": [ 29 | { 30 | "name": "Igor Timoshenkov", 31 | "email": "igor.timoshenkov@gmail.com", 32 | "role": "creator" 33 | }, 34 | { 35 | "name": "Jan-Petter Gundersen", 36 | "email": "jpg@vipnytt.no", 37 | "role": "contributor" 38 | } 39 | ], 40 | "autoload": { 41 | "psr-4": { 42 | "t1gor\\RobotsTxtParser\\": "source" 43 | } 44 | }, 45 | "scripts": { 46 | "test": "phpunit" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | test 6 | 7 | 8 | 9 | 10 | source 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /source/Directive.php: -------------------------------------------------------------------------------- 1 | logger = $logger; 14 | } 15 | 16 | protected function log(string $message, array $context = [], string $level = LogLevel::DEBUG) { 17 | if (!is_null($this->logger)) { 18 | $this->logger->log($level, $message, $context); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/AbstractAllowanceProcessor.php: -------------------------------------------------------------------------------- 1 | getDirectiveName(); 11 | 12 | if (empty($entry)) { 13 | $this->log(strtr('{directive} with empty value found for {useragent}, skipping', [ 14 | '{directive}' => $directive, 15 | '{useragent}' => $currentUserAgent, 16 | ])); 17 | 18 | return; 19 | } 20 | 21 | if (!preg_match("/^\//", $entry)) { 22 | $this->log(strtr('{directive} with invalid value "{faulty}" found for {useragent}, skipping', [ 23 | '{directive}' => $directive, 24 | '{faulty}' => $entry, 25 | '{useragent}' => 
$currentUserAgent, 26 | ])); 27 | 28 | return; 29 | } 30 | 31 | if (!isset($root[$currentUserAgent][$directive])) { 32 | $root[$currentUserAgent][$directive] = []; 33 | } 34 | 35 | if (!in_array($entry, $root[$currentUserAgent][$directive])) { 36 | $root[$currentUserAgent][$directive][] = $entry; 37 | } else { 38 | $this->log(strtr('{directive} with value {faulty} skipped as already exists for {useragent}', [ 39 | '{directive}' => $directive, 40 | '{faulty}' => $entry, 41 | '{useragent}' => $currentUserAgent, 42 | ])); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/AbstractDirectiveProcessor.php: -------------------------------------------------------------------------------- 1 | logger = $logger; 15 | } 16 | 17 | public function getLogger(): ?LoggerInterface { 18 | return $this->logger; 19 | } 20 | 21 | public function matches(string $line): bool { 22 | return (bool) preg_match('/^' . $this->getDirectiveName() . 
'\s*:\s+/isu', $line); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/AllowProcessor.php: -------------------------------------------------------------------------------- 1 | log('{directive} with value {faulty} dropped as invalid', [ 19 | '{directive}' => Directive::CACHE_DELAY, 20 | '{faulty}' => $parts[1], 21 | ]); 22 | return; 23 | } 24 | 25 | $root[$currentUserAgent][Directive::CACHE_DELAY] = $filteredCacheDelay; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/CleanParamProcessor.php: -------------------------------------------------------------------------------- 1 | log(strtr('{directive} with value {faulty} dropped as invalid for {useragent}', [ 20 | '{directive}' => Directive::CRAWL_DELAY, 21 | '{faulty}' => $entry, 22 | '{useragent}' => $currentUserAgent 23 | ])); 24 | 25 | return; 26 | } 27 | 28 | $root[$currentUserAgent][Directive::CRAWL_DELAY] = $filteredCrawlDelay; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/DirectiveProcessorInterface.php: -------------------------------------------------------------------------------- 1 | log(strtr('{directive} with value {faulty} dropped for {useragent} as invalid{ipAddress}', [ 26 | '{directive}' => Directive::HOST, 27 | '{faulty}' => $entry, 28 | '{useragent}' => $currentUserAgent, 29 | '{ipAddress}' => HostName::isIpAddress($entry) ? 
' (IP address is not a valid hostname)' : '', 30 | ])); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/SitemapProcessor.php: -------------------------------------------------------------------------------- 1 | log(strtr('{directive} with value {faulty} skipped as already exists for {useragent}', [ 27 | '{directive}' => Directive::SITEMAP, 28 | '{faulty}' => $entry, 29 | '{useragent}' => $currentUserAgent, 30 | ])); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessors/UserAgentProcessor.php: -------------------------------------------------------------------------------- 1 | log('New useragent is equal to current one, skipping ...'); 23 | return; 24 | } 25 | 26 | $currentUserAgent = trim($parts[1]); 27 | 28 | if (!isset($root[$currentUserAgent])) { 29 | $root[$currentUserAgent] = []; 30 | } 31 | 32 | // if one user-agent is followed by another one - just link them 33 | if ($this->matches($prevLine)) { 34 | $prevParts = explode(':', $prevLine); 35 | $pervLineUserAgent = trim($prevParts[1]); 36 | 37 | $root[$pervLineUserAgent] = & $root[$currentUserAgent]; 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /source/Parser/DirectiveProcessorsFactory.php: -------------------------------------------------------------------------------- 1 | logger = $logger; 26 | 27 | if (empty($processors)) { 28 | $this->log("Seems like you've passed an empty processors array.", [], LogLevel::WARNING); 29 | } 30 | 31 | // reformat processors 32 | foreach ($processors as $processor) { 33 | $this->processors[$processor->getDirectiveName()] = $processor; 34 | } 35 | } 36 | 37 | /** 38 | * Wrapper to check that processor is available 39 | */ 40 | protected function processDirective(string $directive, string $line, &$tree, string &$userAgent, string $prevLine = '') 
{ 41 | if (!isset($this->processors[$directive])) { 42 | $this->log(strtr('{directive} met, but no processor found for it. Skipping.', [ 43 | '{directive}' => $directive, 44 | ])); 45 | return; 46 | } 47 | 48 | $this->processors[$directive]->process($line, $tree, $userAgent, $prevLine); 49 | } 50 | 51 | /** 52 | * @return \Iterator 53 | */ 54 | public function getContent(): \Iterator { 55 | return $this->content; 56 | } 57 | 58 | /** 59 | * @param \Iterator $content 60 | */ 61 | public function setContent(\Iterator $content): void { 62 | $this->content = $content; 63 | } 64 | 65 | /** 66 | * @return array 67 | * @todo check for multibyte support? 68 | */ 69 | public function build(): array { 70 | $currentUserAgent = '*'; 71 | $tree = []; 72 | $prevLine = ''; 73 | 74 | $this->log('Building directives tree...'); 75 | 76 | foreach ($this->content as $line) { 77 | foreach ($this->processors as $processor) { 78 | if ($processor->matches($line)) { 79 | $this->processDirective( 80 | $processor->getDirectiveName(), 81 | $line, 82 | $tree, 83 | $currentUserAgent, 84 | $prevLine 85 | ); 86 | break; 87 | } 88 | } 89 | 90 | // override 91 | $prevLine = $line; 92 | } 93 | 94 | return $tree; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /source/Parser/TreeBuilderInterface.php: -------------------------------------------------------------------------------- 1 | in = $this->encode(trim($in)); 16 | } 17 | 18 | /** 19 | * URL encoder according to RFC 3986 20 | * Returns a string containing the encoded URL with disallowed characters converted to their percentage encodings. 21 | * 22 | * @link http://publicmind.in/blog/url-encoding/ 23 | * 24 | * @param string $url 25 | * 26 | * @return string string 27 | */ 28 | protected function encode(string $url): string { 29 | $reserved = [ 30 | ':' => '!%3A!ui', 31 | '/' => '!%2F!ui', 32 | '?' 
=> '!%3F!ui', 33 | '#' => '!%23!ui', 34 | '[' => '!%5B!ui', 35 | ']' => '!%5D!ui', 36 | '@' => '!%40!ui', 37 | '!' => '!%21!ui', 38 | '$' => '!%24!ui', 39 | '&' => '!%26!ui', 40 | "'" => '!%27!ui', 41 | '(' => '!%28!ui', 42 | ')' => '!%29!ui', 43 | '*' => '!%2A!ui', 44 | '+' => '!%2B!ui', 45 | ',' => '!%2C!ui', 46 | ';' => '!%3B!ui', 47 | '=' => '!%3D!ui', 48 | '%' => '!%25!ui', 49 | ]; 50 | 51 | return preg_replace(array_values($reserved), array_keys($reserved), rawurlencode($url)); 52 | } 53 | 54 | public static function isValidScheme(string $scheme): bool { 55 | return in_array($scheme, ['http', 'https', 'ftp', 'sftp']); 56 | } 57 | 58 | /** 59 | * Parse URL 60 | * 61 | * @param string $url 62 | * 63 | * @return array|false 64 | */ 65 | protected function parse(string $url) { 66 | $parsed = parse_url($url); 67 | 68 | if ($parsed === false) { 69 | $this->log("Failed to parse URL from {$url}"); 70 | 71 | return false; 72 | } 73 | 74 | if (!isset($parsed['scheme']) || !static::isValidScheme($parsed['scheme'])) { 75 | $this->log("URL scheme invalid or missing for {$url}"); 76 | 77 | return false; 78 | } 79 | 80 | if (!isset($parsed['host']) || !HostName::isValid($parsed['host'])) { 81 | $this->log("URL host invalid or missing for {$url}"); 82 | 83 | return false; 84 | } 85 | 86 | if (!isset($parsed['port'])) { 87 | $parsed['port'] = getservbyname($parsed['scheme'], 'tcp'); 88 | 89 | if (!is_int($parsed['port'])) { 90 | $this->log("URL port should be a number, {$parsed['port']} found for {$url}"); 91 | 92 | return false; 93 | } 94 | } 95 | 96 | $parsed['custom'] = ($parsed['path'] ?? '/') . (isset($parsed['query']) ? '?' . 
$parsed['query'] : ''); 97 | 98 | return $parsed; 99 | } 100 | 101 | public function getPath() { 102 | $parsed = $this->parse($this->in); 103 | 104 | if ($parsed !== false) { 105 | return $parsed['custom']; 106 | } 107 | 108 | return $this->in; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /source/Parser/UserAgent/UserAgentMatcher.php: -------------------------------------------------------------------------------- 1 | logger = $logger; 18 | } 19 | 20 | public function getMatching(string $userAgent, array $available = []): string { 21 | if ($userAgent === '*') { 22 | return $userAgent; 23 | } 24 | 25 | $uaParser = new UserAgentParser($userAgent); 26 | $userAgentMatch = $uaParser->getMostSpecific($available); 27 | 28 | if (false !== $userAgentMatch) { 29 | $this->log("Matched {$userAgentMatch} for user agent {$userAgent}"); 30 | return $userAgentMatch; 31 | } 32 | 33 | $this->log("Failed to match user agent '{$userAgent}', falling back to '*'"); 34 | return '*'; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /source/Parser/UserAgent/UserAgentMatcherInterface.php: -------------------------------------------------------------------------------- 1 | 22 | * @author Jan-Petter Gundersen 23 | * 24 | * Logic schema and signals: 25 | * @link https://docs.google.com/document/d/1_rNjxpnUUeJG13ap6cnXM6Sx9ZQtd1ngADXnW9SHJSE 26 | * 27 | * Specifications: 28 | * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt 29 | * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml 30 | * @link http://www.robotstxt.org/ 31 | * @link http://www.w3.org/TR/html4/appendix/notes.html 32 | * 33 | * Useful links and materials: 34 | * @link http://www.the-art-of-web.com/php/parse-robots/ 35 | * @link http://socoder.net/index.php?snippet=23824 36 | */ 37 | class RobotsTxtParser implements LoggerAwareInterface { 38 | 39 | use 
LogsIfAvailableTrait; 40 | 41 | // default encoding 42 | const DEFAULT_ENCODING = 'UTF-8'; 43 | 44 | // rules set 45 | protected $rules = []; 46 | 47 | // host set 48 | protected $host = null; 49 | 50 | // robots.txt http status code 51 | protected ?int $httpStatusCode; 52 | 53 | // url 54 | private $url = null; 55 | 56 | // UserAgent 57 | private $userAgent = '*'; 58 | 59 | // robots.txt file content 60 | private $content = ''; 61 | private string $encoding = ''; 62 | 63 | private array $tree = []; 64 | private ?ReaderInterface $reader; 65 | private ?TreeBuilderInterface $treeBuilder; 66 | private ?UserAgentMatcherInterface $userAgentMatcher; 67 | 68 | public function __construct( 69 | $content, 70 | string $encoding = self::DEFAULT_ENCODING, 71 | ?TreeBuilderInterface $treeBuilder = null, 72 | ?ReaderInterface $reader = null, 73 | ?UserAgentMatcherInterface $userAgentMatcher = null 74 | ) { 75 | $this->treeBuilder = $treeBuilder; 76 | $this->reader = $reader; 77 | $this->encoding = $encoding; 78 | $this->userAgentMatcher = $userAgentMatcher; 79 | 80 | if (is_null($this->reader)) { 81 | $this->log('Reader is not passed, using a default one...'); 82 | 83 | $this->reader = is_resource($content) 84 | ? 
GeneratorBasedReader::fromStream($content) 85 | : GeneratorBasedReader::fromString($content); 86 | } 87 | 88 | if (is_null($this->userAgentMatcher)) { 89 | $this->log('UserAgentMatcher is not passed, using a default one...'); 90 | 91 | $this->userAgentMatcher = new UserAgentMatcher(); 92 | } 93 | } 94 | 95 | private function buildTree() { 96 | if (!empty($this->tree)) { 97 | return; 98 | } 99 | 100 | if ($this->encoding !== static::DEFAULT_ENCODING) { 101 | $this->reader->setEncoding($this->encoding); 102 | } 103 | 104 | // construct a tree builder if not passed 105 | if (is_null($this->treeBuilder)) { 106 | $this->log('Creating a default tree builder as none passed...'); 107 | 108 | $this->treeBuilder = new TreeBuilder( 109 | DirectiveProcessorsFactory::getDefault($this->logger), 110 | $this->logger 111 | ); 112 | } 113 | 114 | $this->treeBuilder->setContent($this->reader->getContentIterated()); 115 | $this->tree = $this->treeBuilder->build(); 116 | } 117 | 118 | public function getLogger(): ?LoggerInterface { 119 | return $this->logger; 120 | } 121 | 122 | public function setLogger(LoggerInterface $logger): void { 123 | $this->logger = $logger; 124 | 125 | if ($this->reader instanceof LoggerAwareInterface) { 126 | $this->reader->setLogger($this->logger); 127 | } 128 | 129 | if ($this->userAgentMatcher instanceof LoggerAwareInterface) { 130 | $this->userAgentMatcher->setLogger($this->logger); 131 | } 132 | } 133 | 134 | private static function isValidHostName(string $host): bool { 135 | return HostName::isValid($host); 136 | } 137 | 138 | /** 139 | * Validate URL scheme 140 | * 141 | * @param string $scheme 142 | * 143 | * @return bool 144 | */ 145 | private static function isValidScheme($scheme) { 146 | return Url::isValidScheme($scheme); 147 | } 148 | 149 | /** 150 | * Parse URL 151 | * 152 | * @param string $url 153 | * 154 | * @return array|false 155 | */ 156 | protected function parseURL($url) { 157 | $parsed = parse_url($url); 158 | if ($parsed === false) { 
159 | return false; 160 | } elseif (!isset($parsed['scheme']) || !$this->isValidScheme($parsed['scheme'])) { 161 | return false; 162 | } else { 163 | if (!isset($parsed['host']) || !$this->isValidHostName($parsed['host'])) { 164 | return false; 165 | } else { 166 | if (!isset($parsed['port'])) { 167 | $parsed['port'] = getservbyname($parsed['scheme'], 'tcp'); 168 | if (!is_int($parsed['port'])) { 169 | return false; 170 | } 171 | } 172 | } 173 | } 174 | $parsed['custom'] = (isset($parsed['path']) ? $parsed['path'] : '/') . (isset($parsed['query']) ? '?' . $parsed['query'] : ''); 175 | return $parsed; 176 | } 177 | 178 | /** 179 | * Explode Clean-Param rule 180 | * 181 | * @param string $rule 182 | * 183 | * @return array 184 | */ 185 | private function explodeCleanParamRule($rule) { 186 | // strip multi-spaces 187 | $rule = preg_replace('/\s+/S', ' ', $rule); 188 | // split into parameter and path 189 | $array = explode(' ', $rule, 2); 190 | $cleanParam = []; 191 | // strip any invalid characters from path prefix 192 | $cleanParam['path'] = isset($array[1]) ? 
/**
 * Check whether the given URL is allowed for the given user agent.
 *
 * @param string      $url       url to check
 * @param string|null $userAgent which robot to check for; NULL is treated as the wildcard "*"
 *
 * @return bool
 */
public function isAllowed(string $url, ?string $userAgent = '*'): bool {
    $this->buildTree();

    $url = new Url($url);
    !is_null($this->logger) && $url->setLogger($this->logger);

    // checkRules() declares a non-nullable string, so an explicit NULL has to be mapped
    // to the wildcard here instead of being forwarded (which would raise a TypeError)
    return $this->checkRules(Directive::ALLOW, $url->getPath(), $userAgent ?? '*');
}
/**
 * Dispatch a single rule to the checker matching its (optional) inline directive.
 *
 * @param string $rule rule value, possibly prefixed with an inline directive
 * @param string $path path to check the rule against
 *
 * @return bool TRUE when the rule matches
 */
protected function checkRuleSwitch(string $rule, string $path): bool {
    switch (Directive::attemptGetInline($rule)) {
        case Directive::CLEAN_PARAM:
            // return the outcome directly: falling out of the switch would leave the
            // method without a return value, which violates the declared bool type
            return $this->checkCleanParamRule(Directive::stripInline($rule), $path);

        case Directive::HOST:
            return $this->checkHostRule(Directive::stripInline($rule));

        default:
            return $this->checkBasicRule($rule, $path);
    }
}
/**
 * Convert a rule value into the body of a regular expression (without delimiters).
 *
 * Only "*" (any sequence of characters) and a "$" closing the rule (end of path) keep a
 * special meaning; every other character significant to PCRE is escaped so that it is
 * matched literally.
 *
 * @param string $value raw rule value
 *
 * @return string expression anchored with "^" unless it starts with a wildcard
 */
protected function prepareRegexRule(string $value): string {
    // strtr() substitutes in a single pass, so the backslashes inserted here are never escaped twice
    $value = strtr($value, [
        '\\' => '\\\\',
        '^'  => '\^',
        '$'  => '\$',
        '?'  => '\?',
        '.'  => '\.',
        '+'  => '\+',
        '('  => '\(',
        ')'  => '\)',
        '['  => '\[',
        ']'  => '\]',
        '{'  => '\{',
        '}'  => '\}',
        '|'  => '\|',
        '*'  => '.*',
    ]);

    // a "$" closing the rule is an end-of-path anchor, not a literal dollar sign
    if (mb_strlen($value) > 2 && substr($value, -2) === '\$') {
        $value = substr($value, 0, -2) . '$';
    }

    // rules ending with "/", "=" or "?" act as prefixes: accept anything that follows.
    // The last byte is compared strictly, so a "not found" result can no longer be
    // confused with position 0 as happened with the loose mb_strrpos() comparison.
    if (in_array(substr($value, -1), ['/', '=', '?'], true)) {
        $value .= '.*';
    }

    // anchor at the beginning of the path unless the rule starts with a wildcard
    if (substr($value, 0, 2) !== '.*') {
        $value = '^' . $value;
    }

    return $value;
}
/**
 * Get the delay configured for a user agent.
 *
 * The user agent is looked up in the tree by its exact key. When a cache delay is
 * requested but not present, the crawl delay of the same user agent is used instead.
 *
 * @param string $userAgent which robot to get the delay for
 * @param string $type      requested directive; the cache directives select the cache
 *                          delay, anything else selects the crawl delay
 *
 * @return mixed the stored delay value, or 0 when nothing is found
 */
public function getDelay(string $userAgent = "*", string $type = Directive::CRAWL_DELAY) {
    $this->buildTree();

    $isCacheRequest = in_array($type, [Directive::CACHE, Directive::CACHE_DELAY]);
    $directive = $isCacheRequest ? Directive::CACHE_DELAY : Directive::CRAWL_DELAY;

    $agentRules = $this->tree[$userAgent] ?? [];

    // the requested directive is present: use it as is
    if (isset($agentRules[$directive])) {
        return $agentRules[$directive];
    }

    // neither the requested directive nor the crawl delay is available
    if (!isset($agentRules[Directive::CRAWL_DELAY])) {
        $this->log("$directive directive: Not found");

        return 0;
    }

    $this->log("{$directive} directive (unofficial): Not found, fallback to " . Directive::CRAWL_DELAY . " directive");

    return $agentRules[Directive::CRAWL_DELAY];
}
/**
 * Render the parsed rules back into robots.txt text.
 *
 * @param string $eol line separator placed between the rendered lines
 *
 * @return string
 */
public function render($eol = "\r\n") {
    $input = $this->getRules();
    krsort($input);
    $output = [];
    foreach ($input as $userAgent => $rules) {
        $output[] = 'User-agent: ' . $userAgent;
        foreach ($rules as $directive => $value) {
            // Not multibyte
            $directive = ucfirst($directive);
            if (is_array($value)) {
                // Shorter paths later. The callback has to return an integer:
                // a boolean result is deprecated and does not give a reliable order.
                usort($value, function ($a, $b) {
                    return mb_strlen($b) <=> mb_strlen($a);
                });
                foreach ($value as $subValue) {
                    $output[] = $directive . ': ' . $subValue;
                }
            } else {
                $output[] = $directive . ': ' . $value;
            }
        }
        $output[] = '';
    }

    // getHost() without a user agent returns a list of hosts (or NULL), so every
    // entry gets its own line instead of the list being concatenated into a string
    $host = $this->getHost();
    if ($host !== null) {
        foreach ((array) $host as $hostName) {
            $output[] = 'Host: ' . $hostName;
        }
    }

    $sitemaps = $this->getSitemaps();
    foreach ($sitemaps as $sitemap) {
        $output[] = 'Sitemap: ' . $sitemap;
    }

    $output[] = '';
    return implode($eol, $output);
}
/**
 * Get sitemap entries, either for one user agent or collected from all of them.
 *
 * @param ?string $userAgent robot to resolve through the user agent matcher;
 *                           NULL collects the sitemaps of every user agent
 *
 * @return array empty when no sitemap is found
 */
public function getSitemaps(?string $userAgent = null): array {
    $this->buildTree();

    if ($userAgent === null) {
        // no user agent requested: merge the sitemaps of every section
        $collected = [];

        foreach ($this->tree as $agentRules) {
            if (!empty($agentRules[Directive::SITEMAP])) {
                $collected = array_merge($collected, $agentRules[Directive::SITEMAP]);
            }
        }

        return $collected;
    }

    $matched = $this->userAgentMatcher->getMatching($userAgent, array_keys($this->tree));

    return !empty($this->tree[$matched][Directive::SITEMAP])
        ? $this->tree[$matched][Directive::SITEMAP]
        : [];
}
40 | */ 41 | public function onCreate(); 42 | 43 | /** 44 | * Called when closing the filter. 45 | */ 46 | public function onClose(); 47 | } 48 | -------------------------------------------------------------------------------- /source/Stream/Filters/SkipCommentedLinesFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace('/^#.*/mui', '', $bucket->data, -1, $replacedCount); 18 | $consumed += $bucket->datalen; 19 | stream_bucket_append($out, $bucket); 20 | 21 | if ($replacedCount > 0 22 | && isset($this->params['logger']) 23 | && $this->params['logger'] instanceof LoggerInterface 24 | ) { 25 | $this->params['logger']->debug($replacedCount . ' lines skipped as commented out'); 26 | } 27 | } 28 | 29 | return PSFS_PASS_ON; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /source/Stream/Filters/SkipDirectivesWithInvalidValuesFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace(Directive::getRequestRateRegex(), '', $bucket->data, -1, $skippedRequestRateValues); 25 | $bucket->data = preg_replace(Directive::getCrawlDelayRegex(), '', $bucket->data, -1, $skippedCrawlDelayValues); 26 | // $bucket->data = preg_replace(Directive::getAllowDisallowRegex(), '', $bucket->data, -1, $skippedAllowanceValues); 27 | 28 | $consumed += $bucket->datalen; 29 | stream_bucket_append($out, $bucket); 30 | 31 | if (isset($this->params['logger']) && $this->params['logger'] instanceof LoggerInterface) { 32 | if ($skippedRequestRateValues > 0) { 33 | $this->params['logger']->debug($skippedRequestRateValues . ' char(s) dropped as invalid Request-rate value.'); 34 | } 35 | if ($skippedCrawlDelayValues > 0) { 36 | $this->params['logger']->debug($skippedCrawlDelayValues . ' char(s) dropped as invalid Crawl-delay value.'); 37 | } 38 | if ($skippedAllowanceValues > 0) { 39 | $this->params['logger']->debug($skippedAllowanceValues . 
' char(s) dropped as invalid allow/disallow value.'); 40 | } 41 | } 42 | } 43 | 44 | return PSFS_PASS_ON; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /source/Stream/Filters/SkipEmptyLinesFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace( 18 | '/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/mui', 19 | PHP_EOL, 20 | $bucket->data, -1, 21 | $replacedCount 22 | ); 23 | 24 | $consumed += $bucket->datalen; 25 | stream_bucket_append($out, $bucket); 26 | 27 | if ($replacedCount > 0 28 | && isset($this->params['logger']) 29 | && $this->params['logger'] instanceof LoggerInterface 30 | ) { 31 | $this->params['logger']->debug($replacedCount . ' lines skipped as empty.'); 32 | } 33 | } 34 | 35 | return PSFS_PASS_ON; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /source/Stream/Filters/SkipEndOfCommentedLineFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace('/\s*#.*/mui', '', $bucket->data, -1, $replacedCount); 18 | $consumed += $bucket->datalen; 19 | stream_bucket_append($out, $bucket); 20 | 21 | if ($replacedCount > 0 22 | && isset($this->params['logger']) 23 | && $this->params['logger'] instanceof LoggerInterface 24 | ) { 25 | $this->params['logger']->debug($replacedCount . 
' char(s) dropped as commented out'); 26 | } 27 | } 28 | 29 | return PSFS_PASS_ON; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /source/Stream/Filters/SkipUnsupportedDirectivesFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace(Directive::getRegex(), '', $bucket->data, -1, $replacedCount); 19 | $consumed += $bucket->datalen; 20 | stream_bucket_append($out, $bucket); 21 | 22 | if ($replacedCount > 0 23 | && isset($this->params['logger']) 24 | && $this->params['logger'] instanceof LoggerInterface 25 | ) { 26 | $this->params['logger']->debug($replacedCount . ' lines skipped as un-supported'); 27 | } 28 | } 29 | 30 | return PSFS_PASS_ON; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /source/Stream/Filters/TrimSpacesLeftFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace('/(^\s+)(?!\n$)/mui', '', $bucket->data); 16 | $consumed += $bucket->datalen; 17 | stream_bucket_append($out, $bucket); 18 | } 19 | 20 | return PSFS_PASS_ON; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /source/Stream/GeneratorBasedReader.php: -------------------------------------------------------------------------------- 1 | filters = [ 30 | SkipCommentedLinesFilter::class => false, 31 | SkipEndOfCommentedLineFilter::class => false, 32 | TrimSpacesLeftFilter::class => false, 33 | SkipUnsupportedDirectivesFilter::class => false, 34 | SkipDirectivesWithInvalidValuesFilter::class => false, 35 | SkipEmptyLinesFilter::class => false, 36 | ]; 37 | } 38 | 39 | /** 40 | * @link https://www.php.net/manual/en/function.stream-filter-append.php#84637 41 | */ 42 | public function __destruct() { 43 | foreach ($this->filters as $class => $instance) { 44 | try { 45 | if (is_resource($instance)) { 46 | 
/**
 * Create a reader from a string by copying it into a temporary file stream.
 *
 * @param string $input
 *
 * @return static
 *
 * @throws \RuntimeException when the temporary stream cannot be created or written
 */
public static function fromString(string $input = ''): self {
    $reader = new GeneratorBasedReader();
    $stream = tmpfile();

    // tmpfile() returns FALSE on failure; passing that on to fwrite() would only fail later
    if ($stream === false) {
        throw new \RuntimeException('Failed to create a temporary stream for the given content.');
    }

    if (fwrite($stream, $input) === false) {
        fclose($stream);
        throw new \RuntimeException('Failed to write the given content to a temporary stream.');
    }

    fseek($stream, 0);

    $reader->log(WarmingMessages::STRING_INIT_DEPRECATE);

    return $reader->setStream($stream);
}
/**
 * Iterate over the stream content line by line, starting from the beginning.
 *
 * @return \Generator yields every line read from the stream
 */
public function getContentIterated(): \Generator {
    $handle = $this->stream;
    rewind($handle);

    while (true) {
        if (feof($handle)) {
            return;
        }

        $row = fgets($handle);

        // a failed read yields nothing; the end-of-stream check decides when to stop
        if ($row === false) {
            continue;
        }

        yield $row;
    }
}
/**
 * Rules for the "botY-test" user agent: of the checked paths only "/forum/" is allowed,
 * and isAllowed()/isDisallowed() must give opposite answers for every path.
 */
public function testForBotY() {
    $this->assertTrue($this->parser->isDisallowed('/', 'botY-test'));
    $this->assertTrue($this->parser->isDisallowed('/forum', 'botY-test'));
    $this->assertTrue($this->parser->isAllowed('/forum/', 'botY-test'));
    $this->assertTrue($this->parser->isDisallowed('/forum/topic', 'botY-test'));
    $this->assertTrue($this->parser->isDisallowed('/public', 'botY-test'));
    $this->assertFalse($this->parser->isAllowed('/', 'botY-test'));
    $this->assertFalse($this->parser->isAllowed('/forum', 'botY-test'));
    $this->assertFalse($this->parser->isDisallowed('/forum/', 'botY-test'));
    $this->assertFalse($this->parser->isAllowed('/forum/topic', 'botY-test'));
    $this->assertFalse($this->parser->isAllowed('/public', 'botY-test'));
}
/**
 * Data provider for testForSpiderX().
 *
 * @return array rows of [url, isAllowed] matching the test method's parameters
 */
public function generateDataForSpiderX(): array {
    return [
        ['/temp', true],
        ['/assets', false],
        ['/forum', true],
    ];
}
/**
 * The cache delay is returned for user agents when requested
 * through the Directive::CACHE_DELAY type.
 */
public function testCacheDelayForExistingUserAgents() {
    $this->assertEquals(0.5, $this->parser->getDelay('*', Directive::CACHE_DELAY));
    $this->assertEquals(3.7, $this->parser->getDelay('GoogleBot', Directive::CACHE_DELAY));
    $this->assertEquals(8, $this->parser->getDelay('AhrefsBot', Directive::CACHE_DELAY));
}
/**
 * getCleanParam() returns a map keyed by path pattern, each entry holding
 * the list of parameter names declared for that pattern.
 */
public function testCleanParam() {
    $this->assertArrayHasKey('/forum/showthread.php', $this->parser->getCleanParam());
    $this->assertEquals(['abc'], $this->parser->getCleanParam()['/forum/showthread.php']);

    $this->assertArrayHasKey('/forum/*.php', $this->parser->getCleanParam());
    $this->assertEquals(['sid', 'sort'], $this->parser->getCleanParam()['/forum/*.php']);

    $this->assertArrayHasKey('/*', $this->parser->getCleanParam());
    $this->assertEquals(['someTrash', 'otherTrash'], $this->parser->getCleanParam()['/*']);
}
/**
 * Requesting the cache delay for a user agent that only has a crawl delay
 * returns the crawl delay and logs the fallback at debug level.
 */
public function testCrawlDelayLogsFallbackToCrawlDelay() {
    $this->assertEquals(0.9, $this->parser->getDelay('GoogleBot', Directive::CACHE_DELAY));

    /** @var TestHandler $handler */
    $handler = $this->parser->getLogger()->getHandlers()[0];

    // the message is the one getDelay() emits when it falls back to the crawl delay
    $this->assertTrue($handler->hasRecord(
        'cache-delay directive (unofficial): Not found, fallback to crawl-delay directive',
        LogLevel::DEBUG
    ));
}
/**
 * getHost() without a user agent returns the hosts of all sections as a list.
 */
public function testGetAllHosts() {
    $allHosts = $this->parser->getHost();
    $this->assertContains('myhost.ru', $allHosts);
    $this->assertContains('www.myhost.ru', $allHosts);
}
'/../Fixtures/with-sitemaps.txt', 'r')); 23 | $this->parser->setLogger($log); 24 | } 25 | 26 | public function tearDown(): void { 27 | $this->parser = null; 28 | } 29 | 30 | public function testRemoveDuplicateSitemaps() { 31 | $allMaps = $this->parser->getSitemaps(); 32 | 33 | $this->assertCount(5, $allMaps); 34 | $this->assertContains('http://example.com/sitemap.xml?year=2015', $allMaps); 35 | $this->assertContains('http://somesite.com/sitemap-for-all.xml', $allMaps); 36 | $this->assertContains('http://internet.com/sitemap-for-google-bot.xml', $allMaps); 37 | $this->assertContains('http://worldwideweb.com/sitemap-yahoo.xml', $allMaps); 38 | $this->assertContains('http://example.com/sitemap-yahoo.xml?year=2016', $allMaps); 39 | } 40 | 41 | public function testGetSitemapForExactUserAgent() { 42 | $yahooMaps = $this->parser->getSitemaps('Yahoo'); 43 | 44 | $this->assertCount(2, $yahooMaps); 45 | $this->assertContains('http://worldwideweb.com/sitemap-yahoo.xml', $yahooMaps); 46 | $this->assertContains('http://example.com/sitemap-yahoo.xml?year=2016', $yahooMaps); 47 | } 48 | 49 | public function testGetSitemapFallsBackToDefault() { 50 | $fallenBack = $this->parser->getSitemaps('Yandex'); 51 | 52 | $this->assertCount(2, $fallenBack); 53 | $this->assertContains('http://somesite.com/sitemap-for-all.xml', $fallenBack); 54 | $this->assertContains('http://example.com/sitemap.xml?year=2015', $fallenBack); 55 | 56 | /** @var TestHandler $handler */ 57 | $handler = $this->parser->getLogger()->getHandlers()[0]; 58 | 59 | $this->assertTrue( 60 | $handler->hasRecord("Failed to match user agent 'Yandex', falling back to '*'", LogLevel::DEBUG), 61 | stringifyLogs($handler->getRecords()) 62 | ); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /test/DisallowAllTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isDisallowed("/index")); 15 | 
$this->assertFalse($parser->isAllowed("/index")); 16 | } 17 | 18 | public function testAllowWildcard() { 19 | $parser = new RobotsTxtParser(file_get_contents(__DIR__ . '/Fixtures/allow-all.txt')); 20 | $this->assertFalse($parser->isDisallowed("/index")); 21 | $this->assertFalse($parser->isDisallowed("/")); 22 | $this->assertTrue($parser->isAllowed("/index")); 23 | $this->assertTrue($parser->isAllowed("/")); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/DisallowUppercasePathTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isDisallowed("/Admin")); 22 | $this->assertFalse($parser->isAllowed("/Admin")); 23 | } 24 | 25 | /** 26 | * Generate test case data 27 | * @return array 28 | */ 29 | public function generateDataForTest(): array { 30 | return [ 31 | [ 32 | " 33 | User-agent: * 34 | Disallow : /Admin 35 | " 36 | ] 37 | ]; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /test/EmptyRulesShouldAllowEverythingTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isAllowed('/foo')); 20 | $this->assertFalse($parser->isDisallowed('/foo')); 21 | $this->assertNull($parser->getHost()); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /test/EncodingTest.php: -------------------------------------------------------------------------------- 1 | logger = new Logger(static::class); 17 | $this->logger->pushHandler(new TestHandler(LogLevel::DEBUG)); 18 | } 19 | 20 | public function testLogsNonStandardEncoding() { 21 | $parser = new RobotsTxtParser(fopen(__DIR__ . 
'/Fixtures/market-yandex-Windows-1251.txt', 'r'), 'Windows-1251'); 22 | $parser->setLogger($this->logger); 23 | $parser->getRules(); 24 | 25 | /** @var TestHandler $handler */ 26 | $handler = $parser->getLogger()->getHandlers()[0]; 27 | 28 | $this->assertTrue( 29 | $handler->hasRecord(WarmingMessages::ENCODING_NOT_UTF8, LogLevel::WARNING), 30 | stringifyLogs($handler->getRecords()) 31 | ); 32 | 33 | $this->assertTrue( 34 | $handler->hasRecord('Adding encoding filter convert.iconv.Windows-1251/utf-8', LogLevel::DEBUG), 35 | stringifyLogs($handler->getRecords()) 36 | ); 37 | } 38 | 39 | public function testWindows1251Readable() { 40 | $parser = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/market-yandex-Windows-1251.txt', 'r'), 'Windows-1251'); 41 | $parser->setLogger($this->logger); 42 | 43 | $allRules = $parser->getRules(); 44 | $this->assertCount(5, $allRules, json_encode(array_keys($allRules))); 45 | } 46 | 47 | public function testShouldNotChangeInternalEncoding() { 48 | $this->assertEquals('UTF-8', mb_internal_encoding()); 49 | $parser = new RobotsTxtParser('', 'iso-8859-1'); 50 | $this->assertEquals('UTF-8', mb_internal_encoding()); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /test/EndAnchorTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isAllowed($path)); 25 | $this->assertFalse($parser->isDisallowed($path)); 26 | } else { 27 | $this->assertTrue($parser->isDisallowed($path)); 28 | $this->assertFalse($parser->isAllowed($path)); 29 | } 30 | } 31 | 32 | /** 33 | * Generate test case data 34 | * @return array 35 | */ 36 | public function generateDataForTest() { 37 | // Data provider defined in format: 38 | // [tested path, robotsTxtContent, true when allowed / false when disallowed] 39 | return [ 40 | [ 41 | "/", 42 | " 43 | User-Agent: * 44 | Disallow: /* 45 | Allow: /$ 46 | ", 47 | true, 48 | ], 49 | [ 50 | "/asd", 51 | " 52 | User-Agent: 
* 53 | Disallow: /* 54 | Allow: /$ 55 | ", 56 | false, 57 | ], 58 | [ 59 | "/asd/", 60 | " 61 | User-Agent: * 62 | Disallow: /* 63 | Allow: /$ 64 | ", 65 | false, 66 | ], 67 | [ 68 | "/deny_all/", 69 | " 70 | User-Agent: * 71 | Disallow: *deny_all/$ 72 | ", 73 | /** 74 | * @see InvalidPathTest for details why this is changed 75 | */ 76 | true, 77 | ], 78 | [ 79 | "/deny_all/", 80 | " 81 | User-Agent: * 82 | Disallow: /deny_all/$ 83 | ", 84 | false, 85 | ], 86 | [ 87 | "/deny_all/", 88 | " 89 | User-Agent: * 90 | Disallow: deny_all/$ 91 | ", 92 | true, 93 | ], 94 | ]; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /test/Fixtures/allow-all.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | -------------------------------------------------------------------------------- /test/Fixtures/allow-spec.txt: -------------------------------------------------------------------------------- 1 | User-agent: anyone 2 | User-agent: * 3 | Disallow: /admin 4 | Disallow: /admin 5 | Disallow: /Admin 6 | Disallow: /temp#comment 7 | Disallow: /forum 8 | Disallow: /admin/cp/test/ 9 | 10 | User-agent: agentU/2.0 11 | Disallow: /bar 12 | Allow: /foo 13 | 14 | User-agent: agentV 15 | User-agent: agentW 16 | Disallow: /foo 17 | Allow: /bar #comment 18 | 19 | User-agent: spiderX 20 | Disallow: 21 | Disallow: /admin# 22 | Disallow: /assets 23 | 24 | User-agent: botY 25 | Disallow: / 26 | Allow: &&/1@| #invalid 27 | Allow: /forum/$ 28 | Allow: /article 29 | 30 | User-agent: crawlerZ 31 | Disallow: 32 | Disallow: / 33 | Allow: /$ 34 | -------------------------------------------------------------------------------- /test/Fixtures/cache-delay-spec.txt: -------------------------------------------------------------------------------- 1 | User-Agent: * 2 | Crawl-Delay: 0.5 3 | 4 | User-Agent: GoogleBot 5 | Cache-Delay: 3.7 6 | 7 | User-Agent: AhrefsBot 8 | Cache-Delay: 8 9 | 10 | 
User-Agent: Yandex 11 | Crawl-Delay: 1.5 12 | -------------------------------------------------------------------------------- /test/Fixtures/crawl-delay-spec.txt: -------------------------------------------------------------------------------- 1 | User-Agent: GoogleBot 2 | Crawl-Delay: 0.9 3 | 4 | User-Agent: AhrefsBot 5 | Crawl-Delay: 1.5 6 | -------------------------------------------------------------------------------- /test/Fixtures/disallow-all.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: / 3 | -------------------------------------------------------------------------------- /test/Fixtures/expected-skipped-lines-log.php: -------------------------------------------------------------------------------- 1 |
432 | #
433 | # Localisable part of robots.txt for en.wikipedia.org
434 | #
435 | # Edit at https://en.wikipedia.org/w/index.php?title=MediaWiki:Robots.txt&action=edit
436 | # Don't add newlines here. All rules set here are active for every user-agent.
437 | #
438 | # Please check any changes using a syntax validator
439 | # Enter https://en.wikipedia.org/robots.txt as the URL to check.
440 | #
441 | # https://phabricator.wikimedia.org/T16075
442 | Disallow: /wiki/MediaWiki:Spam-blacklist
443 | Disallow: /wiki/MediaWiki%3ASpam-blacklist
444 | Disallow: /wiki/MediaWiki_talk:Spam-blacklist
445 | Disallow: /wiki/MediaWiki_talk%3ASpam-blacklist
446 | Disallow: /wiki/Wikipedia:WikiProject_Spam
447 | Disallow: /wiki/Wikipedia_talk:WikiProject_Spam
448 | #
449 | # Folks get annoyed when XfD discussions end up the number 1 google hit for
450 | # their name.
451 | # https://phabricator.wikimedia.org/T16075
452 | Disallow: /wiki/Wikipedia:Articles_for_deletion
453 | Disallow: /wiki/Wikipedia%3AArticles_for_deletion
454 | Disallow: /wiki/Wikipedia:Votes_for_deletion
455 | Disallow: /wiki/Wikipedia%3AVotes_for_deletion
456 | Disallow: /wiki/Wikipedia:Pages_for_deletion
457 | Disallow: /wiki/Wikipedia%3APages_for_deletion
458 | Disallow: /wiki/Wikipedia:Miscellany_for_deletion
459 | Disallow: /wiki/Wikipedia%3AMiscellany_for_deletion
460 | Disallow: /wiki/Wikipedia:Miscellaneous_deletion
461 | Disallow: /wiki/Wikipedia%3AMiscellaneous_deletion
462 | Disallow: /wiki/Wikipedia:Categories_for_discussion
463 | Disallow: /wiki/Wikipedia%3ACategories_for_discussion
464 | Disallow: /wiki/Wikipedia:Templates_for_deletion
465 | Disallow: /wiki/Wikipedia%3ATemplates_for_deletion
466 | Disallow: /wiki/Wikipedia:Redirects_for_discussion
467 | Disallow: /wiki/Wikipedia%3ARedirects_for_discussion
468 | Disallow: /wiki/Wikipedia:Deletion_review
469 | Disallow: /wiki/Wikipedia%3ADeletion_review
470 | Disallow: /wiki/Wikipedia:WikiProject_Deletion_sorting
471 | Disallow: /wiki/Wikipedia%3AWikiProject_Deletion_sorting
472 | Disallow: /wiki/Wikipedia:Files_for_deletion
473 | Disallow: /wiki/Wikipedia%3AFiles_for_deletion
474 | Disallow: /wiki/Wikipedia:Files_for_discussion
475 | Disallow: /wiki/Wikipedia%3AFiles_for_discussion
476 | Disallow: /wiki/Wikipedia:Possibly_unfree_files
477 | Disallow: /wiki/Wikipedia%3APossibly_unfree_files
478 | #
479 | # https://phabricator.wikimedia.org/T12288
480 | Disallow: /wiki/Wikipedia_talk:Articles_for_deletion
481 | Disallow: /wiki/Wikipedia_talk%3AArticles_for_deletion
482 | Disallow: /wiki/Wikipedia_talk:Votes_for_deletion
483 | Disallow: /wiki/Wikipedia_talk%3AVotes_for_deletion
484 | Disallow: /wiki/Wikipedia_talk:Pages_for_deletion
485 | Disallow: /wiki/Wikipedia_talk%3APages_for_deletion
486 | Disallow: /wiki/Wikipedia_talk:Miscellany_for_deletion
487 | Disallow: /wiki/Wikipedia_talk%3AMiscellany_for_deletion
488 | Disallow: /wiki/Wikipedia_talk:Miscellaneous_deletion
489 | Disallow: /wiki/Wikipedia_talk%3AMiscellaneous_deletion
490 | Disallow: /wiki/Wikipedia_talk:Templates_for_deletion
491 | Disallow: /wiki/Wikipedia_talk%3ATemplates_for_deletion
492 | Disallow: /wiki/Wikipedia_talk:Categories_for_discussion
493 | Disallow: /wiki/Wikipedia_talk%3ACategories_for_discussion
494 | Disallow: /wiki/Wikipedia_talk:Deletion_review
495 | Disallow: /wiki/Wikipedia_talk%3ADeletion_review
496 | Disallow: /wiki/Wikipedia_talk:WikiProject_Deletion_sorting
497 | Disallow: /wiki/Wikipedia_talk%3AWikiProject_Deletion_sorting
498 | Disallow: /wiki/Wikipedia_talk:Files_for_deletion
499 | Disallow: /wiki/Wikipedia_talk%3AFiles_for_deletion
500 | Disallow: /wiki/Wikipedia_talk:Files_for_discussion
501 | Disallow: /wiki/Wikipedia_talk%3AFiles_for_discussion
502 | Disallow: /wiki/Wikipedia_talk:Possibly_unfree_files
503 | Disallow: /wiki/Wikipedia_talk%3APossibly_unfree_files
504 | #
505 | Disallow: /wiki/Wikipedia:Copyright_problems
506 | Disallow: /wiki/Wikipedia%3ACopyright_problems
507 | Disallow: /wiki/Wikipedia_talk:Copyright_problems
508 | Disallow: /wiki/Wikipedia_talk%3ACopyright_problems
509 | Disallow: /wiki/Wikipedia:Suspected_copyright_violations
510 | Disallow: /wiki/Wikipedia%3ASuspected_copyright_violations
511 | Disallow: /wiki/Wikipedia_talk:Suspected_copyright_violations
512 | Disallow: /wiki/Wikipedia_talk%3ASuspected_copyright_violations
513 | Disallow: /wiki/Wikipedia:Contributor_copyright_investigations
514 | Disallow: /wiki/Wikipedia%3AContributor_copyright_investigations
515 | Disallow: /wiki/Wikipedia:Contributor_copyright_investigations
516 | Disallow: /wiki/Wikipedia%3AContributor_copyright_investigations
517 | Disallow: /wiki/Wikipedia_talk:Contributor_copyright_investigations
518 | Disallow: /wiki/Wikipedia_talk%3AContributor_copyright_investigations
519 | Disallow: /wiki/Wikipedia_talk:Contributor_copyright_investigations
520 | Disallow: /wiki/Wikipedia_talk%3AContributor_copyright_investigations
521 | Disallow: /wiki/Wikipedia:Protected_titles
522 | Disallow: /wiki/Wikipedia%3AProtected_titles
523 | Disallow: /wiki/Wikipedia_talk:Protected_titles
524 | Disallow: /wiki/Wikipedia_talk%3AProtected_titles
525 | Disallow: /wiki/Wikipedia:Articles_for_creation
526 | Disallow: /wiki/Wikipedia%3AArticles_for_creation
527 | Disallow: /wiki/Wikipedia_talk:Articles_for_creation
528 | Disallow: /wiki/Wikipedia_talk%3AArticles_for_creation
529 | Disallow: /wiki/Wikipedia_talk:Article_wizard
530 | Disallow: /wiki/Wikipedia_talk%3AArticle_wizard
531 | #
532 | # https://phabricator.wikimedia.org/T13261
533 | Disallow: /wiki/Wikipedia:Requests_for_arbitration
534 | Disallow: /wiki/Wikipedia%3ARequests_for_arbitration
535 | Disallow: /wiki/Wikipedia_talk:Requests_for_arbitration
536 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_arbitration
537 | Disallow: /wiki/Wikipedia:Requests_for_comment
538 | Disallow: /wiki/Wikipedia%3ARequests_for_comment
539 | Disallow: /wiki/Wikipedia_talk:Requests_for_comment
540 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_comment
541 | Disallow: /wiki/Wikipedia:Requests_for_adminship
542 | Disallow: /wiki/Wikipedia%3ARequests_for_adminship
543 | Disallow: /wiki/Wikipedia_talk:Requests_for_adminship
544 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_adminship
545 | #
546 | # https://phabricator.wikimedia.org/T14111
547 | Disallow: /wiki/Wikipedia:Requests_for_checkuser
548 | Disallow: /wiki/Wikipedia%3ARequests_for_checkuser
549 | Disallow: /wiki/Wikipedia_talk:Requests_for_checkuser
550 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_checkuser
551 | #
552 | # https://phabricator.wikimedia.org/T15398
553 | Disallow: /wiki/Wikipedia:WikiProject_Spam
554 | Disallow: /wiki/Wikipedia%3AWikiProject_Spam
555 | #
556 | # https://phabricator.wikimedia.org/T16793
557 | Disallow: /wiki/Wikipedia:Changing_username
558 | Disallow: /wiki/Wikipedia%3AChanging_username
559 | Disallow: /wiki/Wikipedia:Changing_username
560 | Disallow: /wiki/Wikipedia%3AChanging_username
561 | Disallow: /wiki/Wikipedia_talk:Changing_username
562 | Disallow: /wiki/Wikipedia_talk%3AChanging_username
563 | Disallow: /wiki/Wikipedia_talk:Changing_username
564 | Disallow: /wiki/Wikipedia_talk%3AChanging_username
565 | #
566 | Disallow: /wiki/Wikipedia:Administrators%27_noticeboard
567 | Disallow: /wiki/Wikipedia%3AAdministrators%27_noticeboard
568 | Disallow: /wiki/Wikipedia_talk:Administrators%27_noticeboard
569 | Disallow: /wiki/Wikipedia_talk%3AAdministrators%27_noticeboard
570 | Disallow: /wiki/Wikipedia:Community_sanction_noticeboard
571 | Disallow: /wiki/Wikipedia%3ACommunity_sanction_noticeboard
572 | Disallow: /wiki/Wikipedia_talk:Community_sanction_noticeboard
573 | Disallow: /wiki/Wikipedia_talk%3ACommunity_sanction_noticeboard
574 | Disallow: /wiki/Wikipedia:Bureaucrats%27_noticeboard
575 | Disallow: /wiki/Wikipedia%3ABureaucrats%27_noticeboard
576 | Disallow: /wiki/Wikipedia_talk:Bureaucrats%27_noticeboard
577 | Disallow: /wiki/Wikipedia_talk%3ABureaucrats%27_noticeboard
578 | #
579 | Disallow: /wiki/Wikipedia:Sockpuppet_investigations
580 | Disallow: /wiki/Wikipedia%3ASockpuppet_investigations
581 | Disallow: /wiki/Wikipedia_talk:Sockpuppet_investigations
582 | Disallow: /wiki/Wikipedia_talk%3ASockpuppet_investigations
583 | #
584 | Disallow: /wiki/Wikipedia:Neutral_point_of_view/Noticeboard
585 | Disallow: /wiki/Wikipedia%3ANeutral_point_of_view/Noticeboard
586 | Disallow: /wiki/Wikipedia_talk:Neutral_point_of_view/Noticeboard
587 | Disallow: /wiki/Wikipedia_talk%3ANeutral_point_of_view/Noticeboard
588 | #
589 | Disallow: /wiki/Wikipedia:No_original_research/noticeboard
590 | Disallow: /wiki/Wikipedia%3ANo_original_research/noticeboard
591 | Disallow: /wiki/Wikipedia_talk:No_original_research/noticeboard
592 | Disallow: /wiki/Wikipedia_talk%3ANo_original_research/noticeboard
593 | #
594 | Disallow: /wiki/Wikipedia:Fringe_theories/Noticeboard
595 | Disallow: /wiki/Wikipedia%3AFringe_theories/Noticeboard
596 | Disallow: /wiki/Wikipedia_talk:Fringe_theories/Noticeboard
597 | Disallow: /wiki/Wikipedia_talk%3AFringe_theories/Noticeboard
598 | #
599 | Disallow: /wiki/Wikipedia:Conflict_of_interest/Noticeboard
600 | Disallow: /wiki/Wikipedia%3AConflict_of_interest/Noticeboard
601 | Disallow: /wiki/Wikipedia_talk:Conflict_of_interest/Noticeboard
602 | Disallow: /wiki/Wikipedia_talk%3AConflict_of_interest/Noticeboard
603 | #
604 | Disallow: /wiki/Wikipedia:Long-term_abuse
605 | Disallow: /wiki/Wikipedia%3ALong-term_abuse
606 | Disallow: /wiki/Wikipedia_talk:Long-term_abuse
607 | Disallow: /wiki/Wikipedia_talk%3ALong-term_abuse
608 | Disallow: /wiki/Wikipedia:Long_term_abuse
609 | Disallow: /wiki/Wikipedia%3ALong_term_abuse
610 | Disallow: /wiki/Wikipedia_talk:Long_term_abuse
611 | Disallow: /wiki/Wikipedia_talk%3ALong_term_abuse
612 | #
613 | Disallow: /wiki/Wikipedia:Wikiquette_assistance
614 | Disallow: /wiki/Wikipedia%3AWikiquette_assistance
615 | #
616 | Disallow: /wiki/Wikipedia:Abuse_reports
617 | Disallow: /wiki/Wikipedia%3AAbuse_reports
618 | Disallow: /wiki/Wikipedia_talk:Abuse_reports
619 | Disallow: /wiki/Wikipedia_talk%3AAbuse_reports
620 | Disallow: /wiki/Wikipedia:Abuse_response
621 | Disallow: /wiki/Wikipedia%3AAbuse_response
622 | Disallow: /wiki/Wikipedia_talk:Abuse_response
623 | Disallow: /wiki/Wikipedia_talk%3AAbuse_response
624 | #
625 | Disallow: /wiki/Wikipedia:Reliable_sources/Noticeboard
626 | Disallow: /wiki/Wikipedia%3AReliable_sources/Noticeboard
627 | Disallow: /wiki/Wikipedia_talk:Reliable_sources/Noticeboard
628 | Disallow: /wiki/Wikipedia_talk%3AReliable_sources/Noticeboard
629 | #
630 | Disallow: /wiki/Wikipedia:Suspected_sock_puppets
631 | Disallow: /wiki/Wikipedia%3ASuspected_sock_puppets
632 | Disallow: /wiki/Wikipedia_talk:Suspected_sock_puppets
633 | Disallow: /wiki/Wikipedia_talk%3ASuspected_sock_puppets
634 | #
635 | Disallow: /wiki/Wikipedia:Biographies_of_living_persons/Noticeboard
636 | Disallow: /wiki/Wikipedia%3ABiographies_of_living_persons/Noticeboard
637 | Disallow: /wiki/Wikipedia_talk:Biographies_of_living_persons/Noticeboard
638 | Disallow: /wiki/Wikipedia_talk%3ABiographies_of_living_persons/Noticeboard
639 | Disallow: /wiki/Wikipedia:Biographies_of_living_persons%2FNoticeboard
640 | Disallow: /wiki/Wikipedia%3ABiographies_of_living_persons%2FNoticeboard
641 | Disallow: /wiki/Wikipedia_talk:Biographies_of_living_persons%2FNoticeboard
642 | Disallow: /wiki/Wikipedia_talk%3ABiographies_of_living_persons%2FNoticeboard
643 | #
644 | Disallow: /wiki/Wikipedia:Content_noticeboard
645 | Disallow: /wiki/Wikipedia%3AContent_noticeboard
646 | Disallow: /wiki/Wikipedia_talk:Content_noticeboard
647 | Disallow: /wiki/Wikipedia_talk%3AContent_noticeboard
648 | #
649 | Disallow: /wiki/Template:Editnotices
650 | Disallow: /wiki/Template%3AEditnotices
651 | #
652 | Disallow: /wiki/Wikipedia:Arbitration
653 | Disallow: /wiki/Wikipedia%3AArbitration
654 | Disallow: /wiki/Wikipedia_talk:Arbitration
655 | Disallow: /wiki/Wikipedia_talk%3AArbitration
656 | #
657 | Disallow: /wiki/Wikipedia:Arbitration_Committee
658 | Disallow: /wiki/Wikipedia%3AArbitration_Committee
659 | Disallow: /wiki/Wikipedia_talk:Arbitration_Committee
660 | Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee
661 | #
662 | Disallow: /wiki/Wikipedia:Arbitration_Committee_Elections
663 | Disallow: /wiki/Wikipedia%3AArbitration_Committee_Elections
664 | Disallow: /wiki/Wikipedia_talk:Arbitration_Committee_Elections
665 | Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee_Elections
666 | #
667 | Disallow: /wiki/Wikipedia:Mediation_Committee
668 | Disallow: /wiki/Wikipedia%3AMediation_Committee
669 | Disallow: /wiki/Wikipedia_talk:Mediation_Committee
670 | Disallow: /wiki/Wikipedia_talk%3AMediation_Committee
671 | #
672 | Disallow: /wiki/Wikipedia:Mediation_Cabal/Cases
673 | Disallow: /wiki/Wikipedia%3AMediation_Cabal/Cases
674 | #
675 | Disallow: /wiki/Wikipedia:Requests_for_bureaucratship
676 | Disallow: /wiki/Wikipedia%3ARequests_for_bureaucratship
677 | Disallow: /wiki/Wikipedia_talk:Requests_for_bureaucratship
678 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_bureaucratship
679 | #
680 | Disallow: /wiki/Wikipedia:Administrator_review
681 | Disallow: /wiki/Wikipedia%3AAdministrator_review
682 | Disallow: /wiki/Wikipedia_talk:Administrator_review
683 | Disallow: /wiki/Wikipedia_talk%3AAdministrator_review
684 | #
685 | Disallow: /wiki/Wikipedia:Editor_review
686 | Disallow: /wiki/Wikipedia%3AEditor_review
687 | Disallow: /wiki/Wikipedia_talk:Editor_review
688 | Disallow: /wiki/Wikipedia_talk%3AEditor_review
689 | #
690 | Disallow: /wiki/Wikipedia:Article_Incubator
691 | Disallow: /wiki/Wikipedia%3AArticle_Incubator
692 | Disallow: /wiki/Wikipedia_talk:Article_Incubator
693 | Disallow: /wiki/Wikipedia_talk%3AArticle_Incubator
694 | #
695 | Disallow: /wiki/Category:Noindexed_pages
696 | Disallow: /wiki/Category%3ANoindexed_pages
697 | #
698 | # User sandboxes for modules and Template Styles are placed in these subpages for testing
699 | #
700 | Disallow: /wiki/Module:Sandbox
701 | Disallow: /wiki/Module%3ASandbox
702 | Disallow: /wiki/Template:TemplateStyles_sandbox
703 | Disallow: /wiki/Template%3ATemplateStyles_sandbox
704 | #
705 | # 
706 | -------------------------------------------------------------------------------- /test/Fixtures/with-clean-param.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: / 3 | Clean-param: s&ref /forum*/sh*wthread.php 4 | Clean-param: abc /forum/showthread.php 5 | Clean-param: sid&sort /forum/*.php 6 | Clean-param: someTrash&otherTrash 7 | -------------------------------------------------------------------------------- /test/Fixtures/with-commented-line-endings.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disal # low: /tech 3 | 4 | User-agent: google-bot 5 | Disallow: #/tech 6 | 7 | User-agent: yahoo-bot 8 | Disallow: /tech # ds 9 | 10 | User-agent: yandex-bot 11 | Disallow#: /tech # ds 12 | 13 | User-agent: * 14 | Disallow: /comment-after #comment 15 | -------------------------------------------------------------------------------- /test/Fixtures/with-commented-lines.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | # Disallow: /tech 3 | # this is a commented line 4 | # it should not be in the iterator 5 | Allow: /some 6 | -------------------------------------------------------------------------------- /test/Fixtures/with-empty-and-whitespace.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | User-Agent: GoogleBot 4 | Crawl-Delay: 0.9 5 | User-Agent: AhrefsBot 6 | Crawl-Delay: 1.5 7 | -------------------------------------------------------------------------------- /test/Fixtures/with-empty-lines.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | # Disallow: /tech 3 | 4 | 5 | 6 | # this is a commented line 7 | 8 | 9 | 10 | # it should not be in the iterator 11 | 12 | 13 | 14 | Allow: /some 15 | -------------------------------------------------------------------------------- 
/test/Fixtures/with-empty-rules.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | #Disallow: /tech 3 | -------------------------------------------------------------------------------- /test/Fixtures/with-faulty-directives.txt: -------------------------------------------------------------------------------- 1 | User-agent: google1 #specifies the robots that the directives are set for 2 | Disallow: /bin/ # prohibits links from the Shopping Cart. 3 | Disallow: /search/ # prohibits page links of the search embedded on the site 4 | Disallow: /admin/ # prohibits links from the admin panel 5 | Disallow /admin/ # prohibits links from the admin panel 6 | Sitemap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot 7 | Clean-param: ref /some_dir/get_book.pl 8 | 9 | user-agent: google2 #specifies the robots that the directives are set for 10 | disallow: /bin/ # prohibits links from the Shopping Cart. 11 | disallow: /search/ # prohibits page links of the search embedded on the site 12 | disallow: /admin/ # prohibits links from the admin panel 13 | sitemap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot 14 | clean-param: ref /some_dir/get_book.pl 15 | 16 | user-Agent: google3 #specifies the robots that the directives are set for 17 | disaLLow: /bin/ # prohibits links from the Shopping Cart. 18 | diSallow: /search/ # prohibits page links of the search embedded on the site 19 | dis@llow: /admin/ # prohibits links from the admin panel 20 | sitEmap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot 21 | cleanParam: ref /some_dir/get_book.pl 22 | 23 | User#agent: google4 #specifies the robots that the directives are set for 24 | Disa#low: /bin/ # prohibits links from the Shopping Cart. 
25 | Disa#low: /search/ # prohibits page links of the search embedded on the site 26 | Disa#low: /admin/ # prohibits links from the admin panel 27 | Site#ap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot 28 | Clean#param: ref /some_dir/get_book.pl 29 | -------------------------------------------------------------------------------- /test/Fixtures/with-hosts.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /cgi-bin 3 | Disallow: / 4 | Host: myhost.ru 5 | 6 | User-agent: Yandex 7 | Disallow: /cgi-bin 8 | 9 | # Examples of Host directives that will be ignored 10 | Host: www.myhost-.com 11 | Host: www.-myhost.com 12 | Host: www.myhost.com:100000 13 | Host: www.my_host.com 14 | Host: .my-host.com:8000 15 | Host: my-host.com.Host: my..host.com 16 | Host: www.myhost.com:8080/ 17 | Host: 213.180.194.129 18 | Host: [2001:db8::1] 19 | Host: FE80::0202:B3FF:FE1E:8329 20 | Host: https://[2001:db8:0:1]:80 21 | Host: www.firsthost.ru,www.secondhost.com 22 | Host: www.firsthost.ru www.secondhost.com 23 | 24 | # Examples of valid Host directives 25 | Host: myhost.ru # uses this one 26 | Host: www.myhost.ru # is not used 27 | -------------------------------------------------------------------------------- /test/Fixtures/with-invalid-request-rate.txt: -------------------------------------------------------------------------------- 1 | Useragent: GoogleBot 2 | Crawl-delay: 0.3 # valid 3 | Crawl-delay: 0.599 # valid 4 | Crawl-delay: 8888 # valid 5 | Crawl-delay: 8888 6 | Crawl-delay: ngfsngdndag 7 | Crawl-delay: ngfsn.gdndag # invalid 8 | Crawl-delay: 0.vfsbfsb # invalid 9 | Request-rate: 100/854000 # valid 10 | Request-rate: 100/bgdndgnd # invalid 11 | Request-rate: 15686 # invalid 12 | Request-rate: ngdndganda # invalid 13 | -------------------------------------------------------------------------------- /test/Fixtures/with-sitemaps.txt: 
-------------------------------------------------------------------------------- 1 | Sitemap: http://example.com/sitemap.xml?year=2015 2 | Sitemap: http://example.com/sitemap.xml?year=2015 3 | Sitemap: http://example.com/sitemap.xml?year=2015 4 | 5 | User-agent: * 6 | Disallow: /admin/ 7 | Sitemap: http://somesite.com/sitemap-for-all.xml 8 | 9 | User-agent: Googlebot 10 | Sitemap: http://internet.com/sitemap-for-google-bot.xml 11 | 12 | User-agent: Yahoo 13 | Sitemap: http://worldwideweb.com/sitemap-yahoo.xml 14 | Sitemap: http://example.com/sitemap-yahoo.xml?year=2016 15 | -------------------------------------------------------------------------------- /test/HttpStatusCodeTest.php: -------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 16 | 17 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/allow-all.txt', 'r')); 18 | $this->parser->setLogger($log); 19 | } 20 | 21 | public function tearDown(): void { 22 | $this->parser = null; 23 | } 24 | 25 | public function testHttpStatusCodeValid() { 26 | $this->parser->setHttpStatusCode(200); 27 | $this->assertTrue($this->parser->isAllowed("/")); 28 | $this->assertFalse($this->parser->isDisallowed("/")); 29 | 30 | /** @var TestHandler $handler */ 31 | $handler = $this->parser->getLogger()->getHandlers()[0]; 32 | 33 | $this->assertTrue( 34 | $handler->hasRecord("Rule match: Path", LogLevel::DEBUG), 35 | stringifyLogs($handler->getRecords()) 36 | ); 37 | } 38 | 39 | public function testHttpStatusCodeInvalid() { 40 | $this->parser->setHttpStatusCode(503); 41 | $this->assertTrue($this->parser->isDisallowed("/")); 42 | $this->assertFalse($this->parser->isAllowed("/")); 43 | 44 | /** @var TestHandler $handler */ 45 | $handler = $this->parser->getLogger()->getHandlers()[0]; 46 | 47 | $this->assertTrue( 48 | $handler->hasRecord("Disallowed by HTTP status code 503", LogLevel::DEBUG), 49 | stringifyLogs($handler->getRecords()) 50 | ); 51 | } 52 
| } 53 | -------------------------------------------------------------------------------- /test/InvalidPathTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isAllowed('*wildcard')); 17 | $this->assertFalse($parser->isDisallowed("&&1@|")); 18 | $this->assertTrue($parser->isAllowed('+£€@@1¤')); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /test/Parser/DirectivesProcessors/CleanParamProcessorTest.php: -------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 22 | 23 | $this->processor = new CleanParamProcessor($log); 24 | } 25 | 26 | public function tearDown(): void { 27 | $this->processor = null; 28 | } 29 | 30 | public function testProcessesCorrectlyWithPath() { 31 | $tree = []; 32 | $line = 'Clean-param: some&someMore /only/here'; 33 | 34 | $this->processor->process($line, $tree); 35 | 36 | $this->assertArrayHasKey(Directive::CLEAN_PARAM, $tree); 37 | $this->assertArrayHasKey('/only/here', $tree[Directive::CLEAN_PARAM], json_encode($tree[Directive::CLEAN_PARAM])); 38 | $this->assertContains('some', $tree[Directive::CLEAN_PARAM]['/only/here'], json_encode($tree[Directive::CLEAN_PARAM])); 39 | $this->assertContains('someMore', $tree[Directive::CLEAN_PARAM]['/only/here'], json_encode($tree[Directive::CLEAN_PARAM])); 40 | } 41 | 42 | public function testProcessesCorrectlyWithNoPath() { 43 | $tree = []; 44 | $line = 'Clean-param: some&someMore'; 45 | 46 | $this->processor->process($line, $tree); 47 | 48 | $this->assertArrayHasKey(Directive::CLEAN_PARAM, $tree); 49 | $this->assertArrayHasKey('/*', $tree[Directive::CLEAN_PARAM], json_encode($tree[Directive::CLEAN_PARAM])); 50 | $this->assertContains('some', $tree[Directive::CLEAN_PARAM]['/*'], json_encode($tree[Directive::CLEAN_PARAM])); 51 | $this->assertContains('someMore', $tree[Directive::CLEAN_PARAM]['/*'], 
json_encode($tree[Directive::CLEAN_PARAM])); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /test/Parser/DirectivesProcessors/CrawlDelayProcessorTest.php: -------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 22 | 23 | $this->processor = new CrawlDelayProcessor($log); 24 | } 25 | 26 | public function tearDown(): void { 27 | $this->processor = null; 28 | } 29 | 30 | public function testSavesValidCrawlDelayInteger() { 31 | $tree = []; 32 | $line = 'Crawl-delay: 25'; 33 | 34 | $this->processor->process($line, $tree); 35 | 36 | $this->assertArrayHasKey('*', $tree); 37 | $this->assertArrayHasKey(Directive::CRAWL_DELAY, $tree['*']); 38 | $this->assertEquals(25, $tree['*'][Directive::CRAWL_DELAY], json_encode($tree)); 39 | } 40 | 41 | public function testSavesValidCrawlDelayDecimal() { 42 | $tree = []; 43 | $line = 'Crawl-delay: 0.5'; 44 | 45 | $this->processor->process($line, $tree); 46 | 47 | $this->assertArrayHasKey('*', $tree); 48 | $this->assertArrayHasKey(Directive::CRAWL_DELAY, $tree['*']); 49 | $this->assertEquals(0.5, $tree['*'][Directive::CRAWL_DELAY], json_encode($tree)); 50 | } 51 | 52 | public function testSkipsInvalidAndLogs() { 53 | $tree = []; 54 | $line = 'Crawl-delay: thisIsNotANumber'; 55 | 56 | $this->processor->process($line, $tree); 57 | 58 | $this->assertArrayNotHasKey('*', $tree, json_encode($tree)); 59 | 60 | /** @var TestHandler $handler */ 61 | $handler = $this->processor->getLogger()->getHandlers()[0]; 62 | 63 | $this->assertTrue( 64 | $handler->hasRecord( 65 | 'crawl-delay with value thisIsNotANumber dropped as invalid for *', 66 | LogLevel::DEBUG 67 | ), 68 | stringifyLogs($handler->getRecords()) 69 | ); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /test/Parser/DirectivesProcessors/HostProcessorTest.php: 
-------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 22 | 23 | $this->processor = new HostProcessor($log); 24 | } 25 | 26 | public function tearDown(): void { 27 | $this->processor = null; 28 | } 29 | 30 | public function testAddsHostIfCorrect() { 31 | $tree = []; 32 | $line = 'Host: www.example.com'; 33 | 34 | $this->processor->process($line, $tree); 35 | 36 | $this->assertArrayHasKey('*', $tree); 37 | $this->assertArrayHasKey(Directive::HOST, $tree['*']); 38 | $this->assertContains('www.example.com', $tree['*'], json_encode($tree)); 39 | } 40 | 41 | public function testSkipsAndLogsIfIpAddressPassed() { 42 | $tree = []; 43 | $line = 'Host: 192.168.0.1'; 44 | 45 | $this->processor->process($line, $tree); 46 | 47 | $this->assertArrayNotHasKey('*', $tree); 48 | $this->assertArrayNotHasKey(Directive::HOST, $tree); 49 | 50 | /** @var TestHandler $handler */ 51 | $handler = $this->processor->getLogger()->getHandlers()[0]; 52 | 53 | $this->assertTrue( 54 | $handler->hasRecord( 55 | 'host with value 192.168.0.1 dropped for * as invalid (IP address is not a valid hostname)', 56 | LogLevel::DEBUG 57 | ), 58 | stringifyLogs($handler->getRecords()) 59 | ); 60 | } 61 | 62 | public function testSkipsAndLogsIfNotValidHost() { 63 | $tree = []; 64 | $line = 'Host: bndgang!!!@#$da12345ngda]]'; 65 | 66 | $this->processor->process($line, $tree); 67 | 68 | $this->assertArrayNotHasKey('*', $tree); 69 | $this->assertArrayNotHasKey(Directive::HOST, $tree); 70 | 71 | /** @var TestHandler $handler */ 72 | $handler = $this->processor->getLogger()->getHandlers()[0]; 73 | 74 | $this->assertTrue( 75 | $handler->hasRecord( 76 | 'host with value bndgang!!!@#$da12345ngda]] dropped for * as invalid', 77 | LogLevel::DEBUG 78 | ), 79 | stringifyLogs($handler->getRecords()) 80 | ); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- 
/test/Parser/DirectivesProcessors/SitemapProcessorTest.php: -------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 22 | 23 | $this->processor = new SitemapProcessor($log); 24 | } 25 | 26 | public function tearDown(): void { 27 | $this->processor = null; 28 | } 29 | 30 | public function testAddsSitemapDirectiveForDefaultUserAgent() { 31 | $tree = []; 32 | $line = 'Sitemap: https://www.example.com/sitemap.xml'; 33 | 34 | $this->processor->process($line, $tree); 35 | 36 | $this->assertArrayHasKey('*', $tree); 37 | $this->assertArrayHasKey(Directive::SITEMAP, $tree['*']); 38 | } 39 | 40 | public function testAddsSitemapDirectiveForCustomUserAgent() { 41 | $userAgent = 'Google'; 42 | $tree = []; 43 | $line = 'Sitemap: https://www.example.com/sitemap.xml'; 44 | 45 | $this->processor->process($line, $tree, $userAgent); 46 | 47 | $this->assertArrayHasKey('Google', $tree); 48 | $this->assertArrayHasKey(Directive::SITEMAP, $tree[$userAgent]); 49 | } 50 | 51 | public function testAddsSitemapSkipsExistingAndLogsIt() { 52 | $userAgent = 'Google'; 53 | $tree = [ 54 | $userAgent => [ 55 | Directive::SITEMAP => [ 56 | 'https://www.example.com/sitemap.xml' 57 | ] 58 | ] 59 | ]; 60 | $line = 'Sitemap: https://www.example.com/sitemap.xml'; 61 | 62 | $this->processor->process($line, $tree, $userAgent); 63 | 64 | $this->assertArrayHasKey('Google', $tree); 65 | $this->assertArrayHasKey(Directive::SITEMAP, $tree[$userAgent]); 66 | 67 | /** @var TestHandler $handler */ 68 | $handler = $this->processor->getLogger()->getHandlers()[0]; 69 | 70 | $this->assertTrue( 71 | $handler->hasRecord('sitemap with value https://www.example.com/sitemap.xml skipped as already exists for Google', LogLevel::DEBUG), 72 | stringifyLogs($handler->getRecords()) 73 | ); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /test/Parser/DirectivesProcessors/UserAgentProcessorTest.php: 
-------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 21 | 22 | $this->processor = new UserAgentProcessor($log); 23 | } 24 | 25 | public function tearDown(): void { 26 | $this->processor = null; 27 | } 28 | 29 | public function testAddsNewUserAgentSection() { 30 | $line = 'User-agent: Google'; 31 | $currentAgent = '*'; 32 | $tree = [ 33 | $currentAgent => [], 34 | ]; 35 | 36 | $this->processor->process($line, $tree, $currentAgent); 37 | 38 | $this->assertArrayHasKey('Google', $tree); 39 | $this->assertEquals('Google', $currentAgent); 40 | } 41 | 42 | public function testLogsIfNotChanged() { 43 | $line = 'User-agent: Google'; 44 | $currentAgent = 'Google'; 45 | $tree = [ 46 | $currentAgent => [], 47 | ]; 48 | 49 | $this->processor->process($line, $tree, $currentAgent); 50 | 51 | $this->assertCount(1, array_keys($tree)); 52 | 53 | /** @var TestHandler $handler */ 54 | $handler = $this->processor->getLogger()->getHandlers()[0]; 55 | 56 | $this->assertTrue( 57 | $handler->hasRecord('New useragent is equal to current one, skipping ...', LogLevel::DEBUG), 58 | stringifyLogs($handler->getRecords()) 59 | ); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /test/Parser/UserAgent/UserAgentMatcherTest.php: -------------------------------------------------------------------------------- 1 | pushHandler(new TestHandler(LogLevel::DEBUG)); 19 | 20 | $matcher = new UserAgentMatcher($logger); 21 | 22 | $match = $matcher->getMatching('Google', ['Google']); 23 | $this->assertEquals('Google', $match); 24 | 25 | $handler = $logger->getHandlers()[0]; 26 | 27 | $this->assertTrue( 28 | $handler->hasRecord("Matched Google for user agent Google", LogLevel::DEBUG), 29 | stringifyLogs($handler->getRecords()) 30 | ); 31 | } 32 | 33 | public function testLogsWhenNotMatched() { 34 | $logger = new Logger(static::class); 35 | $logger->pushHandler(new 
TestHandler(LogLevel::DEBUG)); 36 | 37 | $matcher = new UserAgentMatcher($logger); 38 | 39 | $match = $matcher->getMatching('Google', []); 40 | $this->assertEquals('*', $match); 41 | 42 | $handler = $logger->getHandlers()[0]; 43 | 44 | $this->assertTrue( 45 | $handler->hasRecord("Failed to match user agent 'Google', falling back to '*'", LogLevel::DEBUG), 46 | stringifyLogs($handler->getRecords()) 47 | ); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /test/RenderTest.php: -------------------------------------------------------------------------------- 1 | markTestSkipped('@TODO'); 16 | 17 | $parser = new RobotsTxtParser($robotsTxtContent); 18 | 19 | $this->assertEquals($rendered, $parser->render("\n")); 20 | } 21 | 22 | /** 23 | * Generate test data 24 | * 25 | * @return array 26 | */ 27 | public function generateDataForTest() 28 | { 29 | return [ 30 | [ 31 | <<pushHandler(new TestHandler(LogLevel::DEBUG)); 16 | 17 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . 
'/Fixtures/wikipedia-org.txt', 'r')); 18 | $this->parser->setLogger($log); 19 | } 20 | 21 | public function tearDown(): void { 22 | $this->parser = null; 23 | } 24 | 25 | public function testGetRulesAll() { 26 | $rules = $this->parser->getRules(); 27 | 28 | // should be all 33 UAs on top level 29 | $this->assertArrayHasKey("MJ12bot", $rules); 30 | $this->assertArrayHasKey("Mediapartners-Google*", $rules); 31 | $this->assertArrayHasKey("IsraBot", $rules); 32 | $this->assertArrayHasKey("Orthogaffe", $rules); 33 | $this->assertArrayHasKey("UbiCrawler", $rules); 34 | $this->assertArrayHasKey("DOC", $rules); 35 | $this->assertArrayHasKey("Zao", $rules); 36 | $this->assertArrayHasKey("sitecheck.internetseer.com", $rules); 37 | $this->assertArrayHasKey("Zealbot", $rules); 38 | $this->assertArrayHasKey("MSIECrawler", $rules); 39 | $this->assertArrayHasKey("SiteSnagger", $rules); 40 | $this->assertArrayHasKey("WebStripper", $rules); 41 | $this->assertArrayHasKey("WebCopier", $rules); 42 | $this->assertArrayHasKey("Fetch", $rules); 43 | $this->assertArrayHasKey("Offline Explorer", $rules); 44 | $this->assertArrayHasKey("Teleport", $rules); 45 | $this->assertArrayHasKey("TeleportPro", $rules); 46 | $this->assertArrayHasKey("WebZIP", $rules); 47 | $this->assertArrayHasKey("linko", $rules); 48 | $this->assertArrayHasKey("HTTrack", $rules); 49 | $this->assertArrayHasKey("Microsoft.URL.Control", $rules); 50 | $this->assertArrayHasKey("Xenu", $rules); 51 | $this->assertArrayHasKey("larbin", $rules); 52 | $this->assertArrayHasKey("libwww", $rules); 53 | $this->assertArrayHasKey("ZyBORG", $rules); 54 | $this->assertArrayHasKey("Download Ninja", $rules); 55 | $this->assertArrayHasKey("fast", $rules); 56 | $this->assertArrayHasKey("wget", $rules); 57 | $this->assertArrayHasKey("grub-client", $rules); 58 | $this->assertArrayHasKey("k2spider", $rules); 59 | $this->assertArrayHasKey("NPBot", $rules); 60 | $this->assertArrayHasKey("WebReaper", $rules); 61 | $this->assertArrayHasKey("*", 
$rules); 62 | } 63 | 64 | public function testTreeBuildOnlyOnce() { 65 | $this->parser->getRules(); 66 | $this->parser->getRules(); 67 | $this->parser->getRules(); 68 | $this->parser->getRules(); 69 | 70 | /** @var TestHandler $handler */ 71 | $handler = $this->parser->getLogger()->getHandlers()[0]; 72 | 73 | $treeCreateRecords = array_filter($handler->getRecords(), function(array $log) { 74 | return $log['message'] === 'Building directives tree...'; 75 | }); 76 | 77 | $this->assertCount(1, $treeCreateRecords); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /test/Stream/Filter/SkipCommentedLinesFilterTest.php: -------------------------------------------------------------------------------- 1 | assertContains(SkipCommentedLinesFilter::NAME, stream_get_filters()); 24 | } 25 | 26 | public function testFilter() { 27 | $stream = fopen(__DIR__ . '/../../Fixtures/with-commented-lines.txt','r'); 28 | 29 | // apply filter 30 | stream_filter_append($stream, SkipCommentedLinesFilter::NAME); 31 | 32 | $fstat = fstat($stream); 33 | $contents = fread($stream, $fstat['size']); 34 | 35 | // check commented not there 36 | $this->assertStringNotContainsString('# Disallow: /tech', $contents); 37 | $this->assertStringNotContainsString('# this is a commented line', $contents); 38 | $this->assertStringNotContainsString('# it should not be in the iterator', $contents); 39 | 40 | fclose($stream); 41 | } 42 | 43 | public function testFilterLargeSet() { 44 | $stream = fopen(__DIR__ . 
'/../../Fixtures/large-commented-lines.txt','r'); 45 | 46 | // apply filter 47 | stream_filter_append($stream, SkipCommentedLinesFilter::NAME); 48 | 49 | $fstat = fstat($stream); 50 | $contents = fread($stream, $fstat['size']); 51 | 52 | // check commented not there 53 | $this->assertStringNotContainsString('# Lorem ipsum dolor sit amet,', $contents); 54 | 55 | fclose($stream); 56 | } 57 | 58 | public function testFilterWithLogger() { 59 | $log = new Logger(static::class); 60 | $log->pushHandler(new TestHandler(LogLevel::DEBUG)); 61 | 62 | $stream = fopen(__DIR__ . '/../../Fixtures/large-commented-lines.txt','r'); 63 | 64 | // apply filter 65 | stream_filter_append($stream, SkipCommentedLinesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]); 66 | 67 | $fstat = fstat($stream); 68 | $contents = fread($stream, $fstat['size']); 69 | 70 | /** @var TestHandler $handler */ 71 | $handler = $log->getHandlers()[0]; 72 | 73 | $messagesOnly = array_map( 74 | function(array $record) { return $record['message']; }, 75 | $handler->getRecords() 76 | ); 77 | 78 | $expected = require __DIR__ . '/../../Fixtures/expected-skipped-lines-log.php'; 79 | 80 | $this->assertNotEmpty($contents); 81 | $this->assertEquals($messagesOnly, $expected); 82 | 83 | fclose($stream); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /test/Stream/Filter/SkipDirectivesWithInvalidValuesFilterTest.php: -------------------------------------------------------------------------------- 1 | assertContains(SkipDirectivesWithInvalidValuesFilter::NAME, stream_get_filters()); 21 | } 22 | 23 | /** 24 | * @TODO 25 | */ 26 | public function testFilter() { 27 | $stream = fopen(__DIR__ . 
'/../../Fixtures/with-invalid-request-rate.txt','r'); 28 | 29 | // apply filter 30 | stream_filter_append($stream, SkipDirectivesWithInvalidValuesFilter::NAME); 31 | 32 | $fstat = fstat($stream); 33 | $contents = fread($stream, $fstat['size']); 34 | 35 | // check other rules are still in place 36 | $this->assertStringContainsString('Useragent: GoogleBot', $contents); 37 | 38 | // check faulty removed 39 | $this->assertStringNotContainsString('Crawl-delay: ngfsngdndag', $contents); 40 | // $this->assertStringNotContainsString('Crawl-delay: 0.vfsbfsb # invalid', $contents); 41 | $this->assertStringNotContainsString('Request-rate: 100/bgdndgnd # invalid', $contents); 42 | $this->assertStringNotContainsString('Request-rate: 15686 # invalid', $contents); 43 | $this->assertStringNotContainsString('Request-rate: ngdndganda # invalid', $contents); 44 | 45 | fclose($stream); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /test/Stream/Filter/SkipEmptyLinesFilterTest.php: -------------------------------------------------------------------------------- 1 | assertContains(SkipEmptyLinesFilter::NAME, stream_get_filters()); 24 | } 25 | 26 | public function testFilter() { 27 | $beforeLines = 0; 28 | $afterLines = 0; 29 | 30 | $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-lines.txt','r'); 31 | 32 | while (!feof($stream)) { 33 | fgets($stream); 34 | $beforeLines++; 35 | } 36 | 37 | rewind($stream); 38 | 39 | // apply filter 40 | stream_filter_append($stream, SkipEmptyLinesFilter::NAME); 41 | 42 | $contents = ""; 43 | 44 | while (!feof($stream)) { 45 | $contents .= fgets($stream); 46 | $afterLines++; 47 | } 48 | 49 | $this->assertNotEquals("", $contents); 50 | $this->assertTrue($afterLines < $beforeLines); 51 | 52 | fclose($stream); 53 | } 54 | 55 | public function testFilterEmptyFirst() { 56 | $stream = fopen(__DIR__ . 
'/../../Fixtures/with-empty-lines.txt','r'); 57 | 58 | // apply filter 59 | stream_filter_append($stream, SkipEmptyLinesFilter::NAME); 60 | 61 | $lines = []; 62 | 63 | while (!feof($stream)) { 64 | $lines[] = fgets($stream); 65 | } 66 | 67 | $this->assertNotEmpty($lines); 68 | $this->assertNotEmpty($lines[0]); 69 | 70 | fclose($stream); 71 | } 72 | 73 | public function testFilterWithLogger() { 74 | $log = new Logger(static::class); 75 | $log->pushHandler(new TestHandler(LogLevel::DEBUG)); 76 | 77 | $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-lines.txt','r'); 78 | 79 | // apply filter 80 | stream_filter_append($stream, SkipEmptyLinesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]); 81 | 82 | // do read 83 | $lines = []; 84 | while (!feof($stream)) { 85 | $lines[] = fgets($stream); 86 | } 87 | 88 | /** @var TestHandler $handler */ 89 | $handler = $log->getHandlers()[0]; 90 | 91 | $this->assertNotEmpty($lines); 92 | $this->assertTrue( 93 | $handler->hasRecord('3 lines skipped as empty.', LogLevel::DEBUG), 94 | stringifyLogs($handler->getRecords()) 95 | ); 96 | fclose($stream); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /test/Stream/Filter/SkipEndOfCommentedLineFilterTest.php: -------------------------------------------------------------------------------- 1 | assertContains(SkipEndOfCommentedLineFilter::NAME, stream_get_filters()); 24 | } 25 | 26 | public function testFilter() { 27 | $stream = fopen(__DIR__ . 
'/../../Fixtures/with-commented-line-endings.txt','r'); 28 | 29 | // apply filter 30 | stream_filter_append($stream, SkipEndOfCommentedLineFilter::NAME); 31 | 32 | $fstat = fstat($stream); 33 | $contents = fread($stream, $fstat['size']); 34 | 35 | // check commented not there 36 | $this->assertStringNotContainsString('# ds', $contents); 37 | $this->assertStringNotContainsString('# low: /tech', $contents); 38 | $this->assertStringNotContainsString('#: /tech # ds', $contents); 39 | 40 | // should keep valid entries 41 | $this->assertStringContainsString('Disallow: /comment-after', $contents); 42 | 43 | fclose($stream); 44 | } 45 | 46 | public function testFilterWithLogger() { 47 | $log = new Logger(static::class); 48 | $log->pushHandler(new TestHandler(LogLevel::DEBUG)); 49 | 50 | $stream = fopen(__DIR__ . '/../../Fixtures/with-commented-line-endings.txt','r'); 51 | 52 | // apply filter 53 | stream_filter_append($stream, SkipEndOfCommentedLineFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]); 54 | 55 | // do read 56 | $lines = []; 57 | while (!feof($stream)) { 58 | $lines[] = fgets($stream); 59 | } 60 | 61 | /** @var TestHandler $handler */ 62 | $handler = $log->getHandlers()[0]; 63 | 64 | $this->assertNotEmpty($lines); 65 | $this->assertTrue( 66 | $handler->hasRecord('5 char(s) dropped as commented out', LogLevel::DEBUG), 67 | stringifyLogs($handler->getRecords()) 68 | ); 69 | fclose($stream); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /test/Stream/Filter/SkipUnsupportedDirectivesTest.php: -------------------------------------------------------------------------------- 1 | assertContains(SkipUnsupportedDirectivesFilter::NAME, stream_get_filters()); 24 | } 25 | 26 | public function testFilter() { 27 | $stream = fopen(__DIR__ . 
'/../../Fixtures/with-faulty-directives.txt','r'); 28 | 29 | // apply filter 30 | stream_filter_append($stream, SkipUnsupportedDirectivesFilter::NAME); 31 | 32 | $fstat = fstat($stream); 33 | $contents = fread($stream, $fstat['size']); 34 | 35 | $this->assertStringNotContainsString('Disallow /admin/ # prohibits links from the admin panel', $contents); 36 | $this->assertStringNotContainsString('dis@llow: /admin/ # prohibits links from the admin panel', $contents); 37 | $this->assertStringNotContainsString('cleanParam: ref /some_dir/get_book.pl', $contents); 38 | $this->assertStringNotContainsString('User#agent: google4 #specifies the robots that the directives are set for', $contents); 39 | $this->assertStringNotContainsString('Disa#low: /bin/ # prohibits links from the Shopping Cart.', $contents); 40 | $this->assertStringNotContainsString('Disa#low: /search/ # prohibits page links of the search embedded on the site', $contents); 41 | $this->assertStringNotContainsString('Disa#low: /admin/ # prohibits links from the admin panel', $contents); 42 | $this->assertStringNotContainsString('Site#ap: http://example.com/sitemap # specifies the path to the site\'s Sitemap file for the robot', $contents); 43 | $this->assertStringNotContainsString('Clean#param: ref /some_dir/get_book.pl', $contents); 44 | 45 | fclose($stream); 46 | } 47 | 48 | public function testFilterWithLogger() { 49 | $log = new Logger(static::class); 50 | $log->pushHandler(new TestHandler(LogLevel::DEBUG)); 51 | 52 | $stream = fopen(__DIR__ . 
'/../../Fixtures/with-faulty-directives.txt', 'r'); 53 | 54 | // apply filter 55 | stream_filter_append($stream, SkipUnsupportedDirectivesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]); 56 | 57 | $fstat = fstat($stream); 58 | $contents = fread($stream, $fstat['size']); 59 | 60 | /** @var TestHandler $handler */ 61 | $handler = $log->getHandlers()[0]; 62 | 63 | $this->assertNotEmpty($contents); 64 | $this->assertTrue( 65 | $handler->hasRecord('9 lines skipped as un-supported', LogLevel::DEBUG), 66 | stringifyLogs($handler->getRecords()) 67 | ); 68 | 69 | fclose($stream); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /test/Stream/Filter/TrimSpacesLeftAndRightFilterTest.php: -------------------------------------------------------------------------------- 1 | assertContains(TrimSpacesLeftFilter::NAME, stream_get_filters()); 21 | } 22 | 23 | public function testFilter() { 24 | $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-and-whitespace.txt', 'r'); 25 | 26 | // apply filter 27 | stream_filter_append($stream, TrimSpacesLeftFilter::NAME); 28 | 29 | $fstat = fstat($stream); 30 | $contents = fread($stream, $fstat['size']); 31 | 32 | $this->assertStringNotContainsString(' Crawl-Delay: 0.9', $contents); 33 | $this->assertStringContainsString('Crawl-Delay: 0.9', $contents); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /test/Stream/ReaderTest.php: -------------------------------------------------------------------------------- 1 | getContentIterated(); 17 | 18 | foreach ($generator as $line) { 19 | $this->assertNotEmpty($line); 20 | $this->assertStringNotContainsString('#', $line); 21 | } 22 | } 23 | 24 | public function testGetContentYaMarket() { 25 | $reader = GeneratorBasedReader::fromStream(fopen(__DIR__ . 
'./../Fixtures/market-yandex-ru.txt', 'r')); 26 | $generator = $reader->getContentIterated(); 27 | 28 | foreach ($generator as $idx => $line) { 29 | $this->assertNotEmpty($line); 30 | $this->assertStringNotContainsString('#', $line); 31 | 32 | switch ($idx) { 33 | case '329': 34 | $this->assertStringContainsString('Sitemap', $line); 35 | break; 36 | 37 | case '330': 38 | $this->assertStringContainsString('Host', $line); 39 | break; 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /test/UnlistedPathTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isAllowed("/")); 24 | $this->assertFalse($parser->isDisallowed("/")); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /test/UserAgentTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($parser->isAllowed("/")); 22 | $this->assertTrue($parser->isAllowed("/article")); 23 | $this->assertTrue($parser->isDisallowed("/temp")); 24 | 25 | $this->assertFalse($parser->isDisallowed("/")); 26 | $this->assertFalse($parser->isDisallowed("/article")); 27 | $this->assertFalse($parser->isAllowed("/temp")); 28 | 29 | $this->assertTrue($parser->isAllowed("/foo", "agentU/2.0.1")); 30 | $this->assertTrue($parser->isDisallowed("/bar", "agentU/2.0.1")); 31 | 32 | $this->assertTrue($parser->isDisallowed("/foo", "agentV")); 33 | $this->assertTrue($parser->isAllowed("/bar", "agentV")); 34 | $this->assertTrue($parser->isDisallowed("/foo", "agentW")); 35 | $this->assertTrue($parser->isAllowed("/bar", "agentW")); 36 | 37 | $this->assertTrue($parser->isAllowed("/temp", "spiderX/1.0")); 38 | $this->assertTrue($parser->isDisallowed("/assets", "spiderX/1.0")); 39 | $this->assertTrue($parser->isAllowed("/forum", "spiderX/1.0")); 40 | 41 | $this->assertFalse($parser->isDisallowed("/temp", "spiderX/1.0")); 42 | 
$this->assertFalse($parser->isAllowed("/assets", "spiderX/1.0")); 43 | $this->assertFalse($parser->isDisallowed("/forum", "spiderX/1.0")); 44 | 45 | $this->assertTrue($parser->isDisallowed("/", "botY-test")); 46 | $this->assertTrue($parser->isAllowed("/forum/", "botY-test")); 47 | $this->assertTrue($parser->isDisallowed("/forum/topic", "botY-test")); 48 | $this->assertTrue($parser->isDisallowed("/public", "botY-test")); 49 | 50 | $this->assertFalse($parser->isAllowed("/", "botY-test")); 51 | $this->assertFalse($parser->isDisallowed("/forum/", "botY-test")); 52 | $this->assertFalse($parser->isAllowed("/forum/topic", "botY-test")); 53 | $this->assertFalse($parser->isAllowed("/public", "botY-test")); 54 | 55 | $this->assertTrue($parser->isAllowed("/", "crawlerZ")); 56 | $this->assertTrue($parser->isDisallowed("/forum", "crawlerZ")); 57 | $this->assertTrue($parser->isDisallowed("/public", "crawlerZ")); 58 | 59 | $this->assertFalse($parser->isDisallowed("/", "crawlerZ")); 60 | $this->assertFalse($parser->isAllowed("/forum", "crawlerZ")); 61 | $this->assertFalse($parser->isAllowed("/public", "crawlerZ")); 62 | } 63 | 64 | /** 65 | * Generate test case data 66 | * @return array 67 | */ 68 | public function generateDataForTest() 69 | { 70 | return array( 71 | array( 72 | " 73 | User-agent: * 74 | Disallow: /admin 75 | Disallow: /temp 76 | Disallow: /forum 77 | 78 | User-agent: agentU/2.0 79 | Disallow: /bar 80 | Allow: /foo 81 | 82 | User-agent: agentV 83 | User-agent: agentW 84 | Disallow: /foo 85 | Allow: /bar 86 | 87 | User-agent: spiderX 88 | Disallow: 89 | Disallow: /admin 90 | Disallow: /assets 91 | 92 | User-agent: botY 93 | Disallow: / 94 | Allow: /forum/$ 95 | Allow: /article 96 | 97 | User-agent: crawlerZ 98 | Disallow: 99 | Disallow: / 100 | Allow: /$ 101 | " 102 | ) 103 | ); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /test/WhitespacesTest.php: 
-------------------------------------------------------------------------------- 1 | getRules('*'); 19 | 20 | $this->assertNotEmpty($rules, 'expected rules for *'); 21 | $this->assertArrayHasKey('disallow', $rules); 22 | $this->assertNotEmpty($rules['disallow'], 'disallow failed'); 23 | $this->assertArrayHasKey('allow', $rules); 24 | $this->assertNotEmpty($rules['allow'], 'allow failed'); 25 | } 26 | 27 | /** 28 | * Generate test case data 29 | * @return array 30 | */ 31 | public function generateDataForTest() { 32 | return [ 33 | [ 34 | " 35 | User-agent: * 36 | Disallow : /admin 37 | Allow : /admin/front 38 | ", 39 | ], 40 | ]; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /test/bootstrap.php: -------------------------------------------------------------------------------- 1 | json_encode( 10 | array_map('extractMessageFromRecord', $handlerRecords), 11 | JSON_PRETTY_PRINT 12 | ) 13 | ]); 14 | } 15 | --------------------------------------------------------------------------------