23 | *
24 | * Logic schema and signals:
25 | * @link https://docs.google.com/document/d/1_rNjxpnUUeJG13ap6cnXM6Sx9ZQtd1ngADXnW9SHJSE
26 | *
27 | * Specifications:
28 | * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
29 | * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml
30 | * @link http://www.robotstxt.org/
31 | * @link http://www.w3.org/TR/html4/appendix/notes.html
32 | *
33 | * Useful links and materials:
34 | * @link http://www.the-art-of-web.com/php/parse-robots/
35 | * @link http://socoder.net/index.php?snippet=23824
36 | */
37 | class RobotsTxtParser implements LoggerAwareInterface {
38 |
39 | use LogsIfAvailableTrait;
40 |
41 | // default encoding
42 | const DEFAULT_ENCODING = 'UTF-8';
43 |
44 | // rules set
45 | protected $rules = [];
46 |
47 | // host set
48 | protected $host = null;
49 |
50 | // robots.txt http status code
51 | protected ?int $httpStatusCode;
52 |
53 | // url
54 | private $url = null;
55 |
56 | // UserAgent
57 | private $userAgent = '*';
58 |
59 | // robots.txt file content
60 | private $content = '';
61 | private string $encoding = '';
62 |
63 | private array $tree = [];
64 | private ?ReaderInterface $reader;
65 | private ?TreeBuilderInterface $treeBuilder;
66 | private ?UserAgentMatcherInterface $userAgentMatcher;
67 |
/**
 * Stores the collaborators and substitutes default implementations for the
 * reader and the user-agent matcher when none are supplied.
 *
 * @param string|resource                $content          robots.txt body or an open stream resource holding it
 * @param string                         $encoding         encoding of the content
 * @param TreeBuilderInterface|null      $treeBuilder      custom tree builder
 * @param ReaderInterface|null           $reader           custom reader; when given, $content is not read here
 * @param UserAgentMatcherInterface|null $userAgentMatcher custom user-agent matcher
 */
public function __construct(
    $content,
    string $encoding = self::DEFAULT_ENCODING,
    ?TreeBuilderInterface $treeBuilder = null,
    ?ReaderInterface $reader = null,
    ?UserAgentMatcherInterface $userAgentMatcher = null
) {
    $this->encoding = $encoding;
    $this->treeBuilder = $treeBuilder;
    $this->reader = $reader;
    $this->userAgentMatcher = $userAgentMatcher;

    if ($this->reader === null) {
        $this->log('Reader is not passed, using a default one...');

        // streams are consumed as they are, anything else is treated as a string
        if (is_resource($content)) {
            $this->reader = GeneratorBasedReader::fromStream($content);
        } else {
            $this->reader = GeneratorBasedReader::fromString($content);
        }
    }

    if ($this->userAgentMatcher === null) {
        $this->log('UserAgentMatcher is not passed, using a default one...');

        $this->userAgentMatcher = new UserAgentMatcher();
    }
}
94 |
/**
 * Parses the content into the rules tree unless a non-empty tree already exists.
 *
 * NOTE(review): an empty parse result is indistinguishable from "not built yet",
 * so an empty tree is rebuilt on every call — confirm whether that is intended.
 */
private function buildTree() {
    if ($this->tree !== []) {
        return;
    }

    $needsConversion = $this->encoding !== static::DEFAULT_ENCODING;
    if ($needsConversion) {
        $this->reader->setEncoding($this->encoding);
    }

    // fall back to the default tree builder when none was injected
    if ($this->treeBuilder === null) {
        $this->log('Creating a default tree builder as none passed...');

        $processors = DirectiveProcessorsFactory::getDefault($this->logger);
        $this->treeBuilder = new TreeBuilder($processors, $this->logger);
    }

    $lines = $this->reader->getContentIterated();
    $this->treeBuilder->setContent($lines);
    $this->tree = $this->treeBuilder->build();
}
117 |
/**
 * Returns the logger currently attached to the parser, if any.
 *
 * @return LoggerInterface|null
 */
public function getLogger(): ?LoggerInterface {
    return $this->logger;
}
121 |
/**
 * Attaches a logger to the parser and hands it over to the reader and the
 * user-agent matcher when they are logger-aware.
 *
 * @param LoggerInterface $logger
 */
public function setLogger(LoggerInterface $logger): void {
    $this->logger = $logger;

    foreach ([$this->reader, $this->userAgentMatcher] as $collaborator) {
        if ($collaborator instanceof LoggerAwareInterface) {
            $collaborator->setLogger($this->logger);
        }
    }
}
133 |
/**
 * Validate host name by delegating to HostName::isValid()
 *
 * @param string $host
 *
 * @return bool
 */
private static function isValidHostName(string $host): bool {
    $isValid = HostName::isValid($host);

    return $isValid;
}
137 |
/**
 * Validate URL scheme by delegating to Url::isValidScheme()
 *
 * @param string $scheme
 *
 * @return bool
 */
private static function isValidScheme($scheme) {
    $isValid = Url::isValidScheme($scheme);

    return $isValid;
}
148 |
/**
 * Parse URL
 *
 * Splits the URL with parse_url(), rejects it when the scheme or host is
 * missing or invalid, fills in the port from the scheme when absent and adds
 * a "custom" entry holding the path (default "/") plus the query string.
 *
 * @param string $url
 *
 * @return array|false parsed components, or false when the URL is rejected
 */
protected function parseURL($url) {
    $parts = parse_url($url);
    if ($parts === false) {
        return false;
    }

    if (!isset($parts['scheme']) || !$this->isValidScheme($parts['scheme'])) {
        return false;
    }

    if (!isset($parts['host']) || !$this->isValidHostName($parts['host'])) {
        return false;
    }

    if (!isset($parts['port'])) {
        // resolve the default port of the scheme; an unknown service yields a non-int
        $parts['port'] = getservbyname($parts['scheme'], 'tcp');
        if (!is_int($parts['port'])) {
            return false;
        }
    }

    $path = $parts['path'] ?? '/';
    $query = isset($parts['query']) ? '?' . $parts['query'] : '';
    $parts['custom'] = $path . $query;

    return $parts;
}
177 |
/**
 * Explode Clean-Param rule
 *
 * Splits a rule of the form "p0[&p1&...&pn] [path]" into the list of parameter
 * names and the path prefix. When no path is given, the prefix defaults to "/*".
 *
 * @param string $rule
 *
 * @return array array with the keys "path" (string) and "param" (string[])
 */
private function explodeCleanParamRule($rule) {
    // collapse runs of whitespace so the rule splits into at most two parts
    $rule = preg_replace('/\s+/S', ' ', $rule);
    // split into parameter and path
    $parts = explode(' ', $rule, 2);

    $cleanParam = [];
    if (isset($parts[1])) {
        // Keep only the characters allowed in a path prefix. The hyphen is placed last
        // in the class so it is a literal; written as "\.-\/" it formed a range from
        // "." to "/" and hyphens were silently removed from the path.
        // NOTE(review): encode_url() is not defined in the visible part of this class — confirm it exists.
        $cleanParam['path'] = $this->encode_url(preg_replace('/[^A-Za-z0-9.\/*_-]/', '', $parts[1]));
    } else {
        $cleanParam['path'] = '/*';
    }

    $cleanParam['param'] = array_map('trim', explode('&', $parts[0]));

    return $cleanParam;
}
199 |
/**
 * Set the HTTP status code
 *
 * Codes outside of the 100-599 range are logged as a warning and ignored.
 *
 * @param int $code
 *
 * @return bool true when the code was stored, false when it was rejected
 */
public function setHttpStatusCode(int $code): bool {
    $inRange = is_int($code) && $code >= 100 && $code <= 599;

    if ($inRange) {
        $this->httpStatusCode = $code;

        return true;
    }

    $this->log('Invalid HTTP status code, not taken into account.', ['code' => $code], LogLevel::WARNING);

    return false;
}
217 |
/**
 * Check url wrapper
 *
 * @param string      $url       - url to check
 * @param string|null $userAgent - which robot to check for; null is treated as "*"
 *
 * @return bool
 */
public function isAllowed(string $url, ?string $userAgent = '*'): bool {
    $this->buildTree();

    $url = new Url($url);
    if ($this->logger !== null) {
        $url->setLogger($this->logger);
    }

    // checkRules() only accepts a string, so an explicit null must fall back to the wildcard
    return $this->checkRules(Directive::ALLOW, $url->getPath(), $userAgent ?? '*');
}
226 |
/**
 * Set UserAgent
 *
 * No longer supported: every call raises an exception.
 *
 * @param string $userAgent
 *
 * @return void
 * @throws \RuntimeException always
 * @deprecated please check rules for exact user agent instead
 */
public function setUserAgent(string $userAgent) {
    $reason = WarmingMessages::SET_UA_DEPRECATED;

    throw new \RuntimeException($reason);
}
238 |
/**
 * Check rules
 *
 * Walks the Disallow rules and then the Allow rules of the matching user-agent
 * group; every matching rule overwrites the verdict, so the last match decides.
 *
 * @param string $rule      - rule to check
 * @param string $path      - path to check
 * @param string $userAgent - which robot to check for
 *
 * @return bool
 */
protected function checkRules(string $rule, string $path, string $userAgent = '*'): bool {
    // a disallowing http status code overrides everything else
    if ($this->checkHttpStatusCodeRule()) {
        return Directive::DISALLOW === $rule;
    }

    // allowed by default
    $verdict = Directive::ALLOW === $rule;
    $group = $this->userAgentMatcher->getMatching($userAgent, array_keys($this->tree));

    foreach ([Directive::DISALLOW, Directive::ALLOW] as $directive) {
        $patterns = $this->tree[$group][$directive] ?? null;
        if ($patterns === null) {
            continue;
        }

        foreach ($patterns as $pattern) {
            if ($this->checkRuleSwitch($pattern, $path)) {
                // rule match
                $verdict = ($directive === $rule);
            }
        }
    }

    return $verdict;
}
274 |
/**
 * Check HTTP status code rule
 *
 * @return bool true when a stored status code lies in the 500-599 range
 */
private function checkHttpStatusCodeRule(): bool {
    // the property stays uninitialized until setHttpStatusCode() succeeds
    if (!isset($this->httpStatusCode)) {
        return false;
    }

    if ($this->httpStatusCode < 500 || $this->httpStatusCode > 599) {
        return false;
    }

    $this->log("Disallowed by HTTP status code {$this->httpStatusCode}");

    return true;
}
288 |
/**
 * Dispatches a rule to the matching checker: inline Clean-param and Host
 * directives have dedicated checks, everything else is matched as a path rule.
 *
 * @param string $rule
 * @param string $path
 *
 * @return bool
 */
protected function checkRuleSwitch(string $rule, string $path): bool {
    switch (Directive::attemptGetInline($rule)) {
        case Directive::CLEAN_PARAM:
            // return the result directly: falling out of the switch would return nothing
            return (bool) $this->checkCleanParamRule(Directive::stripInline($rule), $path);

        case Directive::HOST:
            return (bool) $this->checkHostRule(Directive::stripInline($rule));

        default:
            return $this->checkBasicRule($rule, $path);
    }
}
308 |
/**
 * Check Clean-Param rule
 *
 * The rule matches when its path prefix matches the given path and every
 * listed parameter occurs in the path as "?name=" or "&name=".
 *
 * @param string $rule
 * @param string $path
 *
 * @return bool
 */
private function checkCleanParamRule($rule, $path) {
    $cleanParam = $this->explodeCleanParamRule($rule);

    // check if path prefix matches the path of the url we're checking
    if (!$this->checkBasicRule($cleanParam['path'], $path)) {
        return false;
    }

    foreach ($cleanParam['param'] as $param) {
        // compare against false explicitly: an occurrence at offset 0 is still a match
        if (strpos($path, "?$param=") === false
            && strpos($path, "&$param=") === false
        ) {
            return false;
        }
    }

    $this->log('Rule match: ' . Directive::CLEAN_PARAM . ' directive');

    return true;
}
333 |
/**
 * Check basic rule
 *
 * Converts the rule into a regular expression and tests the path against it.
 */
private function checkBasicRule(string $rule, string $path): bool {
    // "@" is the pattern delimiter, so it has to be escaped inside the pattern
    $pattern = '@' . str_replace('@', '\@', $this->prepareRegexRule($rule)) . '@';

    if (!preg_match($pattern, $path)) {
        return false;
    }

    $this->log('Rule match: Path');

    return true;
}
349 |
/**
 * Converts a robots.txt path rule into a regular expression body (without delimiters).
 *
 * - "*" becomes ".*", a trailing "$" becomes the end anchor;
 * - every other regex metacharacter is escaped, so a rule such as "/c++/" or
 *   "/a(b)" is matched literally instead of being read as regex syntax;
 * - rules ending with "/", "=" or "?" get ".*" appended;
 * - the result is anchored at the start unless it begins with a wildcard.
 *
 * @param string $value raw rule
 *
 * @return string
 */
protected function prepareRegexRule(string $value): string {
    // strtr() replaces in a single pass, so backslashes added by one replacement are never escaped again
    $value = strtr($value, [
        '\\' => '\\\\',
        '$'  => '\$',
        '?'  => '\?',
        '.'  => '\.',
        '*'  => '.*',
        '['  => '\[',
        ']'  => '\]',
        '+'  => '\+',
        '('  => '\(',
        ')'  => '\)',
        '{'  => '\{',
        '}'  => '\}',
        '|'  => '\|',
        '^'  => '\^',
    ]);

    // a trailing "$" marks the end of the URL: turn the escaped form back into an anchor
    if (mb_strlen($value) > 2 && mb_substr($value, -2) === '\$') {
        $value = mb_substr($value, 0, -2) . '$';
    }

    if (in_array(mb_substr($value, -1), ['/', '=', '?'], true)) {
        $value .= '.*';
    }

    if (substr($value, 0, 2) !== '.*') {
        $value = '^' . $value;
    }

    return $value;
}
370 |
/**
 * Check Host rule
 *
 * Compares the host named by the rule with the host of the URL stored on the
 * parser, accepting the forms "host", "host:port", "scheme://host" and
 * "scheme://host:port". The comparison is case-insensitive.
 *
 * @param string $rule
 *
 * @return bool false when no URL is set, the URL cannot be parsed or the host differs
 */
private function checkHostRule($rule) {
    if (!isset($this->url)) {
        $error_msg = WarmingMessages::INLINED_HOST;
        $this->log($error_msg, [], LogLevel::ERROR);
        return false;
    }

    $url = $this->parseURL($this->url);
    if ($url === false) {
        // parseURL() rejects URLs without a valid scheme/host; there is nothing to compare against
        $this->log('Unable to parse the URL to check the ' . Directive::HOST . ' directive against', [], LogLevel::WARNING);
        return false;
    }

    $host = trim(str_ireplace(Directive::HOST . ':', '', mb_strtolower($rule)));

    // the rule is lower-cased above, so the URL parts must be lower-cased as well
    $urlScheme = mb_strtolower($url['scheme']);
    $urlHost = mb_strtolower($url['host']);
    $candidates = [
        $urlHost,
        $urlHost . ':' . $url['port'],
        $urlScheme . '://' . $urlHost,
        $urlScheme . '://' . $urlHost . ':' . $url['port'],
    ];

    if (!in_array($host, $candidates, true)) {
        return false;
    }

    $this->log('Rule match: ' . Directive::HOST . ' directive');

    return true;
}
400 |
/**
 * Check url wrapper
 *
 * Builds the rules tree on demand and checks the path of the URL against the
 * Disallow semantics.
 *
 * @param string $url       - url to check
 * @param string $userAgent - which robot to check for
 *
 * @return bool
 */
public function isDisallowed(string $url, string $userAgent = '*'): bool {
    $this->buildTree();

    $url = new Url($url);
    if ($this->logger !== null) {
        $url->setLogger($this->logger);
    }

    $path = $url->getPath();

    return $this->checkRules(Directive::DISALLOW, $path, $userAgent);
}
417 |
/**
 * Returns the delay configured for the given user agent.
 *
 * The cache-delay types fall back to the crawl-delay value when no cache delay
 * is set; 0 is returned when nothing is found.
 *
 * @param string $userAgent key of the user-agent group in the rules tree
 * @param string $type      requested delay directive
 *
 * @return mixed the stored delay value, or 0
 */
public function getDelay(string $userAgent = "*", string $type = Directive::CRAWL_DELAY) {
    $this->buildTree();

    $directive = Directive::CRAWL_DELAY;
    if (in_array($type, [Directive::CACHE, Directive::CACHE_DELAY])) {
        $directive = Directive::CACHE_DELAY;
    }

    $group = $this->tree[$userAgent] ?? [];

    // delay for the requested directive
    if (isset($group[$directive])) {
        return $group[$directive];
    }

    if (isset($group[Directive::CRAWL_DELAY])) {
        $this->log("{$directive} directive (unofficial): Not found, fallback to " . Directive::CRAWL_DELAY . " directive");
        return $group[Directive::CRAWL_DELAY];
    }

    $this->log("$directive directive: Not found");

    return 0;
}
439 |
/**
 * Returns the Clean-param entries of the rules tree.
 *
 * @return array the stored entries, or an empty array when none exist
 */
public function getCleanParam(): array {
    $this->buildTree();

    if (empty($this->tree[Directive::CLEAN_PARAM])) {
        $this->log(Directive::CLEAN_PARAM . ' directive: Not found');

        // the key may not exist at all: return an empty list instead of reading it
        return [];
    }

    return $this->tree[Directive::CLEAN_PARAM];
}
449 |
/**
 * Returns the raw content as provided by the reader.
 *
 * @return string
 * @deprecated
 */
public function getContent(): string {
    $raw = $this->reader->getContentRaw();

    return $raw;
}
456 |
/**
 * Kept for backwards compatibility only: always returns an empty list.
 *
 * @return array
 * @deprecated
 * @see RobotsTxtParser::getLogger()
 */
public function getLog(): array {
    $entries = [];

    return $entries;
}
465 |
/**
 * Render
 *
 * Serializes the parsed rules into robots.txt text: one "User-agent" group per
 * tree entry (keys sorted in reverse order), followed by the Host and Sitemap lines.
 *
 * @param string $eol line separator
 *
 * @return string
 */
public function render($eol = "\r\n") {
    $groups = $this->getRules();
    krsort($groups);

    $output = [];
    foreach ($groups as $userAgent => $rules) {
        $output[] = 'User-agent: ' . $userAgent;
        foreach ($rules as $directive => $value) {
            // Not multibyte
            $directive = ucfirst($directive);

            if (!is_array($value)) {
                $output[] = $directive . ': ' . $value;
                continue;
            }

            // Shorter paths later. The comparator has to return an integer:
            // a boolean result is deprecated and does not give a reliable order.
            usort($value, static function ($a, $b) {
                return mb_strlen($b) <=> mb_strlen($a);
            });
            foreach ($value as $subValue) {
                $output[] = $directive . ': ' . $subValue;
            }
        }
        $output[] = '';
    }

    $host = $this->getHost();
    if ($host !== null) {
        // getHost() without a user agent returns a list of hosts; emit one line per host
        $hosts = is_array($host) ? $host : [$host];
        array_walk_recursive($hosts, static function ($single) use (&$output) {
            $output[] = 'Host: ' . $single;
        });
    }

    foreach ($this->getSitemaps() as $sitemap) {
        $output[] = 'Sitemap: ' . $sitemap;
    }

    $output[] = '';

    return implode($eol, $output);
}
510 |
/**
 * Returns the rules of the group matching the given user agent.
 *
 * @param string|null $userAgent null returns the whole tree
 *
 * @return array the matching group, the "*" group as a fallback, or an empty array
 */
public function getRules(?string $userAgent = null) {
    $this->buildTree();

    // no user agent requested: hand out everything
    if (is_null($userAgent)) {
        return $this->tree;
    }

    $matched = $this->userAgentMatcher->getMatching($userAgent, array_keys($this->tree));

    // direct match
    if (isset($this->tree[$matched])) {
        return $this->tree[$matched];
    }

    // without a wildcard group there is nothing left to fall back to
    if (!isset($this->tree['*'])) {
        $this->log(sprintf("Rules not found for the given User-Agent '%s'", $matched));

        return [];
    }

    $this->log(sprintf("No direct match found for '%s', fallback to *", $matched));

    return $this->tree['*'];
}
536 |
/**
 * Returns the Host directive value(s).
 *
 * With a user agent, the value stored for the matching group is returned.
 * Without one, the values of all groups are collected into a list.
 *
 * @param ?string $userAgent
 *
 * @note NULL is returned to public API compatibility reasons. Will be removed in the future.
 *
 * @return string[]|string|null
 */
public function getHost(?string $userAgent = null) {
    $this->buildTree();

    if ($userAgent !== null) {
        $matched = $this->userAgentMatcher->getMatching($userAgent, array_keys($this->tree));

        return !empty($this->tree[$matched][Directive::HOST])
            ? $this->tree[$matched][Directive::HOST]
            : null;
    }

    $hosts = [];
    foreach ($this->tree as $group) {
        if (!empty($group[Directive::HOST])) {
            $hosts[] = $group[Directive::HOST];
        }
    }

    return $hosts === [] ? null : $hosts;
}
567 |
/**
 * Returns the Sitemap directive values.
 *
 * @param string|null $userAgent limit the result to the matching group; null merges all groups
 *
 * @return array
 */
public function getSitemaps(?string $userAgent = null): array {
    $this->buildTree();

    if ($userAgent === null) {
        $maps = [];
        foreach ($this->tree as $group) {
            if (!empty($group[Directive::SITEMAP])) {
                $maps = array_merge($maps, $group[Directive::SITEMAP]);
            }
        }

        return $maps;
    }

    $matched = $this->userAgentMatcher->getMatching($userAgent, array_keys($this->tree));
    if (!empty($this->tree[$matched][Directive::SITEMAP])) {
        return $this->tree[$matched][Directive::SITEMAP];
    }

    return [];
}
588 | }
589 |
--------------------------------------------------------------------------------
/source/Stream/CustomFilterInterface.php:
--------------------------------------------------------------------------------
1 | datalen for each $bucket.
20 | * @param bool $closing
21 | * If the stream is in the process of closing (and therefore this is the last pass
22 | * through the filterchain), the closing parameter will be set to TRUE.
23 | *
24 | * @return int
25 | * The filter() method must return one of three values upon completion.
26 | * - PSFS_PASS_ON: Filter processed successfully with data available in the out
27 | * bucket brigade.
28 | * - PSFS_FEED_ME: Filter processed successfully, however no data was available to
29 | * return. More data is required from the stream or prior filter.
30 | * - PSFS_ERR_FATAL (default): The filter experienced an unrecoverable error and
31 | * cannot continue.
32 | */
33 | public function filter($in, $out, &$consumed, $closing);
34 |
35 | /**
36 | * Called when creating the filter.
37 | *
38 | * @return bool
39 | * Your implementation of this method should return FALSE on failure, or TRUE on success.
40 | */
41 | public function onCreate();
42 |
43 | /**
44 | * Called when closing the filter.
45 | */
46 | public function onClose();
47 | }
48 |
--------------------------------------------------------------------------------
/source/Stream/Filters/SkipCommentedLinesFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace('/^#.*/mui', '', $bucket->data, -1, $replacedCount);
18 | $consumed += $bucket->datalen;
19 | stream_bucket_append($out, $bucket);
20 |
21 | if ($replacedCount > 0
22 | && isset($this->params['logger'])
23 | && $this->params['logger'] instanceof LoggerInterface
24 | ) {
25 | $this->params['logger']->debug($replacedCount . ' lines skipped as commented out');
26 | }
27 | }
28 |
29 | return PSFS_PASS_ON;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/source/Stream/Filters/SkipDirectivesWithInvalidValuesFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace(Directive::getRequestRateRegex(), '', $bucket->data, -1, $skippedRequestRateValues);
25 | $bucket->data = preg_replace(Directive::getCrawlDelayRegex(), '', $bucket->data, -1, $skippedCrawlDelayValues);
26 | // $bucket->data = preg_replace(Directive::getAllowDisallowRegex(), '', $bucket->data, -1, $skippedAllowanceValues);
27 |
28 | $consumed += $bucket->datalen;
29 | stream_bucket_append($out, $bucket);
30 |
31 | if (isset($this->params['logger']) && $this->params['logger'] instanceof LoggerInterface) {
32 | if ($skippedRequestRateValues > 0) {
33 | $this->params['logger']->debug($skippedRequestRateValues . ' char(s) dropped as invalid Request-rate value.');
34 | }
35 | if ($skippedCrawlDelayValues > 0) {
36 | $this->params['logger']->debug($skippedCrawlDelayValues . ' char(s) dropped as invalid Crawl-delay value.');
37 | }
38 | if ($skippedAllowanceValues > 0) {
39 | $this->params['logger']->debug($skippedAllowanceValues . ' char(s) dropped as invalid allow/disallow value.');
40 | }
41 | }
42 | }
43 |
44 | return PSFS_PASS_ON;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/source/Stream/Filters/SkipEmptyLinesFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace(
18 | '/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/mui',
19 | PHP_EOL,
20 | $bucket->data, -1,
21 | $replacedCount
22 | );
23 |
24 | $consumed += $bucket->datalen;
25 | stream_bucket_append($out, $bucket);
26 |
27 | if ($replacedCount > 0
28 | && isset($this->params['logger'])
29 | && $this->params['logger'] instanceof LoggerInterface
30 | ) {
31 | $this->params['logger']->debug($replacedCount . ' lines skipped as empty.');
32 | }
33 | }
34 |
35 | return PSFS_PASS_ON;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/source/Stream/Filters/SkipEndOfCommentedLineFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace('/\s*#.*/mui', '', $bucket->data, -1, $replacedCount);
18 | $consumed += $bucket->datalen;
19 | stream_bucket_append($out, $bucket);
20 |
21 | if ($replacedCount > 0
22 | && isset($this->params['logger'])
23 | && $this->params['logger'] instanceof LoggerInterface
24 | ) {
25 | $this->params['logger']->debug($replacedCount . ' char(s) dropped as commented out');
26 | }
27 | }
28 |
29 | return PSFS_PASS_ON;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/source/Stream/Filters/SkipUnsupportedDirectivesFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace(Directive::getRegex(), '', $bucket->data, -1, $replacedCount);
19 | $consumed += $bucket->datalen;
20 | stream_bucket_append($out, $bucket);
21 |
22 | if ($replacedCount > 0
23 | && isset($this->params['logger'])
24 | && $this->params['logger'] instanceof LoggerInterface
25 | ) {
26 | $this->params['logger']->debug($replacedCount . ' lines skipped as un-supported');
27 | }
28 | }
29 |
30 | return PSFS_PASS_ON;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/source/Stream/Filters/TrimSpacesLeftFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace('/(^\s+)(?!\n$)/mui', '', $bucket->data);
16 | $consumed += $bucket->datalen;
17 | stream_bucket_append($out, $bucket);
18 | }
19 |
20 | return PSFS_PASS_ON;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/source/Stream/GeneratorBasedReader.php:
--------------------------------------------------------------------------------
1 | filters = [
30 | SkipCommentedLinesFilter::class => false,
31 | SkipEndOfCommentedLineFilter::class => false,
32 | TrimSpacesLeftFilter::class => false,
33 | SkipUnsupportedDirectivesFilter::class => false,
34 | SkipDirectivesWithInvalidValuesFilter::class => false,
35 | SkipEmptyLinesFilter::class => false,
36 | ];
37 | }
38 |
/**
 * Detaches every stream filter that is still a resource and closes the stream.
 * A failure to remove a filter is logged and does not stop the cleanup.
 *
 * @link https://www.php.net/manual/en/function.stream-filter-append.php#84637
 */
public function __destruct() {
    foreach ($this->filters as $class => $instance) {
        if (!is_resource($instance)) {
            continue;
        }

        try {
            stream_filter_remove($instance);
        } catch (\Throwable $throwable) {
            $context = [
                'class' => $class,
                'message' => $throwable->getMessage(),
            ];
            $this->log('Failed to remove filter "{class}": {message}', $context);
        }
    }

    if (is_resource($this->stream)) {
        fclose($this->stream);
    }
}
60 |
/**
 * Creates a reader from a string by copying it into a temporary stream.
 *
 * @param string $input
 *
 * @return static
 * @throws \RuntimeException when no temporary stream can be created
 */
public static function fromString(string $input = ''): self {
    $reader = new GeneratorBasedReader();

    $stream = tmpfile();
    if ($stream === false) {
        // tmpfile() fails when the temp directory is not writable; php://temp needs no such directory up front
        $stream = fopen('php://temp', 'r+');
    }
    if ($stream === false) {
        throw new \RuntimeException('Unable to create a temporary stream for the given content.');
    }

    fwrite($stream, $input);
    fseek($stream, 0);

    $reader->log(WarmingMessages::STRING_INIT_DEPRECATE);

    return $reader->setStream($stream);
}
77 |
/**
 * Creates a reader on top of an already opened stream; the stream is rewound first.
 *
 * @param resource $stream
 *
 * @return self
 * @throws \InvalidArgumentException when the argument is not a resource
 */
public static function fromStream($stream): self {
    if (is_resource($stream)) {
        $reader = new GeneratorBasedReader();
        rewind($stream);

        return $reader->setStream($stream);
    }

    $error = sprintf('Argument must be a valid resource type. %s given.', gettype($stream));

    throw new \InvalidArgumentException($error);
}
89 |
/**
 * Stores the stream and attaches every configured filter class to its read chain.
 * The filter resources replace the placeholders kept in $this->filters.
 *
 * @param resource $stream
 *
 * @return GeneratorBasedReader
 */
protected function setStream($stream): GeneratorBasedReader {
    $this->stream = $stream;

    foreach (array_keys($this->filters) as $filterClass) {
        stream_filter_register($filterClass::NAME, $filterClass);

        // pass logger to filters
        $params = ['logger' => $this->logger];
        $this->filters[$filterClass] = stream_filter_append(
            $this->stream,
            $filterClass::NAME,
            STREAM_FILTER_READ,
            $params
        );
    }

    return $this;
}
105 |
/**
 * Converts the stream content from the given encoding to UTF-8 by prepending
 * an iconv filter to the read chain. UTF-8 needs no converter.
 *
 * Calling the method again replaces the previous converter instead of stacking
 * a second one, which would convert the already converted content once more.
 *
 * @param string $encoding
 *
 * @TODO check on composer install if we have filters available
 */
public function setEncoding(string $encoding) {
    // drop the converter attached by an earlier call, if any
    if (isset($this->filters['iconv']) && is_resource($this->filters['iconv'])) {
        stream_filter_remove($this->filters['iconv']);
        $this->filters['iconv'] = false;
    }

    if (strtoupper($encoding) === RobotsTxtParser::DEFAULT_ENCODING) {
        return;
    }

    $this->log(WarmingMessages::ENCODING_NOT_UTF8, [], LogLevel::WARNING);

    $filterName = 'convert.iconv.' . $encoding . '/utf-8';
    $this->log('Adding encoding filter ' . $filterName);

    // convert encoding
    $filter = stream_filter_prepend($this->stream, $filterName, STREAM_FILTER_READ);
    if ($filter === false) {
        // unknown encoding or missing iconv support: the content is read unconverted
        $this->log('Unable to add encoding filter ' . $filterName, [], LogLevel::WARNING);
    }

    $this->filters['iconv'] = $filter;
}
124 |
/**
 * Yields the stream content line by line, starting from the beginning of the stream.
 *
 * @return \Generator
 */
public function getContentIterated(): \Generator {
    rewind($this->stream);

    while (!feof($this->stream)) {
        $line = fgets($this->stream);

        // fgets() reports "nothing read" with false; such reads are not yielded
        if ($line === false) {
            continue;
        }

        yield $line;
    }
}
136 |
/**
 * Returns the whole stream content, read from the beginning of the stream.
 *
 * @return string the content, or an empty string when the stream cannot be read
 */
public function getContentRaw(): string {
    rewind($this->stream);

    $content = stream_get_contents($this->stream);

    // stream_get_contents() returns false on failure, which is not a valid string return value
    return $content === false ? '' : $content;
}
141 | }
142 |
--------------------------------------------------------------------------------
/source/Stream/ReaderInterface.php:
--------------------------------------------------------------------------------
1 | parser = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/allow-spec.txt', 'r'));
17 | }
18 |
/**
 * Drops the parser created for the test.
 */
public function tearDown(): void {
    $this->parser = null;
}
22 |
/**
 * crawlerZ: the root is allowed, /forum and /public are disallowed.
 */
public function testForCrawlerZ() {
    $agent = 'crawlerZ';
    $blocked = ['/forum', '/public'];

    $this->assertTrue($this->parser->isAllowed('/', $agent));
    foreach ($blocked as $url) {
        $this->assertTrue($this->parser->isDisallowed($url, $agent));
    }

    $this->assertFalse($this->parser->isDisallowed('/', $agent));
    foreach ($blocked as $url) {
        $this->assertFalse($this->parser->isAllowed($url, $agent));
    }
}
31 |
/**
 * Default user agent: / and /article are allowed, /temp and the admin paths are disallowed.
 */
public function testForDefaultUserAgent() {
    foreach (['/', '/article'] as $url) {
        $this->assertTrue($this->parser->isAllowed($url));
    }

    foreach (['/temp', '/Admin', '/admin', '/admin/cp/test/'] as $url) {
        $this->assertTrue($this->parser->isDisallowed($url));
    }

    foreach (['/', '/article'] as $url) {
        $this->assertFalse($this->parser->isDisallowed($url));
    }

    $this->assertFalse($this->parser->isAllowed('/temp'));
    // NOTE(review): repeats the /article check from the loop above — confirm whether another URL was intended
    $this->assertFalse($this->parser->isDisallowed('/article'));
}
44 |
/**
 * agentV: /foo is disallowed, /bar and /Foo are allowed.
 */
public function testForAgentV() {
    $agent = 'agentV';

    $this->assertTrue($this->parser->isDisallowed('/foo', $agent));
    foreach (['/bar', '/Foo'] as $url) {
        $this->assertTrue($this->parser->isAllowed($url, $agent));
    }
}
50 |
/**
 * agentW: /foo is disallowed, /bar and /Foo are allowed.
 */
public function testForAgentW() {
    $agent = 'agentW';

    $this->assertTrue($this->parser->isDisallowed('/foo', $agent));
    foreach (['/bar', '/Foo'] as $url) {
        $this->assertTrue($this->parser->isAllowed($url, $agent));
    }
}
56 |
/**
 * botY-test: only /forum/ itself is allowed, everything else listed is disallowed.
 */
public function testForBotY() {
    $agent = 'botY-test';

    // [method, url] pairs expected to return true
    $truthy = [
        ['isDisallowed', '/'],
        ['isDisallowed', '/forum'],
        ['isAllowed', '/forum/'],
        ['isDisallowed', '/forum/topic'],
        ['isDisallowed', '/public'],
    ];
    foreach ($truthy as [$check, $url]) {
        $this->assertTrue($this->parser->$check($url, $agent));
    }

    // [method, url] pairs expected to return false
    $falsy = [
        ['isAllowed', '/'],
        ['isAllowed', '/forum'],
        ['isDisallowed', '/forum/'],
        ['isAllowed', '/forum/topic'],
        ['isAllowed', '/public'],
    ];
    foreach ($falsy as [$check, $url]) {
        $this->assertFalse($this->parser->$check($url, $agent));
    }
}
69 |
/**
 * Checks that isAllowed() and isDisallowed() give opposite answers for spiderX/1.0.
 *
 * @param string $url
 * @param bool $isAllowed
 *
 * @dataProvider generateDataForSpiderX
 */
public function testForSpiderX(string $url, bool $isAllowed) {
    $agent = 'spiderX/1.0';

    // the method expected to confirm comes first, its counterpart second
    [$confirming, $denying] = $isAllowed
        ? ['isAllowed', 'isDisallowed']
        : ['isDisallowed', 'isAllowed'];

    $this->assertTrue($this->parser->$confirming($url, $agent));
    $this->assertFalse($this->parser->$denying($url, $agent));
}
85 |
/**
 * Data sets for testForSpiderX: [url, expected "is allowed" flag].
 *
 * @return array
 */
public function generateDataForSpiderX(): array {
    $expectations = [
        '/temp' => true,
        '/assets' => false,
        '/forum' => true,
    ];

    $dataSets = [];
    foreach ($expectations as $url => $isAllowed) {
        $dataSets[] = [$url, $isAllowed];
    }

    return $dataSets;
}
93 | }
94 |
--------------------------------------------------------------------------------
/test/AtSymbolTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed("/peanuts"));
20 | $this->assertFalse($parser->isDisallowed("/peanuts"));
21 | $this->assertFalse($parser->isAllowed("/url_containing_@_symbol"));
22 | $this->assertTrue($parser->isDisallowed("/url_containing_@_symbol"));
23 | }
24 |
25 | /**
26 | * Generate test case data
27 | * @return array
28 | */
29 | public function generateDataForTest()
30 | {
31 | return array(
32 | array("
33 | User-Agent: *
34 | Disallow: /url_containing_@_symbol
35 | Allow: /peanuts
36 | ")
37 | );
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/test/CommentsTest.php:
--------------------------------------------------------------------------------
1 | getRules('*');
20 | $this->assertEmpty($rules, 'expected remove comments');
21 | }
22 |
/**
 * A comment trailing a directive value must be stripped from the value.
 *
 * @dataProvider generateDataFor2Test
 * @param string $robotsTxtContent
 * @param string $expectedDisallowValue
 */
public function testRemoveCommentsFromValue($robotsTxtContent, $expectedDisallowValue)
{
    $parser = new RobotsTxtParser($robotsTxtContent);
    $rules = $parser->getRules('*');

    $this->assertNotEmpty($rules, 'expected data');
    $this->assertArrayHasKey(Directive::DISALLOW, $rules);

    $disallowed = $rules[Directive::DISALLOW];
    $this->assertNotEmpty($disallowed, 'disallow expected');
    $this->assertEquals($expectedDisallowValue, $disallowed[0]);
}
36 |
37 | /**
38 | * Generate test case data
39 | * @return array
40 | */
41 | public function generateDataForTest()
42 | {
43 | return array(
44 | array("
45 | User-agent: *
46 | #Disallow: /tech
47 | "),
48 | array("
49 | User-agent: *
50 | Disallow: #/tech
51 | "),
52 | array("
53 | User-agent: *
54 | Disal # low: /tech
55 | "),
56 | array("
57 | User-agent: *
58 | Disallow#: /tech # ds
59 | "),
60 | );
61 | }
62 |
63 | /**
64 | * Generate test case data
65 | * @return array
66 | */
67 | public function generateDataFor2Test()
68 | {
69 | return array(
70 | array(
71 | "User-agent: *
72 | Disallow: /tech #comment",
73 | 'disallowValue' => '/tech',
74 | ),
75 | );
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/test/Directives/CacheDelayTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/../Fixtures/cache-delay-spec.txt', 'r'));
24 | $this->parser->setLogger($log);
25 | }
26 |
/**
 * Drops the parser created for the test.
 */
public function tearDown(): void {
    $this->parser = null;
}
30 |
/**
 * Cache-delay values are returned per user agent.
 */
public function testCacheDelayForExistingUserAgents() {
    $expectedDelays = [
        '*' => 0.5,
        'GoogleBot' => 3.7,
        'AhrefsBot' => 8,
    ];

    foreach ($expectedDelays as $agent => $delay) {
        $this->assertEquals($delay, $this->parser->getDelay($agent, Directive::CACHE_DELAY));
    }
}
36 |
37 | public function testCacheDelayFallsBackForNonStandardCacheDirective() {
38 | foreach (['*' => 0.5, 'GoogleBot' => 3.7, 'AhrefsBot' => 8] as $userAgent => $expectedDelay) {
39 | self::assertEquals($expectedDelay, $this->parser->getDelay($userAgent, Directive::CACHE));
40 | }
41 | }
42 |
43 | public function testCacheDelayFallsBackToCrawlDelayIfNotSpecified() {
44 | self::assertEquals(1.5, $this->parser->getDelay('Yandex', Directive::CACHE));
45 |
46 | /** @var TestHandler $logHandler */
47 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
48 |
49 | $expectedMessage = 'cache-delay directive (unofficial): Not found, fallback to crawl-delay directive';
50 | $wasLogged = $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG);
51 |
52 | self::assertTrue($wasLogged);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/test/Directives/CleanParamTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
21 |
22 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/../Fixtures/with-clean-param.txt', 'r'));
23 | $this->parser->setLogger($log);
24 | }
25 |
26 | public function tearDown(): void {
27 | $this->parser = null; // release the parser reference held by this test case
28 | }
29 |
30 | public function testCleanParam() {
31 | foreach ([
32 | '/forum/showthread.php' => ['abc'],
33 | '/forum/*.php' => ['sid', 'sort'],
34 | '/*' => ['someTrash', 'otherTrash'],
35 | ] as $path => $expectedParams) {
36 | self::assertArrayHasKey($path, $this->parser->getCleanParam());
37 | self::assertEquals($expectedParams, $this->parser->getCleanParam()[$path]);
38 | }
39 | }
40 |
41 | public function testCleanParamsAppliedForAllowDisallow() {
42 | self::markTestIncomplete('@TODO this needs to be finished yet.'); // stops the test here; the checks below are still a draft
43 |
44 | self::assertTrue($this->parser->isDisallowed("http://www.site1.com/forums/showthread.php?s=681498b9648949605&ref=parent"));
45 | self::assertFalse($this->parser->isAllowed("http://www.site1.com/forums/showthread.php?s=681498b9648949605&ref=parent"));
46 |
47 | /** @var TestHandler $logHandler */
48 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
49 |
50 | self::assertTrue(
51 | $logHandler->hasRecord('Rule match: clean-param directive', LogLevel::DEBUG),
52 | stringifyLogs($logHandler->getRecords())
53 | );
54 |
55 | self::assertTrue($this->parser->isAllowed("http://www.site2.com/forums/showthread.php?s=681498b9648949605"));
56 | self::assertFalse($this->parser->isDisallowed("http://www.site2.com/forums/showthread.php?s=681498b9648949605"));
57 |
58 | self::assertTrue(
59 | $logHandler->hasRecord('Rule match: Path', LogLevel::DEBUG),
60 | stringifyLogs($logHandler->getRecords())
61 | );
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/test/Directives/CrawlDelayTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/../Fixtures/crawl-delay-spec.txt', 'r'));
24 | $this->parser->setLogger($log);
25 | }
26 |
27 | public function tearDown(): void {
28 | $this->parser = null; // release the parser reference held by this test case
29 | }
30 |
31 | public function testCrawlDelayForExactUserAgent() {
32 | self::assertEquals(0.9, $this->parser->getDelay('GoogleBot'));
33 | self::assertEquals(1.5, $this->parser->getDelay('AhrefsBot'));
34 | }
35 |
36 | public function testCrawlDelayWithNoUserAgent() {
37 | self::assertEquals(0, $this->parser->getDelay());
38 | }
39 |
40 | public function testCrawlDelayLogsFallbackToCrawlDelay() {
41 | self::assertEquals(0.9, $this->parser->getDelay('GoogleBot', Directive::CACHE_DELAY));
42 |
43 | /** @var TestHandler $logHandler */
44 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
45 |
46 | $expectedMessage = 'cache-delay directive (unofficial): Not found, fallback to crawl-delay directive';
47 | $wasLogged = $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG);
48 |
49 | self::assertTrue($wasLogged);
50 | }
51 |
52 | public function testCrawlDelayLogsFallbackForMissingUserAgent() {
53 | self::assertEquals(0, $this->parser->getDelay('YandexBot', Directive::CACHE_DELAY));
54 |
55 | /** @var TestHandler $logHandler */
56 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
57 |
58 | $expectedMessage = 'cache-delay directive: Not found';
59 | $wasLogged = $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG);
60 |
61 | self::assertTrue($wasLogged);
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/test/Directives/HostTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
21 |
22 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/../Fixtures/with-hosts.txt', 'r'));
23 | $this->parser->setLogger($log);
24 | }
25 |
26 | public function tearDown(): void {
27 | $this->parser = null; // release the parser reference held by this test case
28 | }
29 |
30 | public function testGetAllHosts() {
31 | $hosts = $this->parser->getHost();
32 | self::assertContains('myhost.ru', $hosts);
33 | self::assertContains('www.myhost.ru', $hosts);
34 | }
35 |
36 | public function testHostForSomeUserAgent() {
37 | $resolvedHost = $this->parser->getHost('Yandex');
38 | self::assertEquals('www.myhost.ru', $resolvedHost);
39 | }
40 |
41 | public function testHostForSomeUserAgentFallsBackToDefault() {
42 | $resolvedHost = $this->parser->getHost('Google');
43 | self::assertEquals('myhost.ru', $resolvedHost);
44 |
45 | /** @var TestHandler $logHandler */
46 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
47 |
48 | $fallbackLogged = $logHandler->hasRecord("Failed to match user agent 'Google', falling back to '*'", LogLevel::DEBUG);
49 | $logDump = stringifyLogs($logHandler->getRecords());
50 |
51 | self::assertTrue($fallbackLogged, $logDump);
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/test/Directives/SitemapsTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
21 |
22 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/../Fixtures/with-sitemaps.txt', 'r'));
23 | $this->parser->setLogger($log);
24 | }
25 |
26 | public function tearDown(): void {
27 | $this->parser = null; // release the parser reference held by this test case
28 | }
29 |
30 | public function testRemoveDuplicateSitemaps() {
31 | $sitemaps = $this->parser->getSitemaps();
32 |
33 | self::assertCount(5, $sitemaps);
34 | self::assertContains('http://example.com/sitemap.xml?year=2015', $sitemaps);
35 | self::assertContains('http://somesite.com/sitemap-for-all.xml', $sitemaps);
36 | self::assertContains('http://internet.com/sitemap-for-google-bot.xml', $sitemaps);
37 | self::assertContains('http://worldwideweb.com/sitemap-yahoo.xml', $sitemaps);
38 | self::assertContains('http://example.com/sitemap-yahoo.xml?year=2016', $sitemaps);
39 | }
40 |
41 | public function testGetSitemapForExactUserAgent() {
42 | $sitemaps = $this->parser->getSitemaps('Yahoo');
43 |
44 | self::assertCount(2, $sitemaps);
45 | self::assertContains('http://worldwideweb.com/sitemap-yahoo.xml', $sitemaps);
46 | self::assertContains('http://example.com/sitemap-yahoo.xml?year=2016', $sitemaps);
47 | }
48 |
49 | public function testGetSitemapFallsBackToDefault() {
50 | $sitemaps = $this->parser->getSitemaps('Yandex');
51 |
52 | self::assertCount(2, $sitemaps);
53 | self::assertContains('http://somesite.com/sitemap-for-all.xml', $sitemaps);
54 | self::assertContains('http://example.com/sitemap.xml?year=2015', $sitemaps);
55 |
56 | /** @var TestHandler $logHandler */
57 | $logHandler = $this->parser->getLogger()->getHandlers()[0];
58 |
59 | $fallbackLogged = $logHandler->hasRecord("Failed to match user agent 'Yandex', falling back to '*'", LogLevel::DEBUG);
60 | $logDump = stringifyLogs($logHandler->getRecords());
61 |
62 | self::assertTrue($fallbackLogged, $logDump);
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/test/DisallowAllTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isDisallowed("/index"));
15 | $this->assertFalse($parser->isAllowed("/index"));
16 | }
17 |
18 | public function testAllowWildcard() {
19 | $robotsTxt = new RobotsTxtParser(file_get_contents(__DIR__ . '/Fixtures/allow-all.txt'));
20 | self::assertFalse($robotsTxt->isDisallowed("/index"));
21 | self::assertFalse($robotsTxt->isDisallowed("/"));
22 | self::assertTrue($robotsTxt->isAllowed("/index"));
23 | self::assertTrue($robotsTxt->isAllowed("/"));
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/test/DisallowUppercasePathTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isDisallowed("/Admin"));
22 | $this->assertFalse($parser->isAllowed("/Admin"));
23 | }
24 |
25 | /**
26 | * Data provider: a robots.txt body whose Disallow rule targets the capitalised path /Admin.
27 | * @return array<int, array{0: string}> one data set holding the robots.txt content
28 | */
29 | public static function generateDataForTest(): array {
30 | return [
31 | [
32 | "
33 | User-agent: *
34 | Disallow : /Admin
35 | "
36 | ]
37 | ];
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/test/EmptyRulesShouldAllowEverythingTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed('/foo'));
20 | $this->assertFalse($parser->isDisallowed('/foo'));
21 | $this->assertNull($parser->getHost());
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/test/EncodingTest.php:
--------------------------------------------------------------------------------
1 | logger = new Logger(static::class);
17 | $this->logger->pushHandler(new TestHandler(LogLevel::DEBUG));
18 | }
19 |
20 | public function testLogsNonStandardEncoding() {
21 | $robotsTxt = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/market-yandex-Windows-1251.txt', 'r'), 'Windows-1251');
22 | $robotsTxt->setLogger($this->logger);
23 | $robotsTxt->getRules();
24 |
25 | /** @var TestHandler $logHandler */
26 | $logHandler = $robotsTxt->getLogger()->getHandlers()[0];
27 |
28 | self::assertTrue(
29 | $logHandler->hasRecord(WarmingMessages::ENCODING_NOT_UTF8, LogLevel::WARNING),
30 | stringifyLogs($logHandler->getRecords())
31 | );
32 |
33 | self::assertTrue(
34 | $logHandler->hasRecord('Adding encoding filter convert.iconv.Windows-1251/utf-8', LogLevel::DEBUG),
35 | stringifyLogs($logHandler->getRecords())
36 | );
37 | }
38 |
39 | public function testWindows1251Readable() {
40 | $robotsTxt = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/market-yandex-Windows-1251.txt', 'r'), 'Windows-1251');
41 | $robotsTxt->setLogger($this->logger);
42 |
43 | $parsedRules = $robotsTxt->getRules();
44 | self::assertCount(5, $parsedRules, json_encode(array_keys($parsedRules)));
45 | }
46 |
47 | public function testShouldNotChangeInternalEncoding() {
48 | self::assertEquals('UTF-8', mb_internal_encoding());
49 | $robotsTxt = new RobotsTxtParser('', 'iso-8859-1'); // stays referenced until the method returns
50 | self::assertEquals('UTF-8', mb_internal_encoding());
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/test/EndAnchorTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed($path));
25 | $this->assertFalse($parser->isDisallowed($path));
26 | } else {
27 | $this->assertTrue($parser->isDisallowed($path));
28 | $this->assertFalse($parser->isAllowed($path));
29 | }
30 | }
31 |
32 | /**
33 | * Data provider for rules that use the '$' end anchor.
34 | * @return array<int, array{0: string, 1: string, 2: bool}> [tested path, robots.txt content, true when allowed]
35 | */
36 | public static function generateDataForTest(): array {
37 | // Data provider defined in format:
38 | // [tested path, robotsTxtContent, true when allowed / false when disallowed]
39 | return [
40 | [
41 | "/",
42 | "
43 | User-Agent: *
44 | Disallow: /*
45 | Allow: /$
46 | ",
47 | true,
48 | ],
49 | [
50 | "/asd",
51 | "
52 | User-Agent: *
53 | Disallow: /*
54 | Allow: /$
55 | ",
56 | false,
57 | ],
58 | [
59 | "/asd/",
60 | "
61 | User-Agent: *
62 | Disallow: /*
63 | Allow: /$
64 | ",
65 | false,
66 | ],
67 | [
68 | "/deny_all/",
69 | "
70 | User-Agent: *
71 | Disallow: *deny_all/$
72 | ",
73 | /**
74 | * @see InvalidPathTest for details why this is changed
75 | */
76 | true,
77 | ],
78 | [
79 | "/deny_all/",
80 | "
81 | User-Agent: *
82 | Disallow: /deny_all/$
83 | ",
84 | false,
85 | ],
86 | [
87 | "/deny_all/",
88 | "
89 | User-Agent: *
90 | Disallow: deny_all/$
91 | ",
92 | true,
93 | ],
94 | ];
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/test/Fixtures/allow-all.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Allow: /
3 |
--------------------------------------------------------------------------------
/test/Fixtures/allow-spec.txt:
--------------------------------------------------------------------------------
1 | User-agent: anyone
2 | User-agent: *
3 | Disallow: /admin
4 | Disallow: /admin
5 | Disallow: /Admin
6 | Disallow: /temp#comment
7 | Disallow: /forum
8 | Disallow: /admin/cp/test/
9 |
10 | User-agent: agentU/2.0
11 | Disallow: /bar
12 | Allow: /foo
13 |
14 | User-agent: agentV
15 | User-agent: agentW
16 | Disallow: /foo
17 | Allow: /bar #comment
18 |
19 | User-agent: spiderX
20 | Disallow:
21 | Disallow: /admin#
22 | Disallow: /assets
23 |
24 | User-agent: botY
25 | Disallow: /
26 | Allow: &&/1@| #invalid
27 | Allow: /forum/$
28 | Allow: /article
29 |
30 | User-agent: crawlerZ
31 | Disallow:
32 | Disallow: /
33 | Allow: /$
34 |
--------------------------------------------------------------------------------
/test/Fixtures/cache-delay-spec.txt:
--------------------------------------------------------------------------------
1 | User-Agent: *
2 | Crawl-Delay: 0.5
3 |
4 | User-Agent: GoogleBot
5 | Cache-Delay: 3.7
6 |
7 | User-Agent: AhrefsBot
8 | Cache-Delay: 8
9 |
10 | User-Agent: Yandex
11 | Crawl-Delay: 1.5
12 |
--------------------------------------------------------------------------------
/test/Fixtures/crawl-delay-spec.txt:
--------------------------------------------------------------------------------
1 | User-Agent: GoogleBot
2 | Crawl-Delay: 0.9
3 |
4 | User-Agent: AhrefsBot
5 | Crawl-Delay: 1.5
6 |
--------------------------------------------------------------------------------
/test/Fixtures/disallow-all.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /
3 |
--------------------------------------------------------------------------------
/test/Fixtures/expected-skipped-lines-log.php:
--------------------------------------------------------------------------------
1 |
432 | #
433 | # Localisable part of robots.txt for en.wikipedia.org
434 | #
435 | # Edit at https://en.wikipedia.org/w/index.php?title=MediaWiki:Robots.txt&action=edit
436 | # Don't add newlines here. All rules set here are active for every user-agent.
437 | #
438 | # Please check any changes using a syntax validator
439 | # Enter https://en.wikipedia.org/robots.txt as the URL to check.
440 | #
441 | # https://phabricator.wikimedia.org/T16075
442 | Disallow: /wiki/MediaWiki:Spam-blacklist
443 | Disallow: /wiki/MediaWiki%3ASpam-blacklist
444 | Disallow: /wiki/MediaWiki_talk:Spam-blacklist
445 | Disallow: /wiki/MediaWiki_talk%3ASpam-blacklist
446 | Disallow: /wiki/Wikipedia:WikiProject_Spam
447 | Disallow: /wiki/Wikipedia_talk:WikiProject_Spam
448 | #
449 | # Folks get annoyed when XfD discussions end up the number 1 google hit for
450 | # their name.
451 | # https://phabricator.wikimedia.org/T16075
452 | Disallow: /wiki/Wikipedia:Articles_for_deletion
453 | Disallow: /wiki/Wikipedia%3AArticles_for_deletion
454 | Disallow: /wiki/Wikipedia:Votes_for_deletion
455 | Disallow: /wiki/Wikipedia%3AVotes_for_deletion
456 | Disallow: /wiki/Wikipedia:Pages_for_deletion
457 | Disallow: /wiki/Wikipedia%3APages_for_deletion
458 | Disallow: /wiki/Wikipedia:Miscellany_for_deletion
459 | Disallow: /wiki/Wikipedia%3AMiscellany_for_deletion
460 | Disallow: /wiki/Wikipedia:Miscellaneous_deletion
461 | Disallow: /wiki/Wikipedia%3AMiscellaneous_deletion
462 | Disallow: /wiki/Wikipedia:Categories_for_discussion
463 | Disallow: /wiki/Wikipedia%3ACategories_for_discussion
464 | Disallow: /wiki/Wikipedia:Templates_for_deletion
465 | Disallow: /wiki/Wikipedia%3ATemplates_for_deletion
466 | Disallow: /wiki/Wikipedia:Redirects_for_discussion
467 | Disallow: /wiki/Wikipedia%3ARedirects_for_discussion
468 | Disallow: /wiki/Wikipedia:Deletion_review
469 | Disallow: /wiki/Wikipedia%3ADeletion_review
470 | Disallow: /wiki/Wikipedia:WikiProject_Deletion_sorting
471 | Disallow: /wiki/Wikipedia%3AWikiProject_Deletion_sorting
472 | Disallow: /wiki/Wikipedia:Files_for_deletion
473 | Disallow: /wiki/Wikipedia%3AFiles_for_deletion
474 | Disallow: /wiki/Wikipedia:Files_for_discussion
475 | Disallow: /wiki/Wikipedia%3AFiles_for_discussion
476 | Disallow: /wiki/Wikipedia:Possibly_unfree_files
477 | Disallow: /wiki/Wikipedia%3APossibly_unfree_files
478 | #
479 | # https://phabricator.wikimedia.org/T12288
480 | Disallow: /wiki/Wikipedia_talk:Articles_for_deletion
481 | Disallow: /wiki/Wikipedia_talk%3AArticles_for_deletion
482 | Disallow: /wiki/Wikipedia_talk:Votes_for_deletion
483 | Disallow: /wiki/Wikipedia_talk%3AVotes_for_deletion
484 | Disallow: /wiki/Wikipedia_talk:Pages_for_deletion
485 | Disallow: /wiki/Wikipedia_talk%3APages_for_deletion
486 | Disallow: /wiki/Wikipedia_talk:Miscellany_for_deletion
487 | Disallow: /wiki/Wikipedia_talk%3AMiscellany_for_deletion
488 | Disallow: /wiki/Wikipedia_talk:Miscellaneous_deletion
489 | Disallow: /wiki/Wikipedia_talk%3AMiscellaneous_deletion
490 | Disallow: /wiki/Wikipedia_talk:Templates_for_deletion
491 | Disallow: /wiki/Wikipedia_talk%3ATemplates_for_deletion
492 | Disallow: /wiki/Wikipedia_talk:Categories_for_discussion
493 | Disallow: /wiki/Wikipedia_talk%3ACategories_for_discussion
494 | Disallow: /wiki/Wikipedia_talk:Deletion_review
495 | Disallow: /wiki/Wikipedia_talk%3ADeletion_review
496 | Disallow: /wiki/Wikipedia_talk:WikiProject_Deletion_sorting
497 | Disallow: /wiki/Wikipedia_talk%3AWikiProject_Deletion_sorting
498 | Disallow: /wiki/Wikipedia_talk:Files_for_deletion
499 | Disallow: /wiki/Wikipedia_talk%3AFiles_for_deletion
500 | Disallow: /wiki/Wikipedia_talk:Files_for_discussion
501 | Disallow: /wiki/Wikipedia_talk%3AFiles_for_discussion
502 | Disallow: /wiki/Wikipedia_talk:Possibly_unfree_files
503 | Disallow: /wiki/Wikipedia_talk%3APossibly_unfree_files
504 | #
505 | Disallow: /wiki/Wikipedia:Copyright_problems
506 | Disallow: /wiki/Wikipedia%3ACopyright_problems
507 | Disallow: /wiki/Wikipedia_talk:Copyright_problems
508 | Disallow: /wiki/Wikipedia_talk%3ACopyright_problems
509 | Disallow: /wiki/Wikipedia:Suspected_copyright_violations
510 | Disallow: /wiki/Wikipedia%3ASuspected_copyright_violations
511 | Disallow: /wiki/Wikipedia_talk:Suspected_copyright_violations
512 | Disallow: /wiki/Wikipedia_talk%3ASuspected_copyright_violations
513 | Disallow: /wiki/Wikipedia:Contributor_copyright_investigations
514 | Disallow: /wiki/Wikipedia%3AContributor_copyright_investigations
515 | Disallow: /wiki/Wikipedia:Contributor_copyright_investigations
516 | Disallow: /wiki/Wikipedia%3AContributor_copyright_investigations
517 | Disallow: /wiki/Wikipedia_talk:Contributor_copyright_investigations
518 | Disallow: /wiki/Wikipedia_talk%3AContributor_copyright_investigations
519 | Disallow: /wiki/Wikipedia_talk:Contributor_copyright_investigations
520 | Disallow: /wiki/Wikipedia_talk%3AContributor_copyright_investigations
521 | Disallow: /wiki/Wikipedia:Protected_titles
522 | Disallow: /wiki/Wikipedia%3AProtected_titles
523 | Disallow: /wiki/Wikipedia_talk:Protected_titles
524 | Disallow: /wiki/Wikipedia_talk%3AProtected_titles
525 | Disallow: /wiki/Wikipedia:Articles_for_creation
526 | Disallow: /wiki/Wikipedia%3AArticles_for_creation
527 | Disallow: /wiki/Wikipedia_talk:Articles_for_creation
528 | Disallow: /wiki/Wikipedia_talk%3AArticles_for_creation
529 | Disallow: /wiki/Wikipedia_talk:Article_wizard
530 | Disallow: /wiki/Wikipedia_talk%3AArticle_wizard
531 | #
532 | # https://phabricator.wikimedia.org/T13261
533 | Disallow: /wiki/Wikipedia:Requests_for_arbitration
534 | Disallow: /wiki/Wikipedia%3ARequests_for_arbitration
535 | Disallow: /wiki/Wikipedia_talk:Requests_for_arbitration
536 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_arbitration
537 | Disallow: /wiki/Wikipedia:Requests_for_comment
538 | Disallow: /wiki/Wikipedia%3ARequests_for_comment
539 | Disallow: /wiki/Wikipedia_talk:Requests_for_comment
540 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_comment
541 | Disallow: /wiki/Wikipedia:Requests_for_adminship
542 | Disallow: /wiki/Wikipedia%3ARequests_for_adminship
543 | Disallow: /wiki/Wikipedia_talk:Requests_for_adminship
544 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_adminship
545 | #
546 | # https://phabricator.wikimedia.org/T14111
547 | Disallow: /wiki/Wikipedia:Requests_for_checkuser
548 | Disallow: /wiki/Wikipedia%3ARequests_for_checkuser
549 | Disallow: /wiki/Wikipedia_talk:Requests_for_checkuser
550 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_checkuser
551 | #
552 | # https://phabricator.wikimedia.org/T15398
553 | Disallow: /wiki/Wikipedia:WikiProject_Spam
554 | Disallow: /wiki/Wikipedia%3AWikiProject_Spam
555 | #
556 | # https://phabricator.wikimedia.org/T16793
557 | Disallow: /wiki/Wikipedia:Changing_username
558 | Disallow: /wiki/Wikipedia%3AChanging_username
559 | Disallow: /wiki/Wikipedia:Changing_username
560 | Disallow: /wiki/Wikipedia%3AChanging_username
561 | Disallow: /wiki/Wikipedia_talk:Changing_username
562 | Disallow: /wiki/Wikipedia_talk%3AChanging_username
563 | Disallow: /wiki/Wikipedia_talk:Changing_username
564 | Disallow: /wiki/Wikipedia_talk%3AChanging_username
565 | #
566 | Disallow: /wiki/Wikipedia:Administrators%27_noticeboard
567 | Disallow: /wiki/Wikipedia%3AAdministrators%27_noticeboard
568 | Disallow: /wiki/Wikipedia_talk:Administrators%27_noticeboard
569 | Disallow: /wiki/Wikipedia_talk%3AAdministrators%27_noticeboard
570 | Disallow: /wiki/Wikipedia:Community_sanction_noticeboard
571 | Disallow: /wiki/Wikipedia%3ACommunity_sanction_noticeboard
572 | Disallow: /wiki/Wikipedia_talk:Community_sanction_noticeboard
573 | Disallow: /wiki/Wikipedia_talk%3ACommunity_sanction_noticeboard
574 | Disallow: /wiki/Wikipedia:Bureaucrats%27_noticeboard
575 | Disallow: /wiki/Wikipedia%3ABureaucrats%27_noticeboard
576 | Disallow: /wiki/Wikipedia_talk:Bureaucrats%27_noticeboard
577 | Disallow: /wiki/Wikipedia_talk%3ABureaucrats%27_noticeboard
578 | #
579 | Disallow: /wiki/Wikipedia:Sockpuppet_investigations
580 | Disallow: /wiki/Wikipedia%3ASockpuppet_investigations
581 | Disallow: /wiki/Wikipedia_talk:Sockpuppet_investigations
582 | Disallow: /wiki/Wikipedia_talk%3ASockpuppet_investigations
583 | #
584 | Disallow: /wiki/Wikipedia:Neutral_point_of_view/Noticeboard
585 | Disallow: /wiki/Wikipedia%3ANeutral_point_of_view/Noticeboard
586 | Disallow: /wiki/Wikipedia_talk:Neutral_point_of_view/Noticeboard
587 | Disallow: /wiki/Wikipedia_talk%3ANeutral_point_of_view/Noticeboard
588 | #
589 | Disallow: /wiki/Wikipedia:No_original_research/noticeboard
590 | Disallow: /wiki/Wikipedia%3ANo_original_research/noticeboard
591 | Disallow: /wiki/Wikipedia_talk:No_original_research/noticeboard
592 | Disallow: /wiki/Wikipedia_talk%3ANo_original_research/noticeboard
593 | #
594 | Disallow: /wiki/Wikipedia:Fringe_theories/Noticeboard
595 | Disallow: /wiki/Wikipedia%3AFringe_theories/Noticeboard
596 | Disallow: /wiki/Wikipedia_talk:Fringe_theories/Noticeboard
597 | Disallow: /wiki/Wikipedia_talk%3AFringe_theories/Noticeboard
598 | #
599 | Disallow: /wiki/Wikipedia:Conflict_of_interest/Noticeboard
600 | Disallow: /wiki/Wikipedia%3AConflict_of_interest/Noticeboard
601 | Disallow: /wiki/Wikipedia_talk:Conflict_of_interest/Noticeboard
602 | Disallow: /wiki/Wikipedia_talk%3AConflict_of_interest/Noticeboard
603 | #
604 | Disallow: /wiki/Wikipedia:Long-term_abuse
605 | Disallow: /wiki/Wikipedia%3ALong-term_abuse
606 | Disallow: /wiki/Wikipedia_talk:Long-term_abuse
607 | Disallow: /wiki/Wikipedia_talk%3ALong-term_abuse
608 | Disallow: /wiki/Wikipedia:Long_term_abuse
609 | Disallow: /wiki/Wikipedia%3ALong_term_abuse
610 | Disallow: /wiki/Wikipedia_talk:Long_term_abuse
611 | Disallow: /wiki/Wikipedia_talk%3ALong_term_abuse
612 | #
613 | Disallow: /wiki/Wikipedia:Wikiquette_assistance
614 | Disallow: /wiki/Wikipedia%3AWikiquette_assistance
615 | #
616 | Disallow: /wiki/Wikipedia:Abuse_reports
617 | Disallow: /wiki/Wikipedia%3AAbuse_reports
618 | Disallow: /wiki/Wikipedia_talk:Abuse_reports
619 | Disallow: /wiki/Wikipedia_talk%3AAbuse_reports
620 | Disallow: /wiki/Wikipedia:Abuse_response
621 | Disallow: /wiki/Wikipedia%3AAbuse_response
622 | Disallow: /wiki/Wikipedia_talk:Abuse_response
623 | Disallow: /wiki/Wikipedia_talk%3AAbuse_response
624 | #
625 | Disallow: /wiki/Wikipedia:Reliable_sources/Noticeboard
626 | Disallow: /wiki/Wikipedia%3AReliable_sources/Noticeboard
627 | Disallow: /wiki/Wikipedia_talk:Reliable_sources/Noticeboard
628 | Disallow: /wiki/Wikipedia_talk%3AReliable_sources/Noticeboard
629 | #
630 | Disallow: /wiki/Wikipedia:Suspected_sock_puppets
631 | Disallow: /wiki/Wikipedia%3ASuspected_sock_puppets
632 | Disallow: /wiki/Wikipedia_talk:Suspected_sock_puppets
633 | Disallow: /wiki/Wikipedia_talk%3ASuspected_sock_puppets
634 | #
635 | Disallow: /wiki/Wikipedia:Biographies_of_living_persons/Noticeboard
636 | Disallow: /wiki/Wikipedia%3ABiographies_of_living_persons/Noticeboard
637 | Disallow: /wiki/Wikipedia_talk:Biographies_of_living_persons/Noticeboard
638 | Disallow: /wiki/Wikipedia_talk%3ABiographies_of_living_persons/Noticeboard
639 | Disallow: /wiki/Wikipedia:Biographies_of_living_persons%2FNoticeboard
640 | Disallow: /wiki/Wikipedia%3ABiographies_of_living_persons%2FNoticeboard
641 | Disallow: /wiki/Wikipedia_talk:Biographies_of_living_persons%2FNoticeboard
642 | Disallow: /wiki/Wikipedia_talk%3ABiographies_of_living_persons%2FNoticeboard
643 | #
644 | Disallow: /wiki/Wikipedia:Content_noticeboard
645 | Disallow: /wiki/Wikipedia%3AContent_noticeboard
646 | Disallow: /wiki/Wikipedia_talk:Content_noticeboard
647 | Disallow: /wiki/Wikipedia_talk%3AContent_noticeboard
648 | #
649 | Disallow: /wiki/Template:Editnotices
650 | Disallow: /wiki/Template%3AEditnotices
651 | #
652 | Disallow: /wiki/Wikipedia:Arbitration
653 | Disallow: /wiki/Wikipedia%3AArbitration
654 | Disallow: /wiki/Wikipedia_talk:Arbitration
655 | Disallow: /wiki/Wikipedia_talk%3AArbitration
656 | #
657 | Disallow: /wiki/Wikipedia:Arbitration_Committee
658 | Disallow: /wiki/Wikipedia%3AArbitration_Committee
659 | Disallow: /wiki/Wikipedia_talk:Arbitration_Committee
660 | Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee
661 | #
662 | Disallow: /wiki/Wikipedia:Arbitration_Committee_Elections
663 | Disallow: /wiki/Wikipedia%3AArbitration_Committee_Elections
664 | Disallow: /wiki/Wikipedia_talk:Arbitration_Committee_Elections
665 | Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee_Elections
666 | #
667 | Disallow: /wiki/Wikipedia:Mediation_Committee
668 | Disallow: /wiki/Wikipedia%3AMediation_Committee
669 | Disallow: /wiki/Wikipedia_talk:Mediation_Committee
670 | Disallow: /wiki/Wikipedia_talk%3AMediation_Committee
671 | #
672 | Disallow: /wiki/Wikipedia:Mediation_Cabal/Cases
673 | Disallow: /wiki/Wikipedia%3AMediation_Cabal/Cases
674 | #
675 | Disallow: /wiki/Wikipedia:Requests_for_bureaucratship
676 | Disallow: /wiki/Wikipedia%3ARequests_for_bureaucratship
677 | Disallow: /wiki/Wikipedia_talk:Requests_for_bureaucratship
678 | Disallow: /wiki/Wikipedia_talk%3ARequests_for_bureaucratship
679 | #
680 | Disallow: /wiki/Wikipedia:Administrator_review
681 | Disallow: /wiki/Wikipedia%3AAdministrator_review
682 | Disallow: /wiki/Wikipedia_talk:Administrator_review
683 | Disallow: /wiki/Wikipedia_talk%3AAdministrator_review
684 | #
685 | Disallow: /wiki/Wikipedia:Editor_review
686 | Disallow: /wiki/Wikipedia%3AEditor_review
687 | Disallow: /wiki/Wikipedia_talk:Editor_review
688 | Disallow: /wiki/Wikipedia_talk%3AEditor_review
689 | #
690 | Disallow: /wiki/Wikipedia:Article_Incubator
691 | Disallow: /wiki/Wikipedia%3AArticle_Incubator
692 | Disallow: /wiki/Wikipedia_talk:Article_Incubator
693 | Disallow: /wiki/Wikipedia_talk%3AArticle_Incubator
694 | #
695 | Disallow: /wiki/Category:Noindexed_pages
696 | Disallow: /wiki/Category%3ANoindexed_pages
697 | #
698 | # User sandboxes for modules and Template Styles are placed in these subpages for testing
699 | #
700 | Disallow: /wiki/Module:Sandbox
701 | Disallow: /wiki/Module%3ASandbox
702 | Disallow: /wiki/Template:TemplateStyles_sandbox
703 | Disallow: /wiki/Template%3ATemplateStyles_sandbox
704 | #
705 | #
706 |
--------------------------------------------------------------------------------
/test/Fixtures/with-clean-param.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /
3 | Clean-param: s&ref /forum*/sh*wthread.php
4 | Clean-param: abc /forum/showthread.php
5 | Clean-param: sid&sort /forum/*.php
6 | Clean-param: someTrash&otherTrash
7 |
--------------------------------------------------------------------------------
/test/Fixtures/with-commented-line-endings.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disal # low: /tech
3 |
4 | User-agent: google-bot
5 | Disallow: #/tech
6 |
7 | User-agent: yahoo-bot
8 | Disallow: /tech # ds
9 |
10 | User-agent: yandex-bot
11 | Disallow#: /tech # ds
12 |
13 | User-agent: *
14 | Disallow: /comment-after #comment
15 |
--------------------------------------------------------------------------------
/test/Fixtures/with-commented-lines.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | # Disallow: /tech
3 | # this is a commented line
4 | # it should not be in the iterator
5 | Allow: /some
6 |
--------------------------------------------------------------------------------
/test/Fixtures/with-empty-and-whitespace.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | User-Agent: GoogleBot
4 | Crawl-Delay: 0.9
5 | User-Agent: AhrefsBot
6 | Crawl-Delay: 1.5
7 |
--------------------------------------------------------------------------------
/test/Fixtures/with-empty-lines.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | # Disallow: /tech
3 |
4 |
5 |
6 | # this is a commented line
7 |
8 |
9 |
10 | # it should not be in the iterator
11 |
12 |
13 |
14 | Allow: /some
15 |
--------------------------------------------------------------------------------
/test/Fixtures/with-empty-rules.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | #Disallow: /tech
3 |
--------------------------------------------------------------------------------
/test/Fixtures/with-faulty-directives.txt:
--------------------------------------------------------------------------------
1 | User-agent: google1 #specifies the robots that the directives are set for
2 | Disallow: /bin/ # prohibits links from the Shopping Cart.
3 | Disallow: /search/ # prohibits page links of the search embedded on the site
4 | Disallow: /admin/ # prohibits links from the admin panel
5 | Disallow /admin/ # prohibits links from the admin panel
6 | Sitemap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot
7 | Clean-param: ref /some_dir/get_book.pl
8 |
9 | user-agent: google2 #specifies the robots that the directives are set for
10 | disallow: /bin/ # prohibits links from the Shopping Cart.
11 | disallow: /search/ # prohibits page links of the search embedded on the site
12 | disallow: /admin/ # prohibits links from the admin panel
13 | sitemap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot
14 | clean-param: ref /some_dir/get_book.pl
15 |
16 | user-Agent: google3 #specifies the robots that the directives are set for
17 | disaLLow: /bin/ # prohibits links from the Shopping Cart.
18 | diSallow: /search/ # prohibits page links of the search embedded on the site
19 | dis@llow: /admin/ # prohibits links from the admin panel
20 | sitEmap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot
21 | cleanParam: ref /some_dir/get_book.pl
22 |
23 | User#agent: google4 #specifies the robots that the directives are set for
24 | Disa#low: /bin/ # prohibits links from the Shopping Cart.
25 | Disa#low: /search/ # prohibits page links of the search embedded on the site
26 | Disa#low: /admin/ # prohibits links from the admin panel
27 | Site#ap: http://example.com/sitemap # specifies the path to the site's Sitemap file for the robot
28 | Clean#param: ref /some_dir/get_book.pl
29 |
--------------------------------------------------------------------------------
/test/Fixtures/with-hosts.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /cgi-bin
3 | Disallow: /
4 | Host: myhost.ru
5 |
6 | User-agent: Yandex
7 | Disallow: /cgi-bin
8 |
9 | # Examples of Host directives that will be ignored
10 | Host: www.myhost-.com
11 | Host: www.-myhost.com
12 | Host: www.myhost.com:100000
13 | Host: www.my_host.com
14 | Host: .my-host.com:8000
15 | Host: my-host.com.Host: my..host.com
16 | Host: www.myhost.com:8080/
17 | Host: 213.180.194.129
18 | Host: [2001:db8::1]
19 | Host: FE80::0202:B3FF:FE1E:8329
20 | Host: https://[2001:db8:0:1]:80
21 | Host: www.firsthost.ru,www.secondhost.com
22 | Host: www.firsthost.ru www.secondhost.com
23 |
24 | # Examples of valid Host directives
25 | Host: myhost.ru # uses this one
26 | Host: www.myhost.ru # is not used
27 |
--------------------------------------------------------------------------------
/test/Fixtures/with-invalid-request-rate.txt:
--------------------------------------------------------------------------------
1 | Useragent: GoogleBot
2 | Crawl-delay: 0.3 # valid
3 | Crawl-delay: 0.599 # valid
4 | Crawl-delay: 8888 # valid
5 | Crawl-delay: 8888
6 | Crawl-delay: ngfsngdndag
7 | Crawl-delay: ngfsn.gdndag # invalid
8 | Crawl-delay: 0.vfsbfsb # invalid
9 | Request-rate: 100/854000 # valid
10 | Request-rate: 100/bgdndgnd # invalid
11 | Request-rate: 15686 # invalid
12 | Request-rate: ngdndganda # invalid
13 |
--------------------------------------------------------------------------------
/test/Fixtures/with-sitemaps.txt:
--------------------------------------------------------------------------------
1 | Sitemap: http://example.com/sitemap.xml?year=2015
2 | Sitemap: http://example.com/sitemap.xml?year=2015
3 | Sitemap: http://example.com/sitemap.xml?year=2015
4 |
5 | User-agent: *
6 | Disallow: /admin/
7 | Sitemap: http://somesite.com/sitemap-for-all.xml
8 |
9 | User-agent: Googlebot
10 | Sitemap: http://internet.com/sitemap-for-google-bot.xml
11 |
12 | User-agent: Yahoo
13 | Sitemap: http://worldwideweb.com/sitemap-yahoo.xml
14 | Sitemap: http://example.com/sitemap-yahoo.xml?year=2016
15 |
--------------------------------------------------------------------------------
/test/HttpStatusCodeTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
16 |
17 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/allow-all.txt', 'r'));
18 | $this->parser->setLogger($log);
19 | }
20 |
/**
 * Drops the reference to the parser once a test has finished.
 */
public function tearDown(): void
{
    $this->parser = null;
}
24 |
/**
 * With HTTP status 200 the root path must be reported as allowed, and a
 * "Rule match: Path" debug record must have been logged.
 */
public function testHttpStatusCodeValid()
{
    $this->parser->setHttpStatusCode(200);

    $this->assertTrue($this->parser->isAllowed("/"));
    $this->assertFalse($this->parser->isDisallowed("/"));

    /** @var TestHandler $logHandler */
    $logHandler = $this->parser->getLogger()->getHandlers()[0];
    $ruleMatchLogged = $logHandler->hasRecord("Rule match: Path", LogLevel::DEBUG);

    // on failure, dump everything that was logged to ease debugging
    $this->assertTrue($ruleMatchLogged, stringifyLogs($logHandler->getRecords()));
}
38 |
/**
 * With HTTP status 503 the root path must be reported as disallowed, and a
 * "Disallowed by HTTP status code 503" debug record must have been logged.
 */
public function testHttpStatusCodeInvalid()
{
    $this->parser->setHttpStatusCode(503);

    $this->assertTrue($this->parser->isDisallowed("/"));
    $this->assertFalse($this->parser->isAllowed("/"));

    /** @var TestHandler $logHandler */
    $logHandler = $this->parser->getLogger()->getHandlers()[0];
    $disallowLogged = $logHandler->hasRecord("Disallowed by HTTP status code 503", LogLevel::DEBUG);

    // on failure, dump everything that was logged to ease debugging
    $this->assertTrue($disallowLogged, stringifyLogs($logHandler->getRecords()));
}
52 | }
53 |
--------------------------------------------------------------------------------
/test/InvalidPathTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed('*wildcard'));
17 | $this->assertFalse($parser->isDisallowed("&&1@|"));
18 | $this->assertTrue($parser->isAllowed('+£€@@1¤'));
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/test/Parser/DirectivesProcessors/CleanParamProcessorTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->processor = new CleanParamProcessor($log);
24 | }
25 |
/**
 * Drops the reference to the processor once a test has finished.
 */
public function tearDown(): void
{
    $this->processor = null;
}
29 |
/**
 * A Clean-param line with an explicit path must store both parameters
 * under that path inside the clean-param section of the tree.
 */
public function testProcessesCorrectlyWithPath()
{
    $tree = [];
    $line = 'Clean-param: some&someMore /only/here';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey(Directive::CLEAN_PARAM, $tree);

    // the section is only read after its presence has been asserted
    $cleanParam = $tree[Directive::CLEAN_PARAM];
    $dump = json_encode($cleanParam);

    $this->assertArrayHasKey('/only/here', $cleanParam, $dump);
    $this->assertContains('some', $cleanParam['/only/here'], $dump);
    $this->assertContains('someMore', $cleanParam['/only/here'], $dump);
}
41 |
/**
 * A Clean-param line without a path must store both parameters under the
 * '/*' key inside the clean-param section of the tree.
 */
public function testProcessesCorrectlyWithNoPath()
{
    $tree = [];
    $line = 'Clean-param: some&someMore';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey(Directive::CLEAN_PARAM, $tree);

    // the section is only read after its presence has been asserted
    $cleanParam = $tree[Directive::CLEAN_PARAM];
    $dump = json_encode($cleanParam);

    $this->assertArrayHasKey('/*', $cleanParam, $dump);
    $this->assertContains('some', $cleanParam['/*'], $dump);
    $this->assertContains('someMore', $cleanParam['/*'], $dump);
}
53 | }
54 |
--------------------------------------------------------------------------------
/test/Parser/DirectivesProcessors/CrawlDelayProcessorTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->processor = new CrawlDelayProcessor($log);
24 | }
25 |
/**
 * Drops the reference to the processor once a test has finished.
 */
public function tearDown(): void
{
    $this->processor = null;
}
29 |
/**
 * An integer Crawl-delay must be stored under the '*' section of the tree.
 */
public function testSavesValidCrawlDelayInteger()
{
    $tree = [];
    $line = 'Crawl-delay: 25';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey('*', $tree);

    $defaultSection = $tree['*'];

    $this->assertArrayHasKey(Directive::CRAWL_DELAY, $defaultSection);
    $this->assertEquals(25, $defaultSection[Directive::CRAWL_DELAY], json_encode($tree));
}
40 |
/**
 * A decimal Crawl-delay must be stored under the '*' section of the tree.
 */
public function testSavesValidCrawlDelayDecimal()
{
    $tree = [];
    $line = 'Crawl-delay: 0.5';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey('*', $tree);

    $defaultSection = $tree['*'];

    $this->assertArrayHasKey(Directive::CRAWL_DELAY, $defaultSection);
    $this->assertEquals(0.5, $defaultSection[Directive::CRAWL_DELAY], json_encode($tree));
}
51 |
/**
 * A non-numeric Crawl-delay must leave the tree without a '*' section and
 * must be reported through a debug log record.
 */
public function testSkipsInvalidAndLogs()
{
    $tree = [];
    $line = 'Crawl-delay: thisIsNotANumber';

    $this->processor->process($line, $tree);

    $this->assertArrayNotHasKey('*', $tree, json_encode($tree));

    /** @var TestHandler $logHandler */
    $logHandler = $this->processor->getLogger()->getHandlers()[0];
    $expectedMessage = 'crawl-delay with value thisIsNotANumber dropped as invalid for *';

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
71 | }
72 |
--------------------------------------------------------------------------------
/test/Parser/DirectivesProcessors/HostProcessorTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->processor = new HostProcessor($log);
24 | }
25 |
/**
 * Drops the reference to the processor once a test has finished.
 */
public function tearDown(): void
{
    $this->processor = null;
}
29 |
/**
 * A well-formed Host line must end up in the '*' section of the tree.
 */
public function testAddsHostIfCorrect()
{
    $tree = [];
    $line = 'Host: www.example.com';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey('*', $tree);

    $defaultSection = $tree['*'];

    $this->assertArrayHasKey(Directive::HOST, $defaultSection);
    $this->assertContains('www.example.com', $defaultSection, json_encode($tree));
}
40 |
/**
 * A Host line holding an IP address must not be stored, and the rejection
 * must be reported through a debug log record.
 */
public function testSkipsAndLogsIfIpAddressPassed()
{
    $tree = [];
    $line = 'Host: 192.168.0.1';

    $this->processor->process($line, $tree);

    $this->assertArrayNotHasKey('*', $tree);
    $this->assertArrayNotHasKey(Directive::HOST, $tree);

    /** @var TestHandler $logHandler */
    $logHandler = $this->processor->getLogger()->getHandlers()[0];
    $expectedMessage = 'host with value 192.168.0.1 dropped for * as invalid (IP address is not a valid hostname)';

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
61 |
/**
 * A Host line holding a malformed host name must not be stored, and the
 * rejection must be reported through a debug log record.
 */
public function testSkipsAndLogsIfNotValidHost()
{
    $tree = [];
    $line = 'Host: bndgang!!!@#$da12345ngda]]';

    $this->processor->process($line, $tree);

    $this->assertArrayNotHasKey('*', $tree);
    $this->assertArrayNotHasKey(Directive::HOST, $tree);

    /** @var TestHandler $logHandler */
    $logHandler = $this->processor->getLogger()->getHandlers()[0];
    $expectedMessage = 'host with value bndgang!!!@#$da12345ngda]] dropped for * as invalid';

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
82 | }
83 |
--------------------------------------------------------------------------------
/test/Parser/DirectivesProcessors/SitemapProcessorTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
22 |
23 | $this->processor = new SitemapProcessor($log);
24 | }
25 |
/**
 * Drops the reference to the processor once a test has finished.
 */
public function tearDown(): void
{
    $this->processor = null;
}
29 |
/**
 * Without an explicit user agent, a Sitemap line must create a sitemap
 * entry under the '*' section of the tree.
 */
public function testAddsSitemapDirectiveForDefaultUserAgent()
{
    $tree = [];
    $line = 'Sitemap: https://www.example.com/sitemap.xml';

    $this->processor->process($line, $tree);

    $this->assertArrayHasKey('*', $tree);

    $defaultSection = $tree['*'];

    $this->assertArrayHasKey(Directive::SITEMAP, $defaultSection);
}
39 |
/**
 * With an explicit user agent, a Sitemap line must create a sitemap entry
 * under that agent's section of the tree.
 */
public function testAddsSitemapDirectiveForCustomUserAgent()
{
    $tree = [];
    $userAgent = 'Google';
    $line = 'Sitemap: https://www.example.com/sitemap.xml';

    $this->processor->process($line, $tree, $userAgent);

    $this->assertArrayHasKey('Google', $tree);

    $agentSection = $tree[$userAgent];

    $this->assertArrayHasKey(Directive::SITEMAP, $agentSection);
}
50 |
/**
 * Processing a sitemap URL that the agent's section already contains must
 * keep the section in place and log that the entry was skipped.
 */
public function testAddsSitemapSkipsExistingAndLogsIt()
{
    $userAgent = 'Google';
    $line = 'Sitemap: https://www.example.com/sitemap.xml';

    // tree pre-populated with the very sitemap that is about to be processed
    $tree = [
        $userAgent => [
            Directive::SITEMAP => ['https://www.example.com/sitemap.xml'],
        ],
    ];

    $this->processor->process($line, $tree, $userAgent);

    $this->assertArrayHasKey('Google', $tree);
    $this->assertArrayHasKey(Directive::SITEMAP, $tree[$userAgent]);

    /** @var TestHandler $logHandler */
    $logHandler = $this->processor->getLogger()->getHandlers()[0];
    $expectedMessage = 'sitemap with value https://www.example.com/sitemap.xml skipped as already exists for Google';

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
75 | }
76 |
--------------------------------------------------------------------------------
/test/Parser/DirectivesProcessors/UserAgentProcessorTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
21 |
22 | $this->processor = new UserAgentProcessor($log);
23 | }
24 |
/**
 * Drops the reference to the processor once a test has finished.
 */
public function tearDown(): void
{
    $this->processor = null;
}
28 |
/**
 * A User-agent line naming a new agent must add that agent to the tree
 * and switch the current agent passed to the processor over to it.
 */
public function testAddsNewUserAgentSection()
{
    $currentAgent = '*';
    $tree = [$currentAgent => []];
    $line = 'User-agent: Google';

    $this->processor->process($line, $tree, $currentAgent);

    $this->assertArrayHasKey('Google', $tree);
    $this->assertEquals('Google', $currentAgent);
}
41 |
/**
 * A User-agent line repeating the current agent must not add a section
 * to the tree and must be reported through a debug log record.
 */
public function testLogsIfNotChanged()
{
    $currentAgent = 'Google';
    $tree = [$currentAgent => []];
    $line = 'User-agent: Google';

    $this->processor->process($line, $tree, $currentAgent);

    $knownAgents = array_keys($tree);
    $this->assertCount(1, $knownAgents);

    /** @var TestHandler $logHandler */
    $logHandler = $this->processor->getLogger()->getHandlers()[0];
    $expectedMessage = 'New useragent is equal to current one, skipping ...';

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
61 | }
62 |
--------------------------------------------------------------------------------
/test/Parser/UserAgent/UserAgentMatcherTest.php:
--------------------------------------------------------------------------------
1 | pushHandler(new TestHandler(LogLevel::DEBUG));
19 |
20 | $matcher = new UserAgentMatcher($logger);
21 |
22 | $match = $matcher->getMatching('Google', ['Google']);
23 | $this->assertEquals('Google', $match);
24 |
25 | $handler = $logger->getHandlers()[0];
26 |
27 | $this->assertTrue(
28 | $handler->hasRecord("Matched Google for user agent Google", LogLevel::DEBUG),
29 | stringifyLogs($handler->getRecords())
30 | );
31 | }
32 |
/**
 * Matching against an empty list of known agents must fall back to '*'
 * and log the failed match at debug level.
 */
public function testLogsWhenNotMatched()
{
    $logger = new Logger(static::class);
    $logger->pushHandler(new TestHandler(LogLevel::DEBUG));

    $matcher = new UserAgentMatcher($logger);
    $fallback = $matcher->getMatching('Google', []);

    $this->assertEquals('*', $fallback);

    $logHandler = $logger->getHandlers()[0];
    $expectedMessage = "Failed to match user agent 'Google', falling back to '*'";

    $this->assertTrue(
        $logHandler->hasRecord($expectedMessage, LogLevel::DEBUG),
        stringifyLogs($logHandler->getRecords())
    );
}
49 | }
50 |
--------------------------------------------------------------------------------
/test/RenderTest.php:
--------------------------------------------------------------------------------
1 | markTestSkipped('@TODO');
16 |
17 | $parser = new RobotsTxtParser($robotsTxtContent);
18 |
19 | $this->assertEquals($rendered, $parser->render("\n"));
20 | }
21 |
22 | /**
23 | * Generate test data
24 | *
25 | * @return array
26 | */
27 | public function generateDataForTest()
28 | {
29 | return [
30 | [
31 | <<pushHandler(new TestHandler(LogLevel::DEBUG));
16 |
17 | $this->parser = new RobotsTxtParser(fopen(__DIR__ . '/Fixtures/wikipedia-org.txt', 'r'));
18 | $this->parser->setLogger($log);
19 | }
20 |
/**
 * Drops the reference to the parser once a test has finished.
 */
public function tearDown(): void
{
    $this->parser = null;
}
24 |
/**
 * getRules() without arguments must expose every user agent of the
 * fixture as a top-level key.
 */
public function testGetRulesAll()
{
    $rules = $this->parser->getRules();

    // should be all 33 UAs on top level
    $expectedAgents = [
        "MJ12bot",
        "Mediapartners-Google*",
        "IsraBot",
        "Orthogaffe",
        "UbiCrawler",
        "DOC",
        "Zao",
        "sitecheck.internetseer.com",
        "Zealbot",
        "MSIECrawler",
        "SiteSnagger",
        "WebStripper",
        "WebCopier",
        "Fetch",
        "Offline Explorer",
        "Teleport",
        "TeleportPro",
        "WebZIP",
        "linko",
        "HTTrack",
        "Microsoft.URL.Control",
        "Xenu",
        "larbin",
        "libwww",
        "ZyBORG",
        "Download Ninja",
        "fast",
        "wget",
        "grub-client",
        "k2spider",
        "NPBot",
        "WebReaper",
        "*",
    ];

    // checked one by one, in the listed order, so the first missing agent fails the test
    foreach ($expectedAgents as $agent) {
        $this->assertArrayHasKey($agent, $rules);
    }
}
63 |
/**
 * Repeated getRules() calls must log "Building directives tree..." only
 * once.
 */
public function testTreeBuildOnlyOnce()
{
    // four consecutive reads of the rules
    for ($call = 0; $call < 4; $call++) {
        $this->parser->getRules();
    }

    /** @var TestHandler $logHandler */
    $logHandler = $this->parser->getLogger()->getHandlers()[0];

    $treeBuildRecords = array_filter(
        $logHandler->getRecords(),
        static fn (array $log) => $log['message'] === 'Building directives tree...'
    );

    $this->assertCount(1, $treeBuildRecords);
}
79 | }
80 |
--------------------------------------------------------------------------------
/test/Stream/Filter/SkipCommentedLinesFilterTest.php:
--------------------------------------------------------------------------------
1 | assertContains(SkipCommentedLinesFilter::NAME, stream_get_filters());
24 | }
25 |
/**
 * Reading the commented-lines fixture through the filter must not return
 * any of its fully commented lines.
 */
public function testFilter()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/with-commented-lines.txt','r');

    // apply filter
    stream_filter_append($stream, SkipCommentedLinesFilter::NAME);

    $contents = fread($stream, fstat($stream)['size']);

    // check commented not there
    $commentedLines = [
        '# Disallow: /tech',
        '# this is a commented line',
        '# it should not be in the iterator',
    ];

    foreach ($commentedLines as $commentedLine) {
        $this->assertStringNotContainsString($commentedLine, $contents);
    }

    fclose($stream);
}
42 |
/**
 * Reading the large commented fixture through the filter must not return
 * its commented "Lorem ipsum" text.
 */
public function testFilterLargeSet()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/large-commented-lines.txt','r');

    // apply filter
    stream_filter_append($stream, SkipCommentedLinesFilter::NAME);

    $contents = fread($stream, fstat($stream)['size']);

    // check commented not there
    $this->assertStringNotContainsString('# Lorem ipsum dolor sit amet,', $contents);

    fclose($stream);
}
57 |
/**
 * With a logger passed as a filter parameter, the messages logged while
 * reading the large commented fixture must equal the list stored in
 * Fixtures/expected-skipped-lines-log.php.
 */
public function testFilterWithLogger()
{
    $log = new Logger(static::class);
    $log->pushHandler(new TestHandler(LogLevel::DEBUG));

    $stream = fopen(__DIR__ . '/../../Fixtures/large-commented-lines.txt','r');

    // apply filter
    stream_filter_append($stream, SkipCommentedLinesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]);

    $fstat = fstat($stream);
    $contents = fread($stream, $fstat['size']);

    /** @var TestHandler $handler */
    $handler = $log->getHandlers()[0];

    // only the message text of each record is compared
    $messagesOnly = array_map(
        function(array $record) { return $record['message']; },
        $handler->getRecords()
    );

    $expected = require __DIR__ . '/../../Fixtures/expected-skipped-lines-log.php';

    $this->assertNotEmpty($contents);
    // PHPUnit takes the expected value first; the order matters for the failure diff
    $this->assertEquals($expected, $messagesOnly);

    fclose($stream);
}
85 | }
86 |
--------------------------------------------------------------------------------
/test/Stream/Filter/SkipDirectivesWithInvalidValuesFilterTest.php:
--------------------------------------------------------------------------------
1 | assertContains(SkipDirectivesWithInvalidValuesFilter::NAME, stream_get_filters());
21 | }
22 |
/**
 * Reading the invalid-request-rate fixture through the filter must keep
 * the user-agent line and drop the directives with invalid values.
 *
 * @TODO 'Crawl-delay: 0.vfsbfsb # invalid' is not asserted yet.
 */
public function testFilter()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/with-invalid-request-rate.txt','r');

    // apply filter
    stream_filter_append($stream, SkipDirectivesWithInvalidValuesFilter::NAME);

    $contents = fread($stream, fstat($stream)['size']);

    // check other rules are still in place
    $this->assertStringContainsString('Useragent: GoogleBot', $contents);

    // check faulty removed
    $faultyDirectives = [
        'Crawl-delay: ngfsngdndag',
        'Request-rate: 100/bgdndgnd # invalid',
        'Request-rate: 15686 # invalid',
        'Request-rate: ngdndganda # invalid',
    ];

    foreach ($faultyDirectives as $faultyDirective) {
        $this->assertStringNotContainsString($faultyDirective, $contents);
    }

    fclose($stream);
}
47 | }
48 |
--------------------------------------------------------------------------------
/test/Stream/Filter/SkipEmptyLinesFilterTest.php:
--------------------------------------------------------------------------------
1 | assertContains(SkipEmptyLinesFilter::NAME, stream_get_filters());
24 | }
25 |
/**
 * Reading the empty-lines fixture through the filter must take fewer
 * fgets() reads than reading it raw, and must still return content.
 */
public function testFilter()
{
    $linesBefore = 0;
    $linesAfter = 0;

    $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-lines.txt','r');

    // first pass: count the reads without any filter
    while (!feof($stream)) {
        fgets($stream);
        $linesBefore++;
    }

    rewind($stream);

    // apply filter
    stream_filter_append($stream, SkipEmptyLinesFilter::NAME);

    // second pass: count the reads again and collect what comes through
    $filtered = "";

    while (!feof($stream)) {
        $filtered .= fgets($stream);
        $linesAfter++;
    }

    $this->assertNotEquals("", $filtered);
    $this->assertTrue($linesBefore > $linesAfter);

    fclose($stream);
}
54 |
/**
 * The first line read through the filter must not be empty.
 */
public function testFilterEmptyFirst()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-lines.txt','r');

    // apply filter
    stream_filter_append($stream, SkipEmptyLinesFilter::NAME);

    $collected = [];

    while (!feof($stream)) {
        $collected[] = fgets($stream);
    }

    $this->assertNotEmpty($collected);
    $this->assertNotEmpty($collected[0]);

    fclose($stream);
}
72 |
/**
 * With a logger passed as a filter parameter, reading the fixture must
 * log "3 lines skipped as empty." at debug level.
 */
public function testFilterWithLogger()
{
    $log = new Logger(static::class);
    $log->pushHandler(new TestHandler(LogLevel::DEBUG));

    $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-lines.txt','r');

    // apply filter
    stream_filter_append($stream, SkipEmptyLinesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]);

    // do read
    $collected = [];
    while (!feof($stream)) {
        $collected[] = fgets($stream);
    }

    /** @var TestHandler $logHandler */
    $logHandler = $log->getHandlers()[0];

    $this->assertNotEmpty($collected);

    $skipLogged = $logHandler->hasRecord('3 lines skipped as empty.', LogLevel::DEBUG);
    $this->assertTrue($skipLogged, stringifyLogs($logHandler->getRecords()));

    fclose($stream);
}
98 | }
99 |
--------------------------------------------------------------------------------
/test/Stream/Filter/SkipEndOfCommentedLineFilterTest.php:
--------------------------------------------------------------------------------
1 | assertContains(SkipEndOfCommentedLineFilter::NAME, stream_get_filters());
24 | }
25 |
/**
 * Reading the commented-line-endings fixture through the filter must drop
 * the trailing comments while keeping the directive in front of them.
 */
public function testFilter()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/with-commented-line-endings.txt','r');

    // apply filter
    stream_filter_append($stream, SkipEndOfCommentedLineFilter::NAME);

    $contents = fread($stream, fstat($stream)['size']);

    // check commented not there
    $commentedParts = [
        '# ds',
        '# low: /tech',
        '#: /tech # ds',
    ];

    foreach ($commentedParts as $commentedPart) {
        $this->assertStringNotContainsString($commentedPart, $contents);
    }

    // should keep valid entries
    $this->assertStringContainsString('Disallow: /comment-after', $contents);

    fclose($stream);
}
45 |
/**
 * With a logger passed as a filter parameter, reading the fixture must
 * log "5 char(s) dropped as commented out" at debug level.
 */
public function testFilterWithLogger()
{
    $log = new Logger(static::class);
    $log->pushHandler(new TestHandler(LogLevel::DEBUG));

    $stream = fopen(__DIR__ . '/../../Fixtures/with-commented-line-endings.txt','r');

    // apply filter
    stream_filter_append($stream, SkipEndOfCommentedLineFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]);

    // do read
    $collected = [];
    while (!feof($stream)) {
        $collected[] = fgets($stream);
    }

    /** @var TestHandler $logHandler */
    $logHandler = $log->getHandlers()[0];

    $this->assertNotEmpty($collected);

    $dropLogged = $logHandler->hasRecord('5 char(s) dropped as commented out', LogLevel::DEBUG);
    $this->assertTrue($dropLogged, stringifyLogs($logHandler->getRecords()));

    fclose($stream);
}
71 | }
72 |
--------------------------------------------------------------------------------
/test/Stream/Filter/SkipUnsupportedDirectivesTest.php:
--------------------------------------------------------------------------------
1 | assertContains(SkipUnsupportedDirectivesFilter::NAME, stream_get_filters());
24 | }
25 |
/**
 * Reading the faulty-directives fixture through the filter must drop
 * every malformed or unsupported directive line listed below.
 */
public function testFilter()
{
    $stream = fopen(__DIR__ . '/../../Fixtures/with-faulty-directives.txt','r');

    // apply filter
    stream_filter_append($stream, SkipUnsupportedDirectivesFilter::NAME);

    $contents = fread($stream, fstat($stream)['size']);

    $unsupportedLines = [
        'Disallow /admin/ # prohibits links from the admin panel',
        'dis@llow: /admin/ # prohibits links from the admin panel',
        'cleanParam: ref /some_dir/get_book.pl',
        'User#agent: google4 #specifies the robots that the directives are set for',
        'Disa#low: /bin/ # prohibits links from the Shopping Cart.',
        'Disa#low: /search/ # prohibits page links of the search embedded on the site',
        'Disa#low: /admin/ # prohibits links from the admin panel',
        'Site#ap: http://example.com/sitemap # specifies the path to the site\'s Sitemap file for the robot',
        'Clean#param: ref /some_dir/get_book.pl',
    ];

    foreach ($unsupportedLines as $unsupportedLine) {
        $this->assertStringNotContainsString($unsupportedLine, $contents);
    }

    fclose($stream);
}
47 |
/**
 * With a logger passed as a filter parameter, reading the fixture must
 * log "9 lines skipped as un-supported" at debug level.
 */
public function testFilterWithLogger()
{
    $log = new Logger(static::class);
    $log->pushHandler(new TestHandler(LogLevel::DEBUG));

    $stream = fopen(__DIR__ . '/../../Fixtures/with-faulty-directives.txt', 'r');

    // apply filter
    stream_filter_append($stream, SkipUnsupportedDirectivesFilter::NAME, STREAM_FILTER_READ, ['logger' => $log]);

    $contents = fread($stream, fstat($stream)['size']);

    /** @var TestHandler $logHandler */
    $logHandler = $log->getHandlers()[0];

    $this->assertNotEmpty($contents);

    $skipLogged = $logHandler->hasRecord('9 lines skipped as un-supported', LogLevel::DEBUG);
    $this->assertTrue($skipLogged, stringifyLogs($logHandler->getRecords()));

    fclose($stream);
}
71 | }
72 |
--------------------------------------------------------------------------------
/test/Stream/Filter/TrimSpacesLeftAndRightFilterTest.php:
--------------------------------------------------------------------------------
1 | assertContains(TrimSpacesLeftFilter::NAME, stream_get_filters());
21 | }
22 |
/**
 * Reading the whitespace fixture through TrimSpacesLeftFilter must return
 * the Crawl-Delay directive without a leading space.
 */
public function testFilter() {
    $stream = fopen(__DIR__ . '/../../Fixtures/with-empty-and-whitespace.txt', 'r');

    // apply filter
    stream_filter_append($stream, TrimSpacesLeftFilter::NAME);

    $fstat = fstat($stream);
    $contents = fread($stream, $fstat['size']);

    $this->assertStringNotContainsString(' Crawl-Delay: 0.9', $contents);
    $this->assertStringContainsString('Crawl-Delay: 0.9', $contents);

    // release the handle, as the other filter tests do
    fclose($stream);
}
35 | }
36 |
--------------------------------------------------------------------------------
/test/Stream/ReaderTest.php:
--------------------------------------------------------------------------------
1 | getContentIterated();
17 |
18 | foreach ($generator as $line) {
19 | $this->assertNotEmpty($line);
20 | $this->assertStringNotContainsString('#', $line);
21 | }
22 | }
23 |
/**
 * Iterates the market-yandex-ru fixture through GeneratorBasedReader:
 * every yielded line must be non-empty and free of '#', and the lines at
 * index 329 and 330 (when yielded) must hold the Sitemap and Host
 * directives respectively.
 */
public function testGetContentYaMarket() {
    // '/../Fixtures' rather than './../Fixtures': the latter appended a stray '.' to the directory name
    $reader = GeneratorBasedReader::fromStream(fopen(__DIR__ . '/../Fixtures/market-yandex-ru.txt', 'r'));
    $generator = $reader->getContentIterated();

    foreach ($generator as $idx => $line) {
        $this->assertNotEmpty($line);
        $this->assertStringNotContainsString('#', $line);

        switch ($idx) {
            case '329':
                $this->assertStringContainsString('Sitemap', $line);
                break;

            case '330':
                $this->assertStringContainsString('Host', $line);
                break;
        }
    }
}
43 | }
44 |
--------------------------------------------------------------------------------
/test/UnlistedPathTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed("/"));
24 | $this->assertFalse($parser->isDisallowed("/"));
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/test/UserAgentTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($parser->isAllowed("/"));
22 | $this->assertTrue($parser->isAllowed("/article"));
23 | $this->assertTrue($parser->isDisallowed("/temp"));
24 |
25 | $this->assertFalse($parser->isDisallowed("/"));
26 | $this->assertFalse($parser->isDisallowed("/article"));
27 | $this->assertFalse($parser->isAllowed("/temp"));
28 |
29 | $this->assertTrue($parser->isAllowed("/foo", "agentU/2.0.1"));
30 | $this->assertTrue($parser->isDisallowed("/bar", "agentU/2.0.1"));
31 |
32 | $this->assertTrue($parser->isDisallowed("/foo", "agentV"));
33 | $this->assertTrue($parser->isAllowed("/bar", "agentV"));
34 | $this->assertTrue($parser->isDisallowed("/foo", "agentW"));
35 | $this->assertTrue($parser->isAllowed("/bar", "agentW"));
36 |
37 | $this->assertTrue($parser->isAllowed("/temp", "spiderX/1.0"));
38 | $this->assertTrue($parser->isDisallowed("/assets", "spiderX/1.0"));
39 | $this->assertTrue($parser->isAllowed("/forum", "spiderX/1.0"));
40 |
41 | $this->assertFalse($parser->isDisallowed("/temp", "spiderX/1.0"));
42 | $this->assertFalse($parser->isAllowed("/assets", "spiderX/1.0"));
43 | $this->assertFalse($parser->isDisallowed("/forum", "spiderX/1.0"));
44 |
45 | $this->assertTrue($parser->isDisallowed("/", "botY-test"));
46 | $this->assertTrue($parser->isAllowed("/forum/", "botY-test"));
47 | $this->assertTrue($parser->isDisallowed("/forum/topic", "botY-test"));
48 | $this->assertTrue($parser->isDisallowed("/public", "botY-test"));
49 |
50 | $this->assertFalse($parser->isAllowed("/", "botY-test"));
51 | $this->assertFalse($parser->isDisallowed("/forum/", "botY-test"));
52 | $this->assertFalse($parser->isAllowed("/forum/topic", "botY-test"));
53 | $this->assertFalse($parser->isAllowed("/public", "botY-test"));
54 |
55 | $this->assertTrue($parser->isAllowed("/", "crawlerZ"));
56 | $this->assertTrue($parser->isDisallowed("/forum", "crawlerZ"));
57 | $this->assertTrue($parser->isDisallowed("/public", "crawlerZ"));
58 |
59 | $this->assertFalse($parser->isDisallowed("/", "crawlerZ"));
60 | $this->assertFalse($parser->isAllowed("/forum", "crawlerZ"));
61 | $this->assertFalse($parser->isAllowed("/public", "crawlerZ"));
62 | }
63 |
64 | /**
65 | * Generate test case data: one robots.txt fixture with separate rule groups for the wildcard agent, agentU/2.0, agentV and agentW (sharing one group), spiderX, botY and crawlerZ
66 | * @return array a single data set whose only argument is the robots.txt content string
67 | */
68 | public function generateDataForTest()
69 | {
70 | return array( // outer list: one entry per data set
71 | array( // arguments of the only data set
72 | "
73 | User-agent: *
74 | Disallow: /admin
75 | Disallow: /temp
76 | Disallow: /forum
77 |
78 | User-agent: agentU/2.0
79 | Disallow: /bar
80 | Allow: /foo
81 |
82 | User-agent: agentV
83 | User-agent: agentW
84 | Disallow: /foo
85 | Allow: /bar
86 |
87 | User-agent: spiderX
88 | Disallow:
89 | Disallow: /admin
90 | Disallow: /assets
91 |
92 | User-agent: botY
93 | Disallow: /
94 | Allow: /forum/$
95 | Allow: /article
96 |
97 | User-agent: crawlerZ
98 | Disallow:
99 | Disallow: /
100 | Allow: /$
101 | "
102 | )
103 | );
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/test/WhitespacesTest.php:
--------------------------------------------------------------------------------
1 | getRules('*');
19 |
20 | $this->assertNotEmpty($rules, 'expected rules for *');
21 | $this->assertArrayHasKey('disallow', $rules);
22 | $this->assertNotEmpty($rules['disallow'], 'disallow failed');
23 | $this->assertArrayHasKey('allow', $rules);
24 | $this->assertNotEmpty($rules['allow'], 'allow failed');
25 | }
26 |
27 | /**
28 | * Generate test case data: a robots.txt fixture whose Disallow and Allow directives have a space before the colon
29 | * @return array a single data set whose only argument is the robots.txt content string
30 | */
31 | public function generateDataForTest() {
32 | return [ // outer list: one entry per data set
33 | [ // arguments of the only data set
34 | "
35 | User-agent: *
36 | Disallow : /admin
37 | Allow : /admin/front
38 | ",
39 | ],
40 | ];
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/test/bootstrap.php:
--------------------------------------------------------------------------------
1 | json_encode(
10 | array_map('extractMessageFromRecord', $handlerRecords),
11 | JSON_PRETTY_PRINT
12 | )
13 | ]);
14 | }
15 |
--------------------------------------------------------------------------------