├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── composer.json ├── composer.lock ├── examples ├── callback.php └── links.php ├── spec └── Centipede │ ├── Checker │ └── HostCheckerSpec.php │ ├── CrawlerSpec.php │ ├── Extractor │ └── UrlExtractorSpec.php │ └── Filter │ └── UrlFilterSpec.php └── src └── Centipede ├── Checker ├── CheckerInterface.php └── HostChecker.php ├── Crawler.php ├── Extractor ├── ExtractorInterface.php └── UrlExtractor.php └── Filter ├── FilterInterface.php └── UrlFilter.php /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | vendor 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.4 5 | - 5.5 6 | - 5.6 7 | - hhvm 8 | 9 | install: 10 | - composer install 11 | 12 | script: php bin/phpspec run -f pretty 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Saša Stamenković 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 |

6 |

7 | symfony upgrade fixer • 8 | twig gettext extractor • 9 | wisdom • 10 | centipede • 11 | permissions handler • 12 | extraload • 13 | gravatar • 14 | locurro • 15 | country list • 16 | transliterator 17 |

18 | 19 | # Centipede Crawler [![Build Status](https://travis-ci.org/umpirsky/centipede-crawler.svg?branch=master)](https://travis-ci.org/umpirsky/centipede-crawler) 20 | 21 | Crawls all unique links. 22 | 23 | ## Usage 24 | 25 | ```php 26 | $urls = (new Centipede\Crawler('http://dev.umpirsky.com'))->crawl(); 27 | ``` 28 | 29 | ## Asynchronous 30 | 31 | ```php 32 | (new Centipede\Crawler('http://dev.umpirsky.com'))->crawl(function ($url, GuzzleHttp\Message\FutureResponse $response) { 33 | printf('(%d) %s', $response->getStatusCode(), $url); 34 | }); 35 | ``` 36 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "umpirsky/centipede-crawler", 3 | "type": "library", 4 | "description": "Crawls all unique links.", 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "Saša Stamenković", 9 | "email": "umpirsky@gmail.com" 10 | } 11 | ], 12 | "require": { 13 | "php": ">=5.4", 14 | "guzzlehttp/guzzle": "~5.3" 15 | }, 16 | "require-dev": { 17 | "phpspec/phpspec": "~2.0" 18 | }, 19 | "config": { 20 | "bin-dir": "bin" 21 | }, 22 | "autoload": { 23 | "psr-0": { "Centipede\\": "src/" } 24 | }, 25 | "extra": { 26 | "branch-alias": { 27 | "dev-master": "0.1-dev" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", 5 | "This file is @generated automatically" 6 | ], 7 | "hash": "27f84d4bd801195f4f29ab442644c49b", 8 | "packages": [ 9 | { 10 | "name": "guzzlehttp/guzzle", 11 | "version": "5.3.0", 12 | "source": { 13 | "type": "git", 14 | "url": "https://github.com/guzzle/guzzle.git", 15 | "reference": "f3c8c22471cb55475105c14769644a49c3262b93" 16 | }, 17 | "dist": { 18 | "type": "zip", 19 | "url": "https://api.github.com/repos/guzzle/guzzle/zipball/f3c8c22471cb55475105c14769644a49c3262b93", 20 | "reference": "f3c8c22471cb55475105c14769644a49c3262b93", 21 | "shasum": "" 22 | }, 23 | "require": { 24 | "guzzlehttp/ringphp": "^1.1", 25 | "php": ">=5.4.0" 26 | }, 27 | "require-dev": { 28 | "ext-curl": "*", 29 | "phpunit/phpunit": "^4.0", 30 | "psr/log": "^1.0" 31 | }, 32 | "type": "library", 33 | "extra": { 34 | "branch-alias": { 35 | "dev-master": "5.0-dev" 36 | } 37 | }, 38 | "autoload": { 39 | "psr-4": { 40 | "GuzzleHttp\\": "src/" 41 | } 42 | }, 43 | "notification-url": "https://packagist.org/downloads/", 44 | "license": [ 45 | "MIT" 46 | ], 47 | "authors": [ 48 | { 49 | "name": "Michael Dowling", 50 | "email": "mtdowling@gmail.com", 51 | "homepage": "https://github.com/mtdowling" 52 | } 53 | ], 54 | "description": "Guzzle is a PHP HTTP client library and framework for building RESTful web service clients", 55 | "homepage": "http://guzzlephp.org/", 56 | "keywords": [ 57 | "client", 58 | "curl", 59 | "framework", 60 | "http", 61 | "http client", 62 | "rest", 63 | "web service" 64 | ], 65 | "time": "2015-05-20 03:47:55" 66 | }, 67 | { 68 | "name": "guzzlehttp/ringphp", 69 | "version": "1.1.0", 70 | "source": { 71 | "type": "git", 72 | "url": "https://github.com/guzzle/RingPHP.git", 73 | "reference": "dbbb91d7f6c191e5e405e900e3102ac7f261bc0b" 74 | }, 75 | "dist": { 76 | "type": "zip", 77 | "url": "https://api.github.com/repos/guzzle/RingPHP/zipball/dbbb91d7f6c191e5e405e900e3102ac7f261bc0b", 78 | "reference": "dbbb91d7f6c191e5e405e900e3102ac7f261bc0b", 79 | "shasum": "" 80 | }, 81 | "require": { 82 | "guzzlehttp/streams": "~3.0", 83 | "php": ">=5.4.0", 84 | "react/promise": "~2.0" 85 | }, 86 | "require-dev": { 87 | "ext-curl": "*", 88 | "phpunit/phpunit": "~4.0" 89 | }, 90 | "suggest": { 91 | "ext-curl": "Guzzle will use specific adapters if cURL is present" 92 | }, 93 | "type": "library", 94 | "extra": { 95 | "branch-alias": { 96 | "dev-master": "1.1-dev" 97 | } 98 | }, 99 | "autoload": { 100 | "psr-4": { 101 | "GuzzleHttp\\Ring\\": "src/" 102 | } 103 | }, 104 | "notification-url": "https://packagist.org/downloads/", 105 | "license": [ 106 | "MIT" 107 | ], 108 | "authors": [ 109 | { 110 | "name": "Michael Dowling", 111 | "email": "mtdowling@gmail.com", 112 | "homepage": "https://github.com/mtdowling" 113 | } 114 | ], 115 | "description": "Provides a simple API and specification that abstracts away the details of HTTP into a single PHP function.", 116 | "time": "2015-05-20 03:37:09" 117 | }, 118 | { 119 | "name": "guzzlehttp/streams", 120 | "version": "3.0.0", 121 | "source": { 122 | "type": "git", 123 | "url": "https://github.com/guzzle/streams.git", 124 | "reference": "47aaa48e27dae43d39fc1cea0ccf0d84ac1a2ba5" 125 | }, 126 | "dist": { 127 | "type": "zip", 128 | "url": "https://api.github.com/repos/guzzle/streams/zipball/47aaa48e27dae43d39fc1cea0ccf0d84ac1a2ba5", 129 | "reference": "47aaa48e27dae43d39fc1cea0ccf0d84ac1a2ba5", 130 | "shasum": "" 131 | }, 132 | "require": { 133 | "php": ">=5.4.0" 134 | }, 135 | "require-dev": { 136 | "phpunit/phpunit": "~4.0" 137 | }, 138 | "type": "library", 139 | "extra": { 140 | "branch-alias": { 141 | "dev-master": "3.0-dev" 142 | } 143 | }, 144 | "autoload": { 145 | "psr-4": { 146 | "GuzzleHttp\\Stream\\": "src/" 147 | } 148 | }, 149 | "notification-url": "https://packagist.org/downloads/", 150 | "license": [ 151 | "MIT" 152 | ], 153 | "authors": [ 154 | { 155 | "name": "Michael Dowling", 156 | "email": "mtdowling@gmail.com", 157 | "homepage": "https://github.com/mtdowling" 158 | } 159 | ], 160 | "description": "Provides a simple abstraction over streams of data", 161 | "homepage": "http://guzzlephp.org/", 162 | "keywords": [ 163 | "Guzzle", 164 | "stream" 165 | ], 166 | "time": "2014-10-12 19:18:40" 167 | }, 168 | { 169 | "name": "react/promise", 170 | "version": "v2.2.0", 171 | "source": { 172 | "type": "git", 173 | "url": "https://github.com/reactphp/promise.git", 174 | "reference": "365fcee430dfa4ace1fbc75737ca60ceea7eeeef" 175 | }, 176 | "dist": { 177 | "type": "zip", 178 | "url": "https://api.github.com/repos/reactphp/promise/zipball/365fcee430dfa4ace1fbc75737ca60ceea7eeeef", 179 | "reference": "365fcee430dfa4ace1fbc75737ca60ceea7eeeef", 180 | "shasum": "" 181 | }, 182 | "require": { 183 | "php": ">=5.4.0" 184 | }, 185 | "type": "library", 186 | "extra": { 187 | "branch-alias": { 188 | "dev-master": "2.0-dev" 189 | } 190 | }, 191 | "autoload": { 192 | "psr-4": { 193 | "React\\Promise\\": "src/" 194 | }, 195 | "files": [ 196 | "src/functions_include.php" 197 | ] 198 | }, 199 | "notification-url": "https://packagist.org/downloads/", 200 | "license": [ 201 | "MIT" 202 | ], 203 | "authors": [ 204 | { 205 | "name": "Jan Sorgalla", 206 | "email": "jsorgalla@googlemail.com" 207 | } 208 | ], 209 | "description": "A lightweight implementation of CommonJS Promises/A for PHP", 210 | "time": "2014-12-30 13:32:42" 211 | } 212 | ], 213 | "packages-dev": [ 214 | { 215 | "name": "doctrine/instantiator", 216 | "version": "1.0.4", 217 | "source": { 218 | "type": "git", 219 | "url": "https://github.com/doctrine/instantiator.git", 220 | "reference": "f976e5de371104877ebc89bd8fecb0019ed9c119" 221 | }, 222 | "dist": { 223 | "type": "zip", 224 | "url": "https://api.github.com/repos/doctrine/instantiator/zipball/f976e5de371104877ebc89bd8fecb0019ed9c119", 225 | "reference": "f976e5de371104877ebc89bd8fecb0019ed9c119", 226 | "shasum": "" 227 | }, 228 | "require": { 229 | "php": ">=5.3,<8.0-DEV" 230 | }, 231 | "require-dev": { 232 | "athletic/athletic": "~0.1.8", 233 | "ext-pdo": "*", 234 | "ext-phar": "*", 235 | "phpunit/phpunit": "~4.0", 236 | "squizlabs/php_codesniffer": "2.0.*@ALPHA" 237 | }, 238 | "type": "library", 239 | "extra": { 240 | "branch-alias": { 241 | "dev-master": "1.0.x-dev" 242 | } 243 | }, 244 | "autoload": { 245 | "psr-0": { 246 | "Doctrine\\Instantiator\\": "src" 247 | } 248 | }, 249 | "notification-url": "https://packagist.org/downloads/", 250 | "license": [ 251 | "MIT" 252 | ], 253 | "authors": [ 254 | { 255 | "name": "Marco Pivetta", 256 | "email": "ocramius@gmail.com", 257 | "homepage": "http://ocramius.github.com/" 258 | } 259 | ], 260 | "description": "A small, lightweight utility to instantiate objects in PHP without invoking their constructors", 261 | "homepage": "https://github.com/doctrine/instantiator", 262 | "keywords": [ 263 | "constructor", 264 | "instantiate" 265 | ], 266 | "time": "2014-10-13 12:58:55" 267 | }, 268 | { 269 | "name": "phpdocumentor/reflection-docblock", 270 | "version": "2.0.4", 271 | "source": { 272 | "type": "git", 273 | "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", 274 | "reference": "d68dbdc53dc358a816f00b300704702b2eaff7b8" 275 | }, 276 | "dist": { 277 | "type": "zip", 278 | "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/d68dbdc53dc358a816f00b300704702b2eaff7b8", 279 | "reference": "d68dbdc53dc358a816f00b300704702b2eaff7b8", 280 | "shasum": "" 281 | }, 282 | "require": { 283 | "php": ">=5.3.3" 284 | }, 285 | "require-dev": { 286 | "phpunit/phpunit": "~4.0" 287 | }, 288 | "suggest": { 289 | "dflydev/markdown": "~1.0", 290 | "erusev/parsedown": "~1.0" 291 | }, 292 | "type": "library", 293 | "extra": { 294 | "branch-alias": { 295 | "dev-master": "2.0.x-dev" 296 | } 297 | }, 298 | "autoload": { 299 | "psr-0": { 300 | "phpDocumentor": [ 301 | "src/" 302 | ] 303 | } 304 | }, 305 | "notification-url": "https://packagist.org/downloads/", 306 | "license": [ 307 | "MIT" 308 | ], 309 | "authors": [ 310 | { 311 | "name": "Mike van Riel", 312 | "email": "mike.vanriel@naenius.com" 313 | } 314 | ], 315 | "time": "2015-02-03 12:10:50" 316 | }, 317 | { 318 | "name": "phpspec/php-diff", 319 | "version": "v1.0.2", 320 | "source": { 321 | "type": "git", 322 | "url": "https://github.com/phpspec/php-diff.git", 323 | "reference": "30e103d19519fe678ae64a60d77884ef3d71b28a" 324 | }, 325 | "dist": { 326 | "type": "zip", 327 | "url": "https://api.github.com/repos/phpspec/php-diff/zipball/30e103d19519fe678ae64a60d77884ef3d71b28a", 328 | "reference": "30e103d19519fe678ae64a60d77884ef3d71b28a", 329 | "shasum": "" 330 | }, 331 | "type": "library", 332 | "autoload": { 333 | "psr-0": { 334 | "Diff": "lib/" 335 | } 336 | }, 337 | "notification-url": "https://packagist.org/downloads/", 338 | "license": [ 339 | "BSD-3-Clause" 340 | ], 341 | "authors": [ 342 | { 343 | "name": "Chris Boulton", 344 | "homepage": "http://github.com/chrisboulton", 345 | "role": "Original developer" 346 | } 347 | ], 348 | "description": "A comprehensive library for generating differences between two hashable objects (strings or arrays).", 349 | "time": "2013-11-01 13:02:21" 350 | }, 351 | { 352 | "name": "phpspec/phpspec", 353 | "version": "2.2.1", 354 | "source": { 355 | "type": "git", 356 | "url": "https://github.com/phpspec/phpspec.git", 357 | "reference": "e9a40577323e67f1de2e214abf32976a0352d8f8" 358 | }, 359 | "dist": { 360 | "type": "zip", 361 | "url": "https://api.github.com/repos/phpspec/phpspec/zipball/e9a40577323e67f1de2e214abf32976a0352d8f8", 362 | "reference": "e9a40577323e67f1de2e214abf32976a0352d8f8", 363 | "shasum": "" 364 | }, 365 | "require": { 366 | "doctrine/instantiator": "^1.0.1", 367 | "php": ">=5.3.3", 368 | "phpspec/php-diff": "~1.0.0", 369 | "phpspec/prophecy": "~1.4", 370 | "sebastian/exporter": "~1.0", 371 | "symfony/console": "~2.3", 372 | "symfony/event-dispatcher": "~2.1", 373 | "symfony/finder": "~2.1", 374 | "symfony/process": "~2.1", 375 | "symfony/yaml": "~2.1" 376 | }, 377 | "require-dev": { 378 | "behat/behat": "^3.0.11", 379 | "bossa/phpspec2-expect": "~1.0", 380 | "phpunit/phpunit": "~4.4", 381 | "symfony/filesystem": "~2.1", 382 | "symfony/process": "~2.1" 383 | }, 384 | "suggest": { 385 | "phpspec/nyan-formatters": "~1.0 – Adds Nyan formatters" 386 | }, 387 | "bin": [ 388 | "bin/phpspec" 389 | ], 390 | "type": "library", 391 | "extra": { 392 | "branch-alias": { 393 | "dev-master": "2.2.x-dev" 394 | } 395 | }, 396 | "autoload": { 397 | "psr-0": { 398 | "PhpSpec": "src/" 399 | } 400 | }, 401 | "notification-url": "https://packagist.org/downloads/", 402 | "license": [ 403 | "MIT" 404 | ], 405 | "authors": [ 406 | { 407 | "name": "Konstantin Kudryashov", 408 | "email": "ever.zet@gmail.com", 409 | "homepage": "http://everzet.com" 410 | }, 411 | { 412 | "name": "Marcello Duarte", 413 | "homepage": "http://marcelloduarte.net/" 414 | } 415 | ], 416 | "description": "Specification-oriented BDD framework for PHP 5.3+", 417 | "homepage": "http://phpspec.net/", 418 | "keywords": [ 419 | "BDD", 420 | "SpecBDD", 421 | "TDD", 422 | "spec", 423 | "specification", 424 | "testing", 425 | "tests" 426 | ], 427 | "time": "2015-05-30 15:21:40" 428 | }, 429 | { 430 | "name": "phpspec/prophecy", 431 | "version": "v1.4.1", 432 | "source": { 433 | "type": "git", 434 | "url": "https://github.com/phpspec/prophecy.git", 435 | "reference": "3132b1f44c7bf2ec4c7eb2d3cb78fdeca760d373" 436 | }, 437 | "dist": { 438 | "type": "zip", 439 | "url": "https://api.github.com/repos/phpspec/prophecy/zipball/3132b1f44c7bf2ec4c7eb2d3cb78fdeca760d373", 440 | "reference": "3132b1f44c7bf2ec4c7eb2d3cb78fdeca760d373", 441 | "shasum": "" 442 | }, 443 | "require": { 444 | "doctrine/instantiator": "^1.0.2", 445 | "phpdocumentor/reflection-docblock": "~2.0", 446 | "sebastian/comparator": "~1.1" 447 | }, 448 | "require-dev": { 449 | "phpspec/phpspec": "~2.0" 450 | }, 451 | "type": "library", 452 | "extra": { 453 | "branch-alias": { 454 | "dev-master": "1.4.x-dev" 455 | } 456 | }, 457 | "autoload": { 458 | "psr-0": { 459 | "Prophecy\\": "src/" 460 | } 461 | }, 462 | "notification-url": "https://packagist.org/downloads/", 463 | "license": [ 464 | "MIT" 465 | ], 466 | "authors": [ 467 | { 468 | "name": "Konstantin Kudryashov", 469 | "email": "ever.zet@gmail.com", 470 | "homepage": "http://everzet.com" 471 | }, 472 | { 473 | "name": "Marcello Duarte", 474 | "email": "marcello.duarte@gmail.com" 475 | } 476 | ], 477 | "description": "Highly opinionated mocking framework for PHP 5.3+", 478 | "homepage": "https://github.com/phpspec/prophecy", 479 | "keywords": [ 480 | "Double", 481 | "Dummy", 482 | "fake", 483 | "mock", 484 | "spy", 485 | "stub" 486 | ], 487 | "time": "2015-04-27 22:15:08" 488 | }, 489 | { 490 | "name": "sebastian/comparator", 491 | "version": "1.1.1", 492 | "source": { 493 | "type": "git", 494 | "url": "https://github.com/sebastianbergmann/comparator.git", 495 | "reference": "1dd8869519a225f7f2b9eb663e225298fade819e" 496 | }, 497 | "dist": { 498 | "type": "zip", 499 | "url": "https://api.github.com/repos/sebastianbergmann/comparator/zipball/1dd8869519a225f7f2b9eb663e225298fade819e", 500 | "reference": "1dd8869519a225f7f2b9eb663e225298fade819e", 501 | "shasum": "" 502 | }, 503 | "require": { 504 | "php": ">=5.3.3", 505 | "sebastian/diff": "~1.2", 506 | "sebastian/exporter": "~1.2" 507 | }, 508 | "require-dev": { 509 | "phpunit/phpunit": "~4.4" 510 | }, 511 | "type": "library", 512 | "extra": { 513 | "branch-alias": { 514 | "dev-master": "1.1.x-dev" 515 | } 516 | }, 517 | "autoload": { 518 | "classmap": [ 519 | "src/" 520 | ] 521 | }, 522 | "notification-url": "https://packagist.org/downloads/", 523 | "license": [ 524 | "BSD-3-Clause" 525 | ], 526 | "authors": [ 527 | { 528 | "name": "Jeff Welch", 529 | "email": "whatthejeff@gmail.com" 530 | }, 531 | { 532 | "name": "Volker Dusch", 533 | "email": "github@wallbash.com" 534 | }, 535 | { 536 | "name": "Bernhard Schussek", 537 | "email": "bschussek@2bepublished.at" 538 | }, 539 | { 540 | "name": "Sebastian Bergmann", 541 | "email": "sebastian@phpunit.de" 542 | } 543 | ], 544 | "description": "Provides the functionality to compare PHP values for equality", 545 | "homepage": "http://www.github.com/sebastianbergmann/comparator", 546 | "keywords": [ 547 | "comparator", 548 | "compare", 549 | "equality" 550 | ], 551 | "time": "2015-01-29 16:28:08" 552 | }, 553 | { 554 | "name": "sebastian/diff", 555 | "version": "1.3.0", 556 | "source": { 557 | "type": "git", 558 | "url": "https://github.com/sebastianbergmann/diff.git", 559 | "reference": "863df9687835c62aa423a22412d26fa2ebde3fd3" 560 | }, 561 | "dist": { 562 | "type": "zip", 563 | "url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/863df9687835c62aa423a22412d26fa2ebde3fd3", 564 | "reference": "863df9687835c62aa423a22412d26fa2ebde3fd3", 565 | "shasum": "" 566 | }, 567 | "require": { 568 | "php": ">=5.3.3" 569 | }, 570 | "require-dev": { 571 | "phpunit/phpunit": "~4.2" 572 | }, 573 | "type": "library", 574 | "extra": { 575 | "branch-alias": { 576 | "dev-master": "1.3-dev" 577 | } 578 | }, 579 | "autoload": { 580 | "classmap": [ 581 | "src/" 582 | ] 583 | }, 584 | "notification-url": "https://packagist.org/downloads/", 585 | "license": [ 586 | "BSD-3-Clause" 587 | ], 588 | "authors": [ 589 | { 590 | "name": "Kore Nordmann", 591 | "email": "mail@kore-nordmann.de" 592 | }, 593 | { 594 | "name": "Sebastian Bergmann", 595 | "email": "sebastian@phpunit.de" 596 | } 597 | ], 598 | "description": "Diff implementation", 599 | "homepage": "http://www.github.com/sebastianbergmann/diff", 600 | "keywords": [ 601 | "diff" 602 | ], 603 | "time": "2015-02-22 15:13:53" 604 | }, 605 | { 606 | "name": "sebastian/exporter", 607 | "version": "1.2.0", 608 | "source": { 609 | "type": "git", 610 | "url": "https://github.com/sebastianbergmann/exporter.git", 611 | "reference": "84839970d05254c73cde183a721c7af13aede943" 612 | }, 613 | "dist": { 614 | "type": "zip", 615 | "url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/84839970d05254c73cde183a721c7af13aede943", 616 | "reference": "84839970d05254c73cde183a721c7af13aede943", 617 | "shasum": "" 618 | }, 619 | "require": { 620 | "php": ">=5.3.3", 621 | "sebastian/recursion-context": "~1.0" 622 | }, 623 | "require-dev": { 624 | "phpunit/phpunit": "~4.4" 625 | }, 626 | "type": "library", 627 | "extra": { 628 | "branch-alias": { 629 | "dev-master": "1.2.x-dev" 630 | } 631 | }, 632 | "autoload": { 633 | "classmap": [ 634 | "src/" 635 | ] 636 | }, 637 | "notification-url": "https://packagist.org/downloads/", 638 | "license": [ 639 | "BSD-3-Clause" 640 | ], 641 | "authors": [ 642 | { 643 | "name": "Jeff Welch", 644 | "email": "whatthejeff@gmail.com" 645 | }, 646 | { 647 | "name": "Volker Dusch", 648 | "email": "github@wallbash.com" 649 | }, 650 | { 651 | "name": "Bernhard Schussek", 652 | "email": "bschussek@2bepublished.at" 653 | }, 654 | { 655 | "name": "Sebastian Bergmann", 656 | "email": "sebastian@phpunit.de" 657 | }, 658 | { 659 | "name": "Adam Harvey", 660 | "email": "aharvey@php.net" 661 | } 662 | ], 663 | "description": "Provides the functionality to export PHP variables for visualization", 664 | "homepage": "http://www.github.com/sebastianbergmann/exporter", 665 | "keywords": [ 666 | "export", 667 | "exporter" 668 | ], 669 | "time": "2015-01-27 07:23:06" 670 | }, 671 | { 672 | "name": "sebastian/recursion-context", 673 | "version": "1.0.0", 674 | "source": { 675 | "type": "git", 676 | "url": "https://github.com/sebastianbergmann/recursion-context.git", 677 | "reference": "3989662bbb30a29d20d9faa04a846af79b276252" 678 | }, 679 | "dist": { 680 | "type": "zip", 681 | "url": "https://api.github.com/repos/sebastianbergmann/recursion-context/zipball/3989662bbb30a29d20d9faa04a846af79b276252", 682 | "reference": "3989662bbb30a29d20d9faa04a846af79b276252", 683 | "shasum": "" 684 | }, 685 | "require": { 686 | "php": ">=5.3.3" 687 | }, 688 | "require-dev": { 689 | "phpunit/phpunit": "~4.4" 690 | }, 691 | "type": "library", 692 | "extra": { 693 | "branch-alias": { 694 | "dev-master": "1.0.x-dev" 695 | } 696 | }, 697 | "autoload": { 698 | "classmap": [ 699 | "src/" 700 | ] 701 | }, 702 | "notification-url": "https://packagist.org/downloads/", 703 | "license": [ 704 | "BSD-3-Clause" 705 | ], 706 | "authors": [ 707 | { 708 | "name": "Jeff Welch", 709 | "email": "whatthejeff@gmail.com" 710 | }, 711 | { 712 | "name": "Sebastian Bergmann", 713 | "email": "sebastian@phpunit.de" 714 | }, 715 | { 716 | "name": "Adam Harvey", 717 | "email": "aharvey@php.net" 718 | } 719 | ], 720 | "description": "Provides functionality to recursively process PHP variables", 721 | "homepage": "http://www.github.com/sebastianbergmann/recursion-context", 722 | "time": "2015-01-24 09:48:32" 723 | }, 724 | { 725 | "name": "symfony/console", 726 | "version": "v2.7.0", 727 | "source": { 728 | "type": "git", 729 | "url": "https://github.com/symfony/Console.git", 730 | "reference": "7f0bec04961c61c961df0cb8c2ae88dbfd83f399" 731 | }, 732 | "dist": { 733 | "type": "zip", 734 | "url": "https://api.github.com/repos/symfony/Console/zipball/7f0bec04961c61c961df0cb8c2ae88dbfd83f399", 735 | "reference": "7f0bec04961c61c961df0cb8c2ae88dbfd83f399", 736 | "shasum": "" 737 | }, 738 | "require": { 739 | "php": ">=5.3.9" 740 | }, 741 | "require-dev": { 742 | "psr/log": "~1.0", 743 | "symfony/event-dispatcher": "~2.1", 744 | "symfony/phpunit-bridge": "~2.7", 745 | "symfony/process": "~2.1" 746 | }, 747 | "suggest": { 748 | "psr/log": "For using the console logger", 749 | "symfony/event-dispatcher": "", 750 | "symfony/process": "" 751 | }, 752 | "type": "library", 753 | "extra": { 754 | "branch-alias": { 755 | "dev-master": "2.7-dev" 756 | } 757 | }, 758 | "autoload": { 759 | "psr-4": { 760 | "Symfony\\Component\\Console\\": "" 761 | } 762 | }, 763 | "notification-url": "https://packagist.org/downloads/", 764 | "license": [ 765 | "MIT" 766 | ], 767 | "authors": [ 768 | { 769 | "name": "Fabien Potencier", 770 | "email": "fabien@symfony.com" 771 | }, 772 | { 773 | "name": "Symfony Community", 774 | "homepage": "https://symfony.com/contributors" 775 | } 776 | ], 777 | "description": "Symfony Console Component", 778 | "homepage": "https://symfony.com", 779 | "time": "2015-05-29 16:22:24" 780 | }, 781 | { 782 | "name": "symfony/event-dispatcher", 783 | "version": "v2.7.0", 784 | "source": { 785 | "type": "git", 786 | "url": "https://github.com/symfony/EventDispatcher.git", 787 | "reference": "687039686d0e923429ba6e958d0baa920cd5d458" 788 | }, 789 | "dist": { 790 | "type": "zip", 791 | "url": "https://api.github.com/repos/symfony/EventDispatcher/zipball/687039686d0e923429ba6e958d0baa920cd5d458", 792 | "reference": "687039686d0e923429ba6e958d0baa920cd5d458", 793 | "shasum": "" 794 | }, 795 | "require": { 796 | "php": ">=5.3.9" 797 | }, 798 | "require-dev": { 799 | "psr/log": "~1.0", 800 | "symfony/config": "~2.0,>=2.0.5", 801 | "symfony/dependency-injection": "~2.6", 802 | "symfony/expression-language": "~2.6", 803 | "symfony/phpunit-bridge": "~2.7", 804 | "symfony/stopwatch": "~2.3" 805 | }, 806 | "suggest": { 807 | "symfony/dependency-injection": "", 808 | "symfony/http-kernel": "" 809 | }, 810 | "type": "library", 811 | "extra": { 812 | "branch-alias": { 813 | "dev-master": "2.7-dev" 814 | } 815 | }, 816 | "autoload": { 817 | "psr-4": { 818 | "Symfony\\Component\\EventDispatcher\\": "" 819 | } 820 | }, 821 | "notification-url": "https://packagist.org/downloads/", 822 | "license": [ 823 | "MIT" 824 | ], 825 | "authors": [ 826 | { 827 | "name": "Fabien Potencier", 828 | "email": "fabien@symfony.com" 829 | }, 830 | { 831 | "name": "Symfony Community", 832 | "homepage": "https://symfony.com/contributors" 833 | } 834 | ], 835 | "description": "Symfony EventDispatcher Component", 836 | "homepage": "https://symfony.com", 837 | "time": "2015-05-02 15:21:08" 838 | }, 839 | { 840 | "name": "symfony/finder", 841 | "version": "v2.7.0", 842 | "source": { 843 | "type": "git", 844 | "url": "https://github.com/symfony/Finder.git", 845 | "reference": "ccb8ed8339cf24824f2ef35dacec30d92ff44368" 846 | }, 847 | "dist": { 848 | "type": "zip", 849 | "url": "https://api.github.com/repos/symfony/Finder/zipball/ccb8ed8339cf24824f2ef35dacec30d92ff44368", 850 | "reference": "ccb8ed8339cf24824f2ef35dacec30d92ff44368", 851 | "shasum": "" 852 | }, 853 | "require": { 854 | "php": ">=5.3.9" 855 | }, 856 | "require-dev": { 857 | "symfony/phpunit-bridge": "~2.7" 858 | }, 859 | "type": "library", 860 | "extra": { 861 | "branch-alias": { 862 | "dev-master": "2.7-dev" 863 | } 864 | }, 865 | "autoload": { 866 | "psr-4": { 867 | "Symfony\\Component\\Finder\\": "" 868 | } 869 | }, 870 | "notification-url": "https://packagist.org/downloads/", 871 | "license": [ 872 | "MIT" 873 | ], 874 | "authors": [ 875 | { 876 | "name": "Fabien Potencier", 877 | "email": "fabien@symfony.com" 878 | }, 879 | { 880 | "name": "Symfony Community", 881 | "homepage": "https://symfony.com/contributors" 882 | } 883 | ], 884 | "description": "Symfony Finder Component", 885 | "homepage": "https://symfony.com", 886 | "time": "2015-05-15 14:02:48" 887 | }, 888 | { 889 | "name": "symfony/process", 890 | "version": "v2.7.0", 891 | "source": { 892 | "type": "git", 893 | "url": "https://github.com/symfony/Process.git", 894 | "reference": "e0a82b58e36afc60f8e79b8bc85a22bb064077c1" 895 | }, 896 | "dist": { 897 | "type": "zip", 898 | "url": "https://api.github.com/repos/symfony/Process/zipball/e0a82b58e36afc60f8e79b8bc85a22bb064077c1", 899 | "reference": "e0a82b58e36afc60f8e79b8bc85a22bb064077c1", 900 | "shasum": "" 901 | }, 902 | "require": { 903 | "php": ">=5.3.9" 904 | }, 905 | "require-dev": { 906 | "symfony/phpunit-bridge": "~2.7" 907 | }, 908 | "type": "library", 909 | "extra": { 910 | "branch-alias": { 911 | "dev-master": "2.7-dev" 912 | } 913 | }, 914 | "autoload": { 915 | "psr-4": { 916 | "Symfony\\Component\\Process\\": "" 917 | } 918 | }, 919 | "notification-url": "https://packagist.org/downloads/", 920 | "license": [ 921 | "MIT" 922 | ], 923 | "authors": [ 924 | { 925 | "name": "Fabien Potencier", 926 | "email": "fabien@symfony.com" 927 | }, 928 | { 929 | "name": "Symfony Community", 930 | "homepage": "https://symfony.com/contributors" 931 | } 932 | ], 933 | "description": "Symfony Process Component", 934 | "homepage": "https://symfony.com", 935 | "time": "2015-05-15 13:33:16" 936 | }, 937 | { 938 | "name": "symfony/yaml", 939 | "version": "v2.7.0", 940 | "source": { 941 | "type": "git", 942 | "url": "https://github.com/symfony/Yaml.git", 943 | "reference": "4a29a5248aed4fb45f626a7bbbd330291492f5c3" 944 | }, 945 | "dist": { 946 | "type": "zip", 947 | "url": "https://api.github.com/repos/symfony/Yaml/zipball/4a29a5248aed4fb45f626a7bbbd330291492f5c3", 948 | "reference": "4a29a5248aed4fb45f626a7bbbd330291492f5c3", 949 | "shasum": "" 950 | }, 951 | "require": { 952 | "php": ">=5.3.9" 953 | }, 954 | "require-dev": { 955 | "symfony/phpunit-bridge": "~2.7" 956 | }, 957 | "type": "library", 958 | "extra": { 959 | "branch-alias": { 960 | "dev-master": "2.7-dev" 961 | } 962 | }, 963 | "autoload": { 964 | "psr-4": { 965 | "Symfony\\Component\\Yaml\\": "" 966 | } 967 | }, 968 | "notification-url": "https://packagist.org/downloads/", 969 | "license": [ 970 | "MIT" 971 | ], 972 | "authors": [ 973 | { 974 | "name": "Fabien Potencier", 975 | "email": "fabien@symfony.com" 976 | }, 977 | { 978 | "name": "Symfony Community", 979 | "homepage": "https://symfony.com/contributors" 980 | } 981 | ], 982 | "description": "Symfony Yaml Component", 983 | "homepage": "https://symfony.com", 984 | "time": "2015-05-02 15:21:08" 985 | } 986 | ], 987 | "aliases": [], 988 | "minimum-stability": "stable", 989 | "stability-flags": [], 990 | "prefer-stable": false, 991 | "prefer-lowest": false, 992 | "platform": { 993 | "php": "~5.4" 994 | }, 995 | "platform-dev": [] 996 | } 997 | -------------------------------------------------------------------------------- /examples/callback.php: -------------------------------------------------------------------------------- 1 | crawl(function ($url, FutureResponse $response) { 9 | printf('(%d) %s%s', $response->getStatusCode(), $url, PHP_EOL); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/links.php: -------------------------------------------------------------------------------- 1 | crawl()); 6 | -------------------------------------------------------------------------------- /spec/Centipede/Checker/HostCheckerSpec.php: -------------------------------------------------------------------------------- 1 | beConstructedWith('github.com'); 13 | } 14 | 15 | function it_is_initializable() 16 | { 17 | $this->shouldHaveType('Centipede\Checker\HostCheker'); 18 | } 19 | 20 | function it_is_centipede_decider() 21 | { 22 | $this->shouldImplement('Centipede\Checker\CheckerInterface'); 23 | } 24 | 25 | function it_decides_to_crawl_internal_urls() 26 | { 27 | $this->isCrawlable('http://github.com/umpirsky')->shouldReturn(true); 28 | } 29 | 30 | function it_decides_not_to_crawl_external_urls() 31 | { 32 | $this->isCrawlable('http://umpirsky.com/github')->shouldReturn(false); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spec/Centipede/CrawlerSpec.php: -------------------------------------------------------------------------------- 1 | beConstructedWith('https://github.com'); 13 | } 14 | 15 | function it_is_initializable() 16 | { 17 | $this->shouldHaveType('Centipede\Crawler'); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /spec/Centipede/Extractor/UrlExtractorSpec.php: -------------------------------------------------------------------------------- 1 | shouldHaveType('Centipede\Extractor\UrlExtractor'); 13 | } 14 | 15 | function it_is_centipede_extractor() 16 | { 17 | $this->shouldImplement('Centipede\Extractor\ExtractorInterface'); 18 | } 19 | 20 | function it_extracts_urls() 21 | { 22 | $this 23 | ->extract('') 24 | ->shouldReturn(['https://github.com', 'http://umpirsky.com']) 25 | ; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spec/Centipede/Filter/UrlFilterSpec.php: -------------------------------------------------------------------------------- 1 | beConstructedWith('http://github.com'); 13 | } 14 | 15 | function it_is_initializable() 16 | { 17 | $this->shouldHaveType('Centipede\Filter\UrlFilter'); 18 | } 19 | 20 | function it_is_centipede_filter() 21 | { 22 | $this->shouldImplement('Centipede\Filter\FilterInterface'); 23 | } 24 | 25 | function it_filters_url_hash() 26 | { 27 | $this->filter('https://github.com#hash')->shouldReturn('https://github.com'); 28 | } 29 | 30 | function it_filters_trailing_slash() 31 | { 32 | $this->filter('https://github.com/')->shouldReturn('https://github.com'); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/Centipede/Checker/CheckerInterface.php: -------------------------------------------------------------------------------- 1 | host = $host; 12 | } 13 | 14 | public function isCrawlable($url) 15 | { 16 | if (empty($url) || preg_match('/^tel:.*/i', $url)) { 17 | return false; 18 | } 19 | 20 | $host = parse_url($url, PHP_URL_HOST); 21 | if (null === $host) { 22 | return true; 23 | } 24 | 25 | return $host === $this->host; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/Centipede/Crawler.php: -------------------------------------------------------------------------------- 1 | baseUrl = $baseUrl; 28 | $this->depth = $depth; 29 | 30 | $this->client = new Client(); 31 | $this->filter = new UrlFilter($baseUrl); 32 | $this->checker = new HostChecker(parse_url($baseUrl, PHP_URL_HOST)); 33 | $this->extractor = new UrlExtractor(); 34 | } 35 | 36 | public function crawl(callable $callable = null) 37 | { 38 | $urls = [$this->baseUrl]; 39 | 40 | $response = $this->client->get($this->baseUrl, ['future' => true]); 41 | 42 | $this->doCrawl( 43 | $this->baseUrl, 44 | $response, 45 | $this->depth, 46 | $callable, 47 | $urls 48 | ); 49 | 50 | $response->wait(); 51 | 52 | return $urls; 53 | } 54 | 55 | public function setClient(ClientInterface $client) 56 | { 57 | $this->client = $client; 58 | 59 | return $this; 60 | } 61 | 62 | public function getClient() 63 | { 64 | return $this->client; 65 | } 66 | 67 | public function setFilter(FilterInterface $filter) 68 | { 69 | $this->filter = $filter; 70 | 71 | return $this; 72 | } 73 | 74 | public function setExtractor(ExtractorInterface $extractor) 75 | { 76 | $this->extractor = $extractor; 77 | 78 | return $this; 79 | } 80 | 81 | public function setChecker(CheckerInterface $checker) 82 | { 83 | $this->checker = $checker; 84 | 85 | return $this; 86 | } 87 | 88 | private function doCrawl($url, FutureResponse $response, $depth, callable $callable = null, array &$urls = []) 89 | { 90 | if (null !== $callable) { 91 | $callable($url, $response); 92 | } 93 | 94 | if (0 === $depth) { 95 | return; 96 | } 97 | 98 | $response->then(function (Response $response) use ($url, $depth, $callable, &$urls) { 99 | $hrefs = $this->extractor->extract( 100 | $response->getBody()->getContents() 101 | ); 102 | 103 | foreach ($hrefs as $href) { 104 | $href = $this->filter->filter($href); 105 | 106 | if (!in_array($href, $urls) && $this->checker->isCrawlable($href)) { 107 | $this->doCrawl( 108 | $href, 109 | $this->client->get($href, ['future' => true]), 110 | $depth - 1, 111 | $callable, 112 | $urls 113 | ); 114 | 115 | $urls[] = $href; 116 | } 117 | } 118 | })->done(); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/Centipede/Extractor/ExtractorInterface.php: -------------------------------------------------------------------------------- 1 | loadHTML($value); 15 | libxml_use_internal_errors(false); 16 | 17 | foreach ($document->getElementsByTagName('a') as $node) { 18 | $urls[] = $node->getAttribute('href'); 19 | } 20 | 21 | return $urls; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/Centipede/Filter/FilterInterface.php: -------------------------------------------------------------------------------- 1 | baseUrl = rtrim($baseUrl, '/'); 12 | } 13 | 14 | public function filter($value) 15 | { 16 | $value = rtrim($value, '/'); 17 | 18 | if (false !== $position = strpos($value, '#')) { 19 | $value = substr($value, 0, $position); 20 | } 21 | 22 | if (null !== parse_url($value, PHP_URL_SCHEME) || preg_match('/^\/\/.*/', $value)) { 23 | return $value; 24 | } 25 | 26 | return $this->baseUrl.$value; 27 | } 28 | } 29 | --------------------------------------------------------------------------------