├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── commitlint.yml │ ├── fix-style.yml │ ├── release-please.yml │ └── run-tests.yml ├── .gitignore ├── .php-cs-fixer.php ├── .phpactor.json ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── README.md ├── composer.json ├── composer.lock ├── phpstan-baseline.neon ├── phpstan.neon ├── phpunit.xml ├── psalm.xml ├── roach ├── src ├── Core │ ├── DefaultContainer.php │ ├── Engine.php │ ├── EngineInterface.php │ ├── FakeRunner.php │ ├── Run.php │ ├── RunFactory.php │ ├── Runner.php │ ├── RunnerInterface.php │ └── Version.php ├── Downloader │ ├── Downloader.php │ ├── DownloaderMiddlewareInterface.php │ ├── Middleware │ │ ├── CookieMiddleware.php │ │ ├── DownloaderMiddlewareAdapter.php │ │ ├── ExecuteJavascriptMiddleware.php │ │ ├── FakeMiddleware.php │ │ ├── HttpErrorMiddleware.php │ │ ├── ProxyMiddleware.php │ │ ├── RequestDeduplicationMiddleware.php │ │ ├── RequestMiddlewareInterface.php │ │ ├── ResponseMiddlewareInterface.php │ │ ├── RobotsTxtMiddleware.php │ │ └── UserAgentMiddleware.php │ └── Proxy │ │ ├── ArrayConfigurationLoader.php │ │ ├── ConfigurationLoaderInterface.php │ │ ├── Proxy.php │ │ └── ProxyOptions.php ├── Events │ ├── FakeDispatcher.php │ ├── ItemDropped.php │ ├── ItemScraped.php │ ├── RequestDropped.php │ ├── RequestScheduling.php │ ├── RequestSending.php │ ├── ResponseDropped.php │ ├── ResponseReceived.php │ ├── ResponseReceiving.php │ ├── RunFinished.php │ └── RunStarting.php ├── Extensions │ ├── ExtensionInterface.php │ ├── LoggerExtension.php │ ├── MaxRequestExtension.php │ ├── ScrapedItemCollectorExtension.php │ └── StatsCollectorExtension.php ├── Http │ ├── Client.php │ ├── ClientInterface.php │ ├── FakeClient.php │ ├── MalformedUriException.php │ ├── Query.php │ ├── QueryParameterTypeMismatchException.php │ ├── Request.php │ ├── RequestException.php │ ├── Response.php │ 
├── URL.php │ └── UnknownQueryParameterException.php ├── ItemPipeline │ ├── AbstractItem.php │ ├── Item.php │ ├── ItemInterface.php │ ├── ItemPipeline.php │ ├── ItemPipelineInterface.php │ └── Processors │ │ ├── ConditionalItemProcessor.php │ │ ├── CustomItemProcessor.php │ │ ├── FakeProcessor.php │ │ └── ItemProcessorInterface.php ├── Roach.php ├── Scheduling │ ├── ArrayRequestScheduler.php │ ├── RequestSchedulerInterface.php │ └── Timing │ │ ├── ClockInterface.php │ │ ├── FakeClock.php │ │ └── SystemClock.php ├── Shell │ ├── Commands │ │ ├── FetchCommand.php │ │ └── RunSpiderCommand.php │ ├── InvalidSpiderException.php │ ├── Repl.php │ ├── Resolver │ │ ├── DefaultNamespaceResolverDecorator.php │ │ ├── FakeNamespaceResolver.php │ │ ├── NamespaceResolverInterface.php │ │ └── StaticNamespaceResolver.php │ └── ShellCaster.php ├── Spider │ ├── AbstractSpider.php │ ├── BasicSpider.php │ ├── Configuration │ │ ├── ArrayLoader.php │ │ ├── Configuration.php │ │ └── Overrides.php │ ├── ConfigurationLoaderStrategy.php │ ├── Middleware │ │ ├── FakeHandler.php │ │ ├── ItemMiddlewareInterface.php │ │ ├── MaximumCrawlDepthMiddleware.php │ │ ├── RequestMiddlewareInterface.php │ │ ├── ResponseMiddlewareInterface.php │ │ └── SpiderMiddlewareAdapter.php │ ├── ParseResult.php │ ├── Processor.php │ ├── SpiderInterface.php │ └── SpiderMiddlewareInterface.php ├── Support │ ├── Configurable.php │ ├── ConfigurableInterface.php │ ├── Droppable.php │ ├── DroppableInterface.php │ └── HasMetaData.php └── Testing │ ├── Concerns │ └── InteractsWithRequestsAndResponses.php │ └── FakeLogger.php └── tests ├── Core ├── EngineTest.php └── RunFactoryTest.php ├── Downloader ├── DownloaderMiddlewareAdapterTest.php ├── DownloaderTest.php ├── Middleware │ ├── CookieMiddlewareTest.php │ ├── ExecuteJavascriptMiddlewareTest.php │ ├── FakeMiddlewareTest.php │ ├── HttpErrorMiddlewareTest.php │ ├── ProxyMiddlewareTest.php │ ├── RequestDeduplicationMiddlewareTest.php │ ├── RobotsTxtMiddlewareTest.php │ └── 
UserAgentMiddlewareTest.php └── Proxy │ ├── ArrayConfigurationLoaderTest.php │ ├── ProxyOptionsTest.php │ └── ProxyTest.php ├── Events └── FakeDispatcherTest.php ├── Extensions ├── ExtensionTestCase.php ├── LoggerExtensionTest.php ├── MaxRequestExtensionTest.php ├── ScrapedItemCollectorExtensionTest.php └── StatsCollectorExtensionTest.php ├── Fixtures ├── Extension.php ├── ItemProcessor.php ├── ItemSpiderMiddleware.php ├── RequestDownloaderMiddleware.php ├── RequestSpiderMiddleware.php ├── ResponseDownloaderMiddleware.php ├── ResponseSpiderMiddleware.php ├── TestCustomItemProcessor.php ├── TestItem.php ├── TestItem2.php ├── TestSpider.php └── TestSpider2.php ├── Http ├── ClientTest.php ├── FakeClientTest.php ├── QueryTest.php ├── RequestTest.php ├── ResponseTest.php └── URLTest.php ├── IntegrationTestCase.php ├── ItemPipeline ├── AbstractItemTest.php ├── CustomItemProcessorTest.php ├── ItemPipelineTest.php └── ItemTest.php ├── RoachTest.php ├── Scheduling ├── ArrayRequestSchedulerTest.php └── Timing │ └── FakeClockTest.php ├── Server ├── index.php └── tmp │ └── .gitignore ├── Shell ├── Commands │ └── RunSpiderCommandTest.php └── Resolver │ ├── DefaultNamespaceResolverDecoratorTest.php │ ├── FakeNamespaceResolverTest.php │ └── StaticNamespaceResolverTest.php ├── Spider ├── Configuration │ ├── ArrayLoaderTest.php │ └── ConfigurationTest.php ├── Middleware │ ├── FakeHandlerTest.php │ ├── FakeProcessorTest.php │ ├── MaximumCrawlDepthMiddlewareTest.php │ └── SpiderMiddlewareAdapterTest.php ├── ParseResultTest.php ├── ProcessorTest.php └── SpiderTestCase.php ├── Support └── DroppableTestCase.php └── Testing ├── FakeLoggerTest.php └── FakeRunnerTest.php /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @ksassnowski -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are 
supported funding model platforms 2 | 3 | github: ksassnowski 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help improve Roach 4 | title: '' 5 | labels: bug 6 | assignees: ksassnowski 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 11 | 12 | **Reproduction** 13 | Please include a link to a minimal repository that reproduces the issue. 14 | 15 | **Expected behavior** 16 | If applicable, a clear and concise description of what you expected to happen. 17 | 18 | **Package versions (please complete the following information):** 19 | 20 | - core: [e.g. 1.0.0] 21 | 22 | **Additional context** 23 | Add any other context about the problem here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation 3 | about: Report an issue or suggest improvements for documentation 4 | title: '' 5 | labels: documentation 6 | assignees: ksassnowski 7 | --- 8 | 9 | **Description** 10 | A clear and concise description of what the issue is. 11 | How can it be improved? -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for Roach 4 | title: '' 5 | labels: enhancement 6 | assignees: ksassnowski 7 | --- 8 | 9 | **Description** 10 | A clear and concise description of why the feature is needed. 11 | What problem does it aim to fix? 12 | What benefits does it bring?
13 | 14 | **Proposed solution** 15 | A clear and concise description of how the feature would work. 16 | If applicable, provide an example of the API and how it would be used. 17 | 18 | **Considered alternatives** 19 | A clear and concise description of any alternative solutions or features you've considered. 20 | 21 | **Additional context** 22 | Add any other context or screenshots about the feature request here. -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "composer" 4 | directory: "/" 5 | commit-message: 6 | prefix: "chore" 7 | include: "scope" 8 | schedule: 9 | interval: "weekly" 10 | -------------------------------------------------------------------------------- /.github/workflows/commitlint.yml: -------------------------------------------------------------------------------- 1 | name: Lint Commit Messages 2 | on: [pull_request] 3 | 4 | jobs: 5 | commitlint: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | with: 10 | fetch-depth: 0 11 | - uses: wagoid/commitlint-github-action@v5 -------------------------------------------------------------------------------- /.github/workflows/fix-style.yml: -------------------------------------------------------------------------------- 1 | name: fix-style 2 | 3 | on: [push] 4 | 5 | jobs: 6 | cs-fix: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Get branch names 11 | id: branch-name 12 | uses: tj-actions/branch-names@v8 13 | 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | with: 17 | ref: ${{ github.head_ref }} 18 | 19 | - name: Setup PHP 20 | uses: shivammathur/setup-php@v2 21 | with: 22 | php-version: 8.3 23 | 24 | - name: Install dependencies 25 | run: composer install 26 | 27 | - name: Fix style 28 | run: ./vendor/bin/php-cs-fixer fix --allow-risky=yes --using-cache=no 29 | 30 | - 
name: Commit style fixes 31 | uses: stefanzweifel/git-auto-commit-action@v5 32 | with: 33 | commit_message: Apply php-cs-fixer changes 34 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: google-github-actions/release-please-action@v3 17 | with: 18 | release-type: php 19 | package-name: roach-php/core -------------------------------------------------------------------------------- /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | os: [ubuntu-latest] 12 | php: ['8.2', '8.3', '8.4'] 13 | dependency-version: [prefer-lowest, prefer-stable] 14 | 15 | name: P${{ matrix.php }} - ${{ matrix.dependency-version }} - ${{ matrix.os }} 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: Disable AppArmor 22 | run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns 23 | 24 | - name: Install Puppeteer 25 | run: npm install puppeteer 26 | 27 | - name: Setup PHP 28 | uses: shivammathur/setup-php@v2 29 | with: 30 | php-version: ${{ matrix.php }} 31 | extensions: dom, curl 32 | coverage: none 33 | 34 | - name: "Validate composer.json and composer.lock" 35 | run: "composer validate --strict" 36 | 37 | - name: "Determine composer cache directory" 38 | id: "determine-composer-cache-directory" 39 | run: echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT 40 | 41 | - name: "Cache dependencies installed with composer" 42 | 
uses: "actions/cache@v4" 43 | with: 44 | path: "${{ steps.determine-composer-cache-directory.outputs.dir }}" 45 | key: "php-${{ matrix.php }}-composer-${{ matrix.dependency-version }}-${{ hashFiles('composer.lock') }}" 46 | restore-keys: "php-${{ matrix.php }}-composer-${{ matrix.dependency-version }}-" 47 | 48 | - name: Install dependencies 49 | run: composer update --${{ matrix.dependency-version }} --no-interaction --prefer-dist 50 | 51 | - name: Run phpstan 52 | run: composer analyze 53 | 54 | - name: Start server 55 | run: (php -S localhost:8000 -t ./tests/Server &) || /bin/true 56 | 57 | - name: Wait for server bootup 58 | run: sleep 3 59 | 60 | - name: Execute tests 61 | run: vendor/bin/phpunit 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor/ 2 | /.build/ 3 | .phpunit.result.cache 4 | .phpunit.cache/ 5 | /coverage/ 6 | /package-lock.json 7 | /package.json 8 | /node_modules 9 | -------------------------------------------------------------------------------- /.php-cs-fixer.php: -------------------------------------------------------------------------------- 1 | withHeader($header) 16 | ->withRules(Config\Rules::fromArray([ 17 | 'php_unit_test_class_requires_covers' => false, 18 | 'class_attributes_separation' => [ 19 | 'elements' => [ 20 | 'const' => 'one', 21 | 'method' => 'one', 22 | 'property' => 'one', 23 | 'trait_import' => 'none', 24 | ], 25 | ], 26 | 'error_suppression' => [ 27 | 'noise_remaining_usages' => false, 28 | ], 29 | ])); 30 | 31 | $config = Config\Factory::fromRuleSet($ruleSet); 32 | 33 | $config->getFinder()->in(__DIR__); 34 | $config->setCacheFile(__DIR__ . 
'/.build/php-cs-fixer/.php-cs-fixer.cache'); 35 | 36 | return $config; -------------------------------------------------------------------------------- /.phpactor.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "/phpactor.schema.json", 3 | "language_server_phpstan.enabled": false, 4 | "language_server_php_cs_fixer.enabled": false 5 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Roach 2 | 3 | This is the Contribution Guide for Roach PHP. Please read this document 4 | carefully before opening an issue or a pull request. 5 | 6 | ## Code of Conduct 7 | 8 | Before contributing to the project, please read our 9 | [Code of Conduct](./CODE_OF_CONDUCT.md). 10 | 11 | ## Reporting a bug 12 | 13 | Before you submit an issue, please search [the issue tracker][issues]. An issue 14 | for your problem might already exist and the discussion might inform you of 15 | workarounds readily available. 16 | 17 | You can file new issues by [selecting an issue template][new-issue] and filling 18 | out the necessary information. 19 | 20 | ## Proposing a Change 21 | 22 | If you intend to change the public API or make any non-trivial changes to the 23 | implementation, make sure to [create an issue][new-feature] first. This will let 24 | us discuss a proposal before you put significant effort into it. 25 | 26 | If you're only fixing a bug or a typo, it's fine to submit a pull request right 27 | away without creating an issue, but make sure it contains a clear and concise 28 | description of the bug. 29 | 30 | ## Working on Issues 31 | 32 | Before you start working on an issue make sure that it has been accepted 33 | (indicated by an [`accepted`][label-accepted] label) and that no one has 34 | claimed it yet. Otherwise, you may duplicate other people's efforts. 
If somebody 35 | claims an issue but doesn't follow up for more than two weeks, it’s fine to take 36 | it over, but you should still leave a comment. You should also leave a comment 37 | on any issue you're working on, to let others know. 38 | 39 | ## Semantic Versioning 40 | 41 | Roach follows [semantic versioning][semver]. 42 | 43 | ## Making a Pull Request 44 | 45 | 1. Fork the roach-php/core repo. 46 | 2. In your forked repo, create a new branch for your changes: 47 | ```shell 48 | git checkout -b my-fix-branch main 49 | ``` 50 | 3. Update the code. **Make sure that all your changes are covered by tests.** 51 | 4. Commit your changes using a **descriptive commit message** that follows the 52 | [Angular Commit Message Conventions][commit-format]. 53 | ```shell 54 | git commit --all 55 | ``` 56 | 5. Push your branch to GitHub: 57 | ```shell 58 | git push origin my-fix-branch 59 | ``` 60 | 6. In GitHub, send a pull request to [the main branch][main]. 61 | 62 | ### Addressing review feedback 63 | 64 | 1. Make required updates to the code. 65 | 2. 
Create a fixup commit and push it to your GitHub repo: 66 | ```shell 67 | git commit --all --fixup HEAD 68 | git push 69 | ``` 70 | 71 | ## Attribution 72 | 73 | This Contribution Guide was adapted from the [Motion Canvas][motion-canvas] 74 | Contribution guide 75 | 76 | [semver]: https://semver.org/ 77 | [semantic-release]: https://semantic-release.gitbook.io/semantic-release/support/faq#can-i-set-the-initial-release-version-of-my-package-to-0.0.1 78 | [main]: https://github.com/roach-php/core/tree/main 79 | [issues]: https://github.com/roach-php/core/issues 80 | [new-issue]: https://github.com/roach-php/core/issues/new/choose 81 | [new-feature]: https://github.com/roach-php/core/issues/new?template=feature_request.md 82 | [commit-format]: https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit 83 | [motion-canvas]: https://github.com/motion-canvas/motion-canvas/blob/main/CONTRIBUTING.md 84 | [label-accepted]: https://github.com/roach-php/core/labels/accepted -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 🐴 Roach 3 |

4 | 5 |

6 | 7 | Latest Stable Version 8 | 9 | 10 | 11 | Total Downloads 12 | 13 | 14 | 15 | 16 | 17 |

18 | 19 |

A complete web scraping toolkit for PHP

20 | 21 | ## About 22 | 23 | Roach is a complete web scraping toolkit for PHP. It is heavily inspired by (read: a shameless clone of) the popular [Scrapy](https://docs.scrapy.org) 24 | package for Python. 25 | 26 | ## Installation 27 | 28 | Install the package via composer 29 | 30 | ```bash 31 | composer require roach-php/core 32 | ``` 33 | 34 | ## Documentation 35 | 36 | The full documentation can be found [here](https://roach-php.dev). 37 | 38 | ## Contributing 39 | 40 | Please read our [Contribution Guide][contribution-guide] before opening issues 41 | or pull requests. 42 | 43 | ## Credits 44 | 45 | - [Kai Sassnowski](https://github.com/ksassnowski) 46 | - [All contributors](https://github.com/roach-php/core/contributors) 47 | 48 | ## License 49 | 50 | MIT 51 | 52 | [contribution-guide]: https://github.com/roach-php/core/blob/main/CONTRIBUTING.md -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "roach-php/core", 3 | "description": "A complete web scraping toolkit for PHP", 4 | "license": "MIT", 5 | "type": "library", 6 | "authors": [ 7 | { 8 | "name": "Kai Sassnowski", 9 | "email": "me@kai-sassnowski.com" 10 | } 11 | ], 12 | "require": { 13 | "php": "~8.2.0 || ~8.3.0 || ~8.4.0", 14 | "guzzlehttp/guzzle": "^7.8.0", 15 | "jakeasmith/http_build_url": "^1.0.1", 16 | "league/container": "^4.2", 17 | "monolog/monolog": "^3.5", 18 | "nyholm/psr7": "^1.8.1", 19 | "nyholm/psr7-server": "^1.1", 20 | "psr/container": "^2.0.2", 21 | "psy/psysh": "^0.11.22 || ^0.12.0", 22 | "spatie/robots-txt": "^2.0.3", 23 | "symfony/console": "^7.0", 24 | "symfony/css-selector": "^7.0", 25 | "symfony/dom-crawler": "^7.0", 26 | "symfony/event-dispatcher": "^7.0", 27 | "symfony/options-resolver": "^7.0" 28 | }, 29 | "require-dev": { 30 | "ergebnis/composer-normalize": "^2.45", 31 | "ergebnis/php-cs-fixer-config": "^6.45.0", 32 |
"http-interop/http-factory-guzzle": "^1.2", 33 | "phpstan/phpstan": "^2.1", 34 | "phpunit/phpunit": "^10.4.2", 35 | "psr/http-message": "^1.1.0", 36 | "roave/security-advisories": "dev-latest", 37 | "slim/slim": "^4.12", 38 | "spatie/browsershot": "^5.0" 39 | }, 40 | "suggest": { 41 | "spatie/browsershot": "Required to execute Javascript in spiders" 42 | }, 43 | "autoload": { 44 | "psr-4": { 45 | "RoachPHP\\": "src/" 46 | } 47 | }, 48 | "autoload-dev": { 49 | "psr-4": { 50 | "RoachPHP\\Tests\\": "tests/" 51 | } 52 | }, 53 | "bin": [ 54 | "roach" 55 | ], 56 | "config": { 57 | "allow-plugins": { 58 | "composer/package-versions-deprecated": true, 59 | "ergebnis/composer-normalize": true 60 | } 61 | }, 62 | "scripts": { 63 | "post-install-cmd": [ 64 | "composer normalize" 65 | ], 66 | "post-update-cmd": [ 67 | "composer normalize" 68 | ], 69 | "analyze": [ 70 | "vendor/bin/phpstan" 71 | ], 72 | "coding-standards": [ 73 | "mkdir -p .build/php-cs-fixer", 74 | "php-cs-fixer fix --diff --verbose" 75 | ], 76 | "test-server": [ 77 | "php -S localhost:8000 -t ./tests/Server" 78 | ], 79 | "test-watch": [ 80 | "vendor/bin/phpunit-watcher watch" 81 | ] 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /phpstan-baseline.neon: -------------------------------------------------------------------------------- 1 | parameters: 2 | ignoreErrors: 3 | - 4 | message: '#^Parameter \#1 \$item of method RoachPHP\\Spider\\Middleware\\ItemMiddlewareInterface\:\:handleItem\(\) expects RoachPHP\\ItemPipeline\\ItemInterface, RoachPHP\\Http\\Request\|RoachPHP\\ItemPipeline\\ItemInterface given\.$#' 5 | identifier: argument.type 6 | count: 1 7 | path: src/Spider/Processor.php 8 | 9 | - 10 | message: '#^Parameter \#1 \$request of method RoachPHP\\Spider\\Middleware\\RequestMiddlewareInterface\:\:handleRequest\(\) expects RoachPHP\\Http\\Request, RoachPHP\\Http\\Request\|RoachPHP\\ItemPipeline\\ItemInterface given\.$#' 11 | identifier: argument.type 12 
| count: 1 13 | path: src/Spider/Processor.php 14 | -------------------------------------------------------------------------------- /phpstan.neon: -------------------------------------------------------------------------------- 1 | includes: 2 | - phpstan-baseline.neon 3 | 4 | parameters: 5 | paths: 6 | - src 7 | 8 | level: 9 9 | 10 | ignoreErrors: 11 | - identifier: missingType.iterableValue 12 | 13 | excludePaths: [] 14 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | tests 6 | 7 | 8 | 9 | 10 | 11 | src 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /psalm.xml: -------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /roach: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | setName(Version::getVersionString()); 30 | 31 | $application->add(new Repl()); 32 | $application->add(new RunSpiderCommand()); 33 | 34 | $application->run(); 35 | -------------------------------------------------------------------------------- /src/Core/EngineInterface.php: -------------------------------------------------------------------------------- 1 | 30 | */ 31 | public function collect(Run $run): array; 32 | } 33 | -------------------------------------------------------------------------------- /src/Core/FakeRunner.php: -------------------------------------------------------------------------------- 1 | , array> 24 | */ 25 | private array $runs = []; 26 | 27 | public function startSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): void 28 | { 29 | $this->recordRun($spiderClass, $overrides, $context); 30 | } 31 | 32 | public 
function collectSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): array 33 | { 34 | $this->recordRun($spiderClass, $overrides, $context); 35 | 36 | return []; 37 | } 38 | 39 | /** 40 | * @param class-string $spider 41 | * 42 | * @psalm-param (callable(Overrides|null, array): bool)|null $callback 43 | */ 44 | public function assertRunWasStarted(string $spider, ?callable $callback = null): void 45 | { 46 | Assert::assertArrayHasKey( 47 | $spider, 48 | $this->runs, 49 | "Expected run for spider {$spider} to exist but no runs were started instead.", 50 | ); 51 | 52 | if (null !== $callback) { 53 | foreach ($this->runs[$spider] as $run) { 54 | if ($callback($run['overrides'], $run['context'])) { 55 | return; 56 | } 57 | } 58 | 59 | Assert::fail("Found run for spider {$spider}, but passed callback returned false"); 60 | } 61 | } 62 | 63 | /** 64 | * @param class-string $spider 65 | */ 66 | public function assertRunWasNotStarted(string $spider): void 67 | { 68 | Assert::assertArrayNotHasKey( 69 | $spider, 70 | $this->runs, 71 | "Unexpected run for spider {$spider} was started", 72 | ); 73 | } 74 | 75 | /** 76 | * @param class-string $spiderClass 77 | */ 78 | private function recordRun(string $spiderClass, ?Overrides $overrides = null, array $context = []): void 79 | { 80 | $this->runs[$spiderClass][] = [ 81 | 'overrides' => $overrides, 82 | 'context' => $context, 83 | ]; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/Core/Run.php: -------------------------------------------------------------------------------- 1 | $startRequests 29 | * @param array $downloaderMiddleware 30 | * @param array $itemProcessors 31 | * @param array $responseMiddleware 32 | * @param array $extensions 33 | */ 34 | public function __construct( 35 | public array $startRequests, 36 | public string $namespace, 37 | public array $downloaderMiddleware = [], 38 | public array $itemProcessors = [], 39 | public array 
$responseMiddleware = [], 40 | public array $extensions = [], 41 | public int $concurrency = 25, 42 | public int $requestDelay = 0, 43 | ) { 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/Core/Runner.php: -------------------------------------------------------------------------------- 1 | engine->start($this->createRun($spiderClass, $overrides, $context)); 31 | } 32 | 33 | public function collectSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): array 34 | { 35 | return $this->engine->collect($this->createRun($spiderClass, $overrides, $context)); 36 | } 37 | 38 | private function createRun(string $spiderClass, ?Overrides $overrides, array $context): Run 39 | { 40 | /** @var SpiderInterface $spider */ 41 | $spider = $this->container->get($spiderClass); 42 | 43 | $spider->withContext($context); 44 | 45 | return (new RunFactory($this->container))->fromSpider($spider, $overrides); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Core/RunnerInterface.php: -------------------------------------------------------------------------------- 1 | $spiderClass 24 | */ 25 | public function startSpider( 26 | string $spiderClass, 27 | ?Overrides $overrides = null, 28 | array $context = [], 29 | ): void; 30 | 31 | /** 32 | * @param class-string $spiderClass 33 | * 34 | * @return array 35 | */ 36 | public function collectSpider( 37 | string $spiderClass, 38 | ?Overrides $overrides = null, 39 | array $context = [], 40 | ): array; 41 | } 42 | -------------------------------------------------------------------------------- /src/Core/Version.php: -------------------------------------------------------------------------------- 1 | cookieJar = $cookieJar ?: new CookieJar(); 30 | } 31 | 32 | public function handleRequest(Request $request): Request 33 | { 34 | return $request->addOption('cookies', $this->cookieJar); 35 | } 36 | } 37 | 
-------------------------------------------------------------------------------- /src/Downloader/Middleware/DownloaderMiddlewareAdapter.php: -------------------------------------------------------------------------------- 1 | middleware instanceof RequestMiddlewareInterface) { 43 | return $this->middleware->handleRequest($request); 44 | } 45 | 46 | return $request; 47 | } 48 | 49 | public function handleResponse(Response $response): Response 50 | { 51 | if ($this->middleware instanceof ResponseMiddlewareInterface) { 52 | return $this->middleware->handleResponse($response); 53 | } 54 | 55 | return $response; 56 | } 57 | 58 | public function configure(array $options): void 59 | { 60 | $this->middleware->configure($options); 61 | } 62 | 63 | public function getMiddleware(): RequestMiddlewareInterface|ResponseMiddlewareInterface 64 | { 65 | return $this->middleware; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/FakeMiddleware.php: -------------------------------------------------------------------------------- 1 | requestsHandled[] = $request; 52 | 53 | if (null !== $this->requestHandler) { 54 | return ($this->requestHandler)($request); 55 | } 56 | 57 | return $request; 58 | } 59 | 60 | public function handleResponse(Response $response): Response 61 | { 62 | $this->responsesHandled[] = $response; 63 | 64 | if (null !== $this->responseHandler) { 65 | return ($this->responseHandler)($response); 66 | } 67 | 68 | return $response; 69 | } 70 | 71 | public function assertRequestHandled(Request $request): void 72 | { 73 | Assert::assertContains($request, $this->requestsHandled); 74 | } 75 | 76 | public function assertRequestNotHandled(Request $request): void 77 | { 78 | Assert::assertNotContains($request, $this->requestsHandled); 79 | } 80 | 81 | public function assertNoRequestsHandled(): void 82 | { 83 | Assert::assertEmpty($this->requestsHandled); 84 | } 85 | 86 | public function 
assertResponseHandled(Response $response): void 87 | { 88 | Assert::assertContains($response, $this->responsesHandled); 89 | } 90 | 91 | public function assertResponseNotHandled(Response $response): void 92 | { 93 | Assert::assertNotContains($response, $this->responsesHandled); 94 | } 95 | 96 | public function assertNoResponseHandled(): void 97 | { 98 | Assert::assertEmpty($this->responsesHandled); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/HttpErrorMiddleware.php: -------------------------------------------------------------------------------- 1 | getStatus(); 31 | 32 | if (200 <= $status && 300 > $status) { 33 | return $response; 34 | } 35 | 36 | /** @var array $allowedStatus */ 37 | $allowedStatus = $this->option('handleStatus'); 38 | 39 | if (\in_array($status, $allowedStatus, true)) { 40 | return $response; 41 | } 42 | 43 | $this->logger->info( 44 | '[HttpErrorMiddleware] Dropping unsuccessful response', 45 | [ 46 | 'uri' => $response->getRequest()->getUri(), 47 | 'status' => $status, 48 | ], 49 | ); 50 | 51 | return $response->drop('Unallowed HTTP status: ' . 
$status); 52 | } 53 | 54 | private function defaultOptions(): array 55 | { 56 | return [ 57 | 'handleStatus' => [], 58 | ]; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/ProxyMiddleware.php: -------------------------------------------------------------------------------- 1 | proxy) { 39 | $this->logger->warning( 40 | '[ProxyMiddleware] No proxy configured for middleware', 41 | ); 42 | 43 | return $request; 44 | } 45 | 46 | $options = $this->proxy->optionsFor($request); 47 | 48 | if ($options->isEmpty()) { 49 | return $request; 50 | } 51 | 52 | $this->logger->info( 53 | '[ProxyMiddleware] Using proxy for request', 54 | $options->toArray(), 55 | ); 56 | 57 | return $request->addOption('proxy', $options->toArray()); 58 | } 59 | 60 | private function defaultOptions(): array 61 | { 62 | return [ 63 | 'proxy' => [], 64 | 'loader' => null, 65 | ]; 66 | } 67 | 68 | private function onAfterConfigured(): void 69 | { 70 | /** @var null|class-string $loaderClass */ 71 | $loaderClass = $this->option('loader'); 72 | 73 | if (null !== $loaderClass) { 74 | /** @var ConfigurationLoaderInterface $loader */ 75 | $loader = $this->container->get($loaderClass); 76 | } else { 77 | /** @var array}|string>|string $options */ 78 | $options = $this->option('proxy'); 79 | $loader = new ArrayConfigurationLoader($options); 80 | } 81 | 82 | $this->proxy = $loader->loadProxyConfiguration(); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/RequestDeduplicationMiddleware.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | private array $seenUris = []; 28 | 29 | public function __construct(private LoggerInterface $logger) 30 | { 31 | } 32 | 33 | public function handleRequest(Request $request): Request 34 | { 35 | $uri = $request->getUri(); 36 | $replaceFlags = \HTTP_URL_REPLACE; 37 | $parts = 
\parse_url($uri); 38 | 39 | if ($this->option('ignore_url_fragments')) { 40 | $replaceFlags |= \HTTP_URL_STRIP_FRAGMENT; 41 | } 42 | 43 | if ($this->option('ignore_trailing_slashes') && isset($parts['path'])) { 44 | $parts['path'] = \mb_rtrim($parts['path'], '/'); 45 | } 46 | 47 | if ($this->option('ignore_query_string')) { 48 | $replaceFlags |= \HTTP_URL_STRIP_QUERY; 49 | } 50 | 51 | /** @phpstan-ignore argument.type */ 52 | $uri = http_build_url($uri, $parts, $replaceFlags); 53 | 54 | if (\in_array($uri, $this->seenUris, true)) { 55 | $this->logger->info( 56 | '[RequestDeduplicationMiddleware] Dropping duplicate request', 57 | ['uri' => $request->getUri()], 58 | ); 59 | 60 | return $request->drop('Duplicate request'); 61 | } 62 | 63 | $this->seenUris[] = $uri; 64 | 65 | return $request; 66 | } 67 | 68 | private function defaultOptions(): array 69 | { 70 | return [ 71 | 'ignore_url_fragments' => false, 72 | 'ignore_trailing_slashes' => true, 73 | 'ignore_query_string' => false, 74 | ]; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/RequestMiddlewareInterface.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | private array $robots = []; 28 | 29 | public function handleRequest(Request $request): Request 30 | { 31 | /** @var string $userAgent */ 32 | $userAgent = $request->getHeader('User-Agent')[0] ?? 
''; 33 | $uri = $request->getUri(); 34 | $robotsUrl = $this->createRobotsUrl($uri); 35 | 36 | if (!isset($this->robots[$robotsUrl])) { 37 | $this->robots[$robotsUrl] = Robots::create($userAgent, $robotsUrl); 38 | } 39 | 40 | $robots = $this->robots[$robotsUrl]; 41 | 42 | if (!$robots->mayIndex($uri, $userAgent)) { 43 | return $request->drop("robots.txt forbids crawling {$uri} for user agent {$userAgent}"); 44 | } 45 | 46 | return $request; 47 | } 48 | 49 | private function createRobotsUrl(string $url): string 50 | { 51 | $robotsUrl = \parse_url($url, \PHP_URL_SCHEME) . '://' . \parse_url($url, \PHP_URL_HOST); 52 | 53 | $port = \parse_url($url, \PHP_URL_PORT); 54 | 55 | if (null !== $port && false !== $port) { 56 | $robotsUrl .= ":{$port}"; 57 | } 58 | 59 | /** @var string $fileName */ 60 | $fileName = $this->option('fileName'); 61 | 62 | return "{$robotsUrl}/{$fileName}"; 63 | } 64 | 65 | private function defaultOptions(): array 66 | { 67 | return [ 68 | 'fileName' => 'robots.txt', 69 | ]; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/Downloader/Middleware/UserAgentMiddleware.php: -------------------------------------------------------------------------------- 1 | addHeader('User-Agent', $this->option('userAgent')); 27 | } 28 | 29 | private function defaultOptions(): array 30 | { 31 | return [ 32 | 'userAgent' => 'roach-php', 33 | ]; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/Downloader/Proxy/ArrayConfigurationLoader.php: -------------------------------------------------------------------------------- 1 | }|string>|string $params 20 | */ 21 | public function __construct(private readonly array|string $params) 22 | { 23 | } 24 | 25 | public function loadProxyConfiguration(): Proxy 26 | { 27 | if (\is_string($this->params)) { 28 | return new Proxy([ 29 | '*' => ProxyOptions::allProtocols($this->params), 30 | ]); 31 | } 32 | 33 | /** @var array 
$proxyList */ 34 | $proxyList = []; 35 | 36 | foreach ($this->params as $domain => $options) { 37 | if (\is_string($options)) { 38 | $proxyList[$domain] = ProxyOptions::allProtocols($options); 39 | } else { 40 | $proxyList[$domain] = new ProxyOptions( 41 | $options['http'] ?? null, 42 | $options['https'] ?? null, 43 | $options['no'] ?? [], 44 | ); 45 | } 46 | } 47 | 48 | return new Proxy($proxyList); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/Downloader/Proxy/ConfigurationLoaderInterface.php: -------------------------------------------------------------------------------- 1 | $proxyList 22 | */ 23 | public function __construct(private readonly array $proxyList = []) 24 | { 25 | } 26 | 27 | public function optionsFor(Request $request): ProxyOptions 28 | { 29 | $host = $request->url->host; 30 | 31 | if (null === $host) { 32 | return ProxyOptions::make(); 33 | } 34 | 35 | if (\array_key_exists($host, $this->proxyList)) { 36 | return $this->proxyList[$host]; 37 | } 38 | 39 | if (\array_key_exists('*', $this->proxyList)) { 40 | return $this->proxyList['*']; 41 | } 42 | 43 | return ProxyOptions::make(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/Downloader/Proxy/ProxyOptions.php: -------------------------------------------------------------------------------- 1 | $excludedDomains 20 | */ 21 | public function __construct( 22 | private readonly ?string $httpProxyURL = null, 23 | private readonly ?string $httpsProxyURL = null, 24 | private readonly array $excludedDomains = [], 25 | ) { 26 | } 27 | 28 | public static function make(): self 29 | { 30 | return new self(); 31 | } 32 | 33 | /** 34 | * Configure the same proxy URL to be used for HTTP and HTTPS. 35 | */ 36 | public static function allProtocols(string $url): self 37 | { 38 | return new self($url, $url, []); 39 | } 40 | 41 | /** 42 | * Configure the proxy URL to be used for requests using HTTP. 
43 | */ 44 | public function http(string $url): self 45 | { 46 | return new self($url, $this->httpsProxyURL, $this->excludedDomains); 47 | } 48 | 49 | /** 50 | * Configure the proxy URL to be used for requests using HTTPs. 51 | */ 52 | public function https(string $url): self 53 | { 54 | return new self($this->httpProxyURL, $url, $this->excludedDomains); 55 | } 56 | 57 | /** 58 | * Configure the domains or TLDs that should not use proxies. 59 | * 60 | * @param array|string $domains 61 | */ 62 | public function exclude(array|string $domains): self 63 | { 64 | return new self( 65 | $this->httpProxyURL, 66 | $this->httpsProxyURL, 67 | (array) $domains, 68 | ); 69 | } 70 | 71 | public function isEmpty(): bool 72 | { 73 | return null === $this->httpProxyURL 74 | && null === $this->httpsProxyURL 75 | && \count($this->excludedDomains) === 0; 76 | } 77 | 78 | public function equals(self $other): bool 79 | { 80 | return $this->httpProxyURL === $other->httpProxyURL 81 | && $this->httpsProxyURL === $other->httpsProxyURL 82 | && $this->excludedDomains === $other->excludedDomains; 83 | } 84 | 85 | /** 86 | * @return array{ 87 | * http?: string, 88 | * https?: string, 89 | * no?: array 90 | * } 91 | */ 92 | public function toArray(): array 93 | { 94 | return \array_filter([ 95 | 'http' => $this->httpProxyURL, 96 | 'https' => $this->httpsProxyURL, 97 | 'no' => $this->excludedDomains, 98 | ]); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/Events/FakeDispatcher.php: -------------------------------------------------------------------------------- 1 | > 23 | */ 24 | private array $dispatchedEvents = []; 25 | 26 | public function dispatch(object $event, ?string $eventName = null): object 27 | { 28 | $eventName ??= $event::class; 29 | 30 | parent::dispatch($event, $eventName); 31 | 32 | $this->dispatchedEvents[$eventName][] = $event; 33 | 34 | return $event; 35 | } 36 | 37 | public function assertDispatched(string $eventName, 
?callable $callback = null): void 38 | { 39 | Assert::assertArrayHasKey($eventName, $this->dispatchedEvents); 40 | 41 | if (null !== $callback) { 42 | foreach ($this->dispatchedEvents[$eventName] as $event) { 43 | if ($callback($event)) { 44 | return; 45 | } 46 | } 47 | 48 | Assert::fail('Event was not dispatched with correct payload'); 49 | } 50 | } 51 | 52 | public function assertNotDispatched(string $eventName): void 53 | { 54 | Assert::assertArrayNotHasKey($eventName, $this->dispatchedEvents); 55 | } 56 | 57 | public function listen(string $eventName, callable $listener): void 58 | { 59 | $this->addListener($eventName, $listener); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/Events/ItemDropped.php: -------------------------------------------------------------------------------- 1 | ['onRunStarting', 100], 37 | RunFinished::NAME => ['onRunFinished', 100], 38 | RequestSending::NAME => ['onRequestSending', 100], 39 | RequestDropped::NAME => ['onRequestDropped', 100], 40 | ItemScraped::NAME => ['onItemScraped', 100], 41 | ItemDropped::NAME => ['onItemDropped', 100], 42 | ]; 43 | } 44 | 45 | public function onRunStarting(RunStarting $event): void 46 | { 47 | $this->logger->info('Run starting'); 48 | } 49 | 50 | public function onRunFinished(RunFinished $event): void 51 | { 52 | $this->logger->info('Run finished'); 53 | } 54 | 55 | public function onRequestSending(RequestSending $event): void 56 | { 57 | $this->logger->info('Dispatching request', [ 58 | 'uri' => $event->request->getUri(), 59 | ]); 60 | } 61 | 62 | public function onRequestDropped(RequestDropped $event): void 63 | { 64 | $request = $event->request; 65 | 66 | $this->logger->info('Request dropped', [ 67 | 'uri' => $request->getUri(), 68 | 'reason' => $request->getDropReason(), 69 | ]); 70 | } 71 | 72 | public function onItemScraped(ItemScraped $event): void 73 | { 74 | $this->logger->info('Item scraped', $event->item->all()); 75 | } 76 | 77 | 
public function onItemDropped(ItemDropped $event): void 78 | { 79 | $this->logger->info('Item dropped', [ 80 | 'item' => $event->item->all(), 81 | 'reason' => $event->item->getDropReason(), 82 | ]); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/Extensions/MaxRequestExtension.php: -------------------------------------------------------------------------------- 1 | ['onRequestSending', 10000], 30 | RequestScheduling::NAME => ['onRequestScheduling', 0], 31 | ]; 32 | } 33 | 34 | public function onRequestSending(RequestSending $event): void 35 | { 36 | $this->dropRequestIfLimitReached($event); 37 | 38 | if (!$event->request->wasDropped()) { 39 | ++$this->sentRequests; 40 | } 41 | } 42 | 43 | public function onRequestScheduling(RequestScheduling $event): void 44 | { 45 | $this->dropRequestIfLimitReached($event); 46 | } 47 | 48 | private function dropRequestIfLimitReached(RequestScheduling|RequestSending $event): void 49 | { 50 | /** @var int $limit */ 51 | $limit = $this->option('limit'); 52 | 53 | if ($limit <= $this->sentRequests) { 54 | $event->request = $event->request->drop("Reached maximum request limit of {$limit}"); 55 | } 56 | } 57 | 58 | private function defaultOptions(): array 59 | { 60 | return [ 61 | 'limit' => 10, 62 | ]; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/Extensions/ScrapedItemCollectorExtension.php: -------------------------------------------------------------------------------- 1 | 29 | */ 30 | private array $scrapedItems = []; 31 | 32 | public static function getSubscribedEvents(): array 33 | { 34 | return [ 35 | ItemScraped::NAME => ['onItemScraped', 0], 36 | ]; 37 | } 38 | 39 | public function onItemScraped(ItemScraped $event): void 40 | { 41 | $this->scrapedItems[] = $event->item; 42 | } 43 | 44 | /** 45 | * @return array 46 | */ 47 | public function getScrapedItems(): array 48 | { 49 | return $this->scrapedItems; 50 | } 51 
| } 52 | -------------------------------------------------------------------------------- /src/Extensions/StatsCollectorExtension.php: -------------------------------------------------------------------------------- 1 | null, 43 | 'requests.sent' => 0, 44 | 'requests.dropped' => 0, 45 | 'items.scraped' => 0, 46 | 'items.dropped' => 0, 47 | ]; 48 | 49 | public function __construct( 50 | private LoggerInterface $logger, 51 | private ClockInterface $clock, 52 | ) { 53 | } 54 | 55 | public static function getSubscribedEvents(): array 56 | { 57 | return [ 58 | RunStarting::NAME => ['onRunStarting', 200], 59 | RequestSending::NAME => ['onRequestSending', 200], 60 | RequestDropped::NAME => ['onRequestDropped', 200], 61 | ItemDropped::NAME => ['onItemDropped', 200], 62 | ItemScraped::NAME => ['onItemScraped', 200], 63 | RunFinished::NAME => ['onRunFinished', 200], 64 | ]; 65 | } 66 | 67 | public function onRunStarting(): void 68 | { 69 | $this->startTime = $this->clock->now(); 70 | } 71 | 72 | public function onRunFinished(): void 73 | { 74 | if (null !== $this->startTime) { 75 | $duration = $this->startTime->diff($this->clock->now()); 76 | $this->stats['duration'] = $duration->format('%H:%I:%S'); 77 | } 78 | 79 | $this->logger->info('Run statistics', $this->stats); 80 | } 81 | 82 | public function onRequestSending(RequestSending $event): void 83 | { 84 | if (!$event->request->wasDropped()) { 85 | ++$this->stats['requests.sent']; 86 | } 87 | } 88 | 89 | public function onRequestDropped(): void 90 | { 91 | ++$this->stats['requests.dropped']; 92 | } 93 | 94 | public function onItemDropped(): void 95 | { 96 | ++$this->stats['items.dropped']; 97 | } 98 | 99 | public function onItemScraped(): void 100 | { 101 | ++$this->stats['items.scraped']; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/Http/Client.php: -------------------------------------------------------------------------------- 1 | client = $client ?? 
new GuzzleClient(); 29 | } 30 | 31 | /** 32 | * @param list $requests 33 | */ 34 | public function pool( 35 | array $requests, 36 | ?callable $onFulfilled = null, 37 | ?callable $onRejected = null, 38 | ): void { 39 | $makeRequests = function () use ($requests): \Generator { 40 | foreach ($requests as $request) { 41 | yield function () use ($request) { 42 | return $this->client 43 | ->sendAsync($request->getPsrRequest(), $request->getOptions()) 44 | ->then( 45 | static fn (ResponseInterface $response) => new Response($response, $request), 46 | static function (GuzzleException $reason) use ($request) { 47 | // If we got back a response, we want to return a Response object 48 | // so it can get sent through the middleware stack. 49 | if ($reason instanceof BadResponseException) { 50 | return new Response($reason->getResponse(), $request); 51 | } 52 | 53 | // For all other cases, we'll wrap the exception in our own 54 | // exception so it can be handled by any request exception middleware. 
55 | throw new RequestException($request, $reason); 56 | }, 57 | ); 58 | }; 59 | } 60 | }; 61 | 62 | $pool = new Pool($this->client, $makeRequests(), [ 63 | 'concurrency' => 0, 64 | 'fulfilled' => $onFulfilled, 65 | 'rejected' => $onRejected, 66 | ]); 67 | 68 | $pool->promise()->wait(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/Http/ClientInterface.php: -------------------------------------------------------------------------------- 1 | $requests 20 | * @param ?callable(Response): void $onFulfilled 21 | * @param ?callable(RequestException): void $onRejected 22 | */ 23 | public function pool( 24 | array $requests, 25 | ?callable $onFulfilled = null, 26 | ?callable $onRejected = null, 27 | ): void; 28 | } 29 | -------------------------------------------------------------------------------- /src/Http/FakeClient.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | private array $sentRequestUrls = []; 28 | 29 | public function pool(array $requests, ?callable $onFulfilled = null, ?callable $onRejected = null): void 30 | { 31 | foreach ($requests as $request) { 32 | $this->sentRequestUrls[] = $request->getUri(); 33 | 34 | if (null !== $onFulfilled) { 35 | $response = new Response(new GuzzleResponse(), $request); 36 | 37 | $onFulfilled($response); 38 | } 39 | } 40 | } 41 | 42 | public function assertRequestWasSent(Request $request): void 43 | { 44 | $uri = $request->getUri(); 45 | 46 | Assert::assertContains( 47 | $request->getUri(), 48 | $this->sentRequestUrls, 49 | "Expected request to [{$uri}] was not sent", 50 | ); 51 | } 52 | 53 | public function assertRequestWasNotSent(Request $request): void 54 | { 55 | $uri = $request->getUri(); 56 | 57 | Assert::assertNotContains( 58 | $uri, 59 | $this->sentRequestUrls, 60 | "Unexpected request sent to [{$uri}]", 61 | ); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- 
/src/Http/MalformedUriException.php: -------------------------------------------------------------------------------- 1 | request; 30 | } 31 | 32 | public function getReason(): GuzzleException 33 | { 34 | return $this->reason; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/Http/Response.php: -------------------------------------------------------------------------------- 1 | crawler = new Crawler((string) $response->getBody(), $request->getUri()); 38 | } 39 | 40 | public function __call(string $method, array $args): mixed 41 | { 42 | return $this->crawler->{$method}(...$args); 43 | } 44 | 45 | public function getRequest(): Request 46 | { 47 | return $this->request; 48 | } 49 | 50 | public function getStatus(): int 51 | { 52 | return $this->response->getStatusCode(); 53 | } 54 | 55 | public function getBody(): string 56 | { 57 | return (string) $this->response->getBody(); 58 | } 59 | 60 | public function withBody(string $body): self 61 | { 62 | $this->response = $this->response->withBody(Utils::streamFor($body)); 63 | $this->crawler = new Crawler($body, $this->request->getUri()); 64 | 65 | return $this; 66 | } 67 | 68 | public function getResponse(): ResponseInterface 69 | { 70 | return $this->response; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/Http/URL.php: -------------------------------------------------------------------------------- 1 | $this->scheme, 69 | 'host' => $this->host, 70 | 'port' => $this->port, 71 | 'user' => $this->username, 72 | 'pass' => $this->password, 73 | 'path' => $this->path, 74 | 'query' => $this->query->toString(), 75 | 'fragment' => $this->fragment, 76 | ]; 77 | 78 | return http_build_url(\array_filter($parts)); 79 | } 80 | 81 | /** 82 | * Checks if two URLs are equal. 83 | * 84 | * URLs are considered equal if they contain all the same parts with all the 85 | * same values. 
Note that if the URLs have a query string, the order of the 86 | * query parameters does not matter. 87 | * 88 | * If a string is provided, it will be converted to a URL object internally. 89 | * 90 | * @throws MalformedUriException thrown if the provided URL is a string and cannot be parsed to a valid URL object 91 | */ 92 | public function equals(self|string $other): bool 93 | { 94 | if (\is_string($other)) { 95 | $other = self::parse($other); 96 | } 97 | 98 | return $this->scheme === $other->scheme 99 | && $this->host === $other->host 100 | && $this->port === $other->port 101 | && $this->username === $other->username 102 | && $this->password === $other->password 103 | && $this->path === $other->path 104 | && $this->query->equals($other->query) 105 | && $this->fragment === $other->fragment; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/Http/UnknownQueryParameterException.php: -------------------------------------------------------------------------------- 1 | getProperties(\ReflectionProperty::IS_PUBLIC); 26 | 27 | return \array_reduce( 28 | $properties, 29 | function (array $data, \ReflectionProperty $property): array { 30 | /** @psalm-suppress MixedAssignment */ 31 | $data[$property->getName()] = $property->getValue($this); 32 | 33 | return $data; 34 | }, 35 | [], 36 | ); 37 | } 38 | 39 | final public function get(string $key, mixed $default = null): mixed 40 | { 41 | $reflectionClass = new \ReflectionClass($this); 42 | 43 | try { 44 | $property = $reflectionClass->getProperty($key); 45 | } catch (\ReflectionException) { 46 | return $default; 47 | } 48 | 49 | if (!$property->isPublic()) { 50 | return $default; 51 | } 52 | 53 | return $property->getValue($this) ?? $default; /* fall back to the default only for null, so falsy values (0, '', false, []) are returned as stored */ 54 | } 55 | 56 | final public function set(string $key, mixed $value): ItemInterface 57 | { 58 | $reflectionClass = new \ReflectionClass($this); 59 | 60 | try { 61 | $property = $reflectionClass->getProperty($key); 62 | } catch 
(\ReflectionException) { 63 | throw new \InvalidArgumentException( 64 | \sprintf('No public property %s exists on class %s', $key, static::class), 65 | ); 66 | } 67 | 68 | if (!$property->isPublic()) { 69 | throw new \InvalidArgumentException( 70 | \sprintf('No public property %s exists on class %s', $key, static::class), 71 | ); 72 | } 73 | 74 | $property->setValue($this, $value); 75 | 76 | return $this; 77 | } 78 | 79 | final public function has(string $key): bool 80 | { 81 | $reflectionClass = new \ReflectionClass($this); 82 | 83 | try { 84 | $property = $reflectionClass->getProperty($key); 85 | 86 | return $property->isPublic(); 87 | } catch (\ReflectionException) { 88 | return false; 89 | } 90 | } 91 | 92 | final public function offsetExists(mixed $offset): bool 93 | { 94 | return $this->has($offset); 95 | } 96 | 97 | final public function offsetGet(mixed $offset): mixed 98 | { 99 | // @phpstan-ignore function.alreadyNarrowedType 100 | if (!\is_string($offset)) { 101 | throw new \InvalidArgumentException('Offset needs to be a string'); 102 | } 103 | 104 | return $this->get($offset); 105 | } 106 | 107 | final public function offsetSet(mixed $offset, mixed $value): void 108 | { 109 | if (!\is_string($offset)) { 110 | throw new \InvalidArgumentException('Offset needs to be a string'); 111 | } 112 | 113 | $this->set($offset, $value); 114 | } 115 | 116 | final public function offsetUnset(mixed $offset): void 117 | { 118 | throw new \RuntimeException('Unsetting properties is not supported for custom item classes'); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/ItemPipeline/Item.php: -------------------------------------------------------------------------------- 1 | data; 29 | } 30 | 31 | public function get(string $key, mixed $default = null): mixed 32 | { 33 | return $this->data[$key] ?? 
$default; 34 | } 35 | 36 | public function set(string $key, mixed $value): ItemInterface 37 | { 38 | $this->data[$key] = $value; 39 | 40 | return $this; 41 | } 42 | 43 | public function has(string $key): bool 44 | { 45 | return isset($this->data[$key]); 46 | } 47 | 48 | public function offsetExists(mixed $offset): bool 49 | { 50 | return isset($this->data[$offset]); 51 | } 52 | 53 | public function offsetGet(mixed $offset): mixed 54 | { 55 | /** @psalm-suppress MixedReturnStatement */ 56 | return $this->data[$offset]; 57 | } 58 | 59 | public function offsetSet(mixed $offset, mixed $value): void 60 | { 61 | /** @psalm-suppress PossiblyNullArrayOffset */ 62 | $this->data[$offset] = $value; 63 | } 64 | 65 | public function offsetUnset(mixed $offset): void 66 | { 67 | unset($this->data[$offset]); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/ItemPipeline/ItemInterface.php: -------------------------------------------------------------------------------- 1 | 20 | */ 21 | interface ItemInterface extends \ArrayAccess, DroppableInterface 22 | { 23 | public function all(): array; 24 | 25 | public function get(string $key, mixed $default = null): mixed; 26 | 27 | public function set(string $key, mixed $value): self; 28 | 29 | public function has(string $key): bool; 30 | } 31 | -------------------------------------------------------------------------------- /src/ItemPipeline/ItemPipeline.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | private array $processors = []; 28 | 29 | public function __construct(private EventDispatcherInterface $eventDispatcher) 30 | { 31 | } 32 | 33 | public function setProcessors(ItemProcessorInterface ...$processors): ItemPipelineInterface 34 | { 35 | $this->processors = $processors; 36 | 37 | return $this; 38 | } 39 | 40 | public function sendItem(ItemInterface $item): ItemInterface 41 | { 42 | foreach ($this->processors as $processor) 
{ 43 | if ($processor instanceof ConditionalItemProcessor && !$processor->shouldHandle($item)) { 44 | continue; 45 | } 46 | 47 | $item = $processor->processItem($item); 48 | 49 | if ($item->wasDropped()) { 50 | $this->eventDispatcher->dispatch( 51 | new ItemDropped($item), 52 | ItemDropped::NAME, 53 | ); 54 | 55 | return $item; 56 | } 57 | } 58 | 59 | $this->eventDispatcher->dispatch( 60 | new ItemScraped($item), 61 | ItemScraped::NAME, 62 | ); 63 | 64 | return $item; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/ItemPipeline/ItemPipelineInterface.php: -------------------------------------------------------------------------------- 1 | getHandledItemClasses(), true); 26 | } 27 | 28 | /** 29 | * @return array> 30 | */ 31 | abstract protected function getHandledItemClasses(): array; 32 | } 33 | -------------------------------------------------------------------------------- /src/ItemPipeline/Processors/FakeProcessor.php: -------------------------------------------------------------------------------- 1 | calls[] = $item->all(); 29 | 30 | return $item; 31 | } 32 | 33 | public function assertCalledWith(ItemInterface $item): void 34 | { 35 | Assert::assertContains( 36 | $item->all(), 37 | $this->calls, 38 | 'Processor was not called with expected item', 39 | ); 40 | } 41 | 42 | public function assertNotCalledWith(ItemInterface $item): void 43 | { 44 | Assert::assertNotContains( 45 | $item->all(), 46 | $this->calls, 47 | 'Processor got unexpected call with item', 48 | ); 49 | } 50 | 51 | public function assertNotCalled(): void 52 | { 53 | Assert::assertEmpty( 54 | $this->calls, 55 | \sprintf('Expected processor to not have been called at all. 
Was called %s time(s)', \count($this->calls)), 56 | ); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/ItemPipeline/Processors/ItemProcessorInterface.php: -------------------------------------------------------------------------------- 1 | $spiderClass 53 | */ 54 | public static function startSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): void 55 | { 56 | self::getRunner()->startSpider($spiderClass, $overrides, $context); 57 | } 58 | 59 | /** 60 | * Start the spider run and collect and return scraped items. 61 | * 62 | * @psalm-param class-string $spiderClass 63 | * 64 | * @return array 65 | */ 66 | public static function collectSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): array 67 | { 68 | return self::getRunner()->collectSpider($spiderClass, $overrides, $context); 69 | } 70 | 71 | /** 72 | * @template T 73 | * 74 | * @psalm-param class-string $class 75 | * 76 | * @psalm-suppress MixedInferredReturnType 77 | * 78 | * @return T 79 | */ 80 | public static function resolve(string $class): mixed 81 | { 82 | /** @psalm-suppress MixedReturnStatement */ 83 | return self::getContainer()->get($class); 84 | } 85 | 86 | private static function getContainer(): ContainerInterface 87 | { 88 | if (null === self::$container) { 89 | self::$container = new DefaultContainer(); 90 | } 91 | 92 | return self::$container; 93 | } 94 | 95 | private static function getRunner(): RunnerInterface 96 | { 97 | return self::$runnerFake ?: self::resolve(RunnerInterface::class); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/Scheduling/ArrayRequestScheduler.php: -------------------------------------------------------------------------------- 1 | 25 | */ 26 | private array $requests = []; 27 | 28 | private \DateTimeImmutable $nextBatchReadyAt; 29 | 30 | public function __construct(private ClockInterface $clock) 31 | { 
32 | $this->nextBatchReadyAt = $this->clock->now(); 33 | } 34 | 35 | public function schedule(Request $request): void 36 | { 37 | $this->requests[] = $request; 38 | } 39 | 40 | public function empty(): bool 41 | { 42 | return empty($this->requests); 43 | } 44 | 45 | /** 46 | * @return array 47 | */ 48 | public function nextRequests(int $batchSize): array 49 | { 50 | $this->clock->sleepUntil($this->nextBatchReadyAt); 51 | 52 | $this->updateNextBatchTime(); 53 | 54 | return $this->getNextRequests($batchSize); 55 | } 56 | 57 | public function forceNextRequests(int $batchSize): array 58 | { 59 | return $this->getNextRequests($batchSize); 60 | } 61 | 62 | public function setDelay(int $delay): RequestSchedulerInterface 63 | { 64 | $this->delay = $delay; 65 | 66 | return $this; 67 | } 68 | 69 | public function setNamespace(string $namespace): RequestSchedulerInterface 70 | { 71 | return $this; 72 | } 73 | 74 | private function updateNextBatchTime(): void 75 | { 76 | $this->nextBatchReadyAt = $this->clock->now()->add(new \DateInterval("PT{$this->delay}S")); 77 | } 78 | 79 | /** 80 | * @psalm-suppress MixedReturnTypeCoercion 81 | * 82 | * @return array 83 | */ 84 | private function getNextRequests(int $batchSize): array 85 | { 86 | return \array_splice($this->requests, 0, $batchSize); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/Scheduling/RequestSchedulerInterface.php: -------------------------------------------------------------------------------- 1 | 29 | */ 30 | public function nextRequests(int $batchSize): array; 31 | 32 | /** 33 | * Immediately return the next number of requests as defined by $batchSize 34 | * regardless of the configured delay. 
35 | * 36 | * @return array 37 | */ 38 | public function forceNextRequests(int $batchSize): array; 39 | 40 | public function empty(): bool; 41 | 42 | public function setDelay(int $delay): self; 43 | 44 | public function setNamespace(string $namespace): self; 45 | } 46 | -------------------------------------------------------------------------------- /src/Scheduling/Timing/ClockInterface.php: -------------------------------------------------------------------------------- 1 | now = new \DateTimeImmutable(); 25 | } 26 | 27 | public function now(): \DateTimeImmutable 28 | { 29 | return $this->now; 30 | } 31 | 32 | public function sleep(int $seconds): void 33 | { 34 | $this->sleepUntil( 35 | $this->now->add(new \DateInterval("PT{$seconds}S")), 36 | ); 37 | } 38 | 39 | public function sleepUntil(\DateTimeImmutable $date): void 40 | { 41 | if ($date < $this->now) { 42 | return; 43 | } 44 | 45 | $this->secondsPassed += $date->getTimestamp() - $this->now->getTimestamp(); /* total elapsed seconds; the interval's "s" field is only the 0-59 seconds component and under-counts sleeps of a minute or more */ 46 | $this->now = $date; 47 | } 48 | 49 | public function timePassed(): int 50 | { 51 | return $this->secondsPassed; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/Scheduling/Timing/SystemClock.php: -------------------------------------------------------------------------------- 1 | now()->getTimestamp(); 34 | $target = $date->getTimestamp(); 35 | 36 | if ($target <= $now) { 37 | return; 38 | } 39 | 40 | /** @psalm-suppress UnusedFunctionCall */ 41 | \time_sleep_until($target); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/Shell/Commands/FetchCommand.php: -------------------------------------------------------------------------------- 1 | addArgument('url', InputArgument::REQUIRED); 34 | } 35 | 36 | protected function execute(InputInterface $input, OutputInterface $output): int 37 | { 38 | $client = new Client(); 39 | 40 | /** 41 | * @psalm-suppress MixedAssignement 42 | * 43 | * @var string 44 | */ 45 | $url = 
$input->getArgument('url'); 46 | $request = new Request('GET', $url, static fn () => yield from []); 47 | $response = new Response( 48 | $client->send($request->getPsrRequest()), 49 | $request, 50 | ); 51 | 52 | $output->writeln( 53 | << 55 | Available variables: 56 | \$response: <{$response->getStatus()} '{$url}'> 57 | \$html: Raw HTML contents of response 58 | Commands: 59 | fetch Fetch URL and update the \$response and \$html objects 60 | 61 | TEXT 62 | ); 63 | 64 | /** @var Shell $app */ 65 | $app = $this->getApplication(); 66 | $app->setScopeVariables([ 67 | 'response' => $response, 68 | 'html' => $response->getBody(), 69 | ]); 70 | 71 | return 0; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/Shell/Commands/RunSpiderCommand.php: -------------------------------------------------------------------------------- 1 | addArgument('spider', InputArgument::REQUIRED, 'The spider class to execute'); 35 | } 36 | 37 | protected function execute(InputInterface $input, OutputInterface $output): int 38 | { 39 | $resolver = Roach::resolve(NamespaceResolverInterface::class); 40 | 41 | try { 42 | /** @phpstan-ignore argument.type */ 43 | $spiderClass = $resolver->resolveSpiderNamespace($input->getArgument('spider')); 44 | } catch (InvalidSpiderException $exception) { 45 | $output->writeln(\sprintf('Invalid spider: %s', $exception->getMessage())); 46 | 47 | return self::FAILURE; 48 | } 49 | 50 | Roach::startSpider($spiderClass); 51 | 52 | return self::SUCCESS; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/Shell/InvalidSpiderException.php: -------------------------------------------------------------------------------- 1 | addArgument('url', InputArgument::REQUIRED, 'The URL to fetch'); 40 | } 41 | 42 | protected function execute(InputInterface $input, OutputInterface $output): int 43 | { 44 | $input->setOption('ansi', true); 45 | 46 | /** @psalm-suppress 
MixedAssignment */ 47 | $url = $input->getArgument('url'); 48 | 49 | $config = Configuration::fromInput($input); 50 | $config->addCasters([ 51 | Crawler::class => 'RoachPHP\Shell\ShellCaster::castCrawler', 52 | Link::class => 'RoachPHP\Shell\ShellCaster::castLink', 53 | Response::class => 'RoachPHP\Shell\ShellCaster::castResponse', 54 | ]); 55 | $config->addCommands([new FetchCommand()]); 56 | 57 | $shell = new Shell($config); 58 | 59 | $command = $shell->find('fetch'); 60 | $command->run(new ArrayInput(['url' => $url]), $output); 61 | 62 | $shell->run(); 63 | 64 | return 0; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/Shell/Resolver/DefaultNamespaceResolverDecorator.php: -------------------------------------------------------------------------------- 1 | defaultNamespace = \mb_trim($defaultNamespace, " \t\n\r\0\x0B\\"); 28 | } 29 | 30 | /** 31 | * @throws InvalidSpiderException 32 | * 33 | * @return class-string 34 | */ 35 | public function resolveSpiderNamespace(string $spiderClass): string 36 | { 37 | $spiderClass = \mb_trim($spiderClass); 38 | 39 | if ( 40 | \str_starts_with($spiderClass, '\\') 41 | || \str_starts_with($spiderClass, $this->defaultNamespace) 42 | || \class_exists($spiderClass) 43 | ) { 44 | return $this->wrapped->resolveSpiderNamespace($spiderClass); 45 | } 46 | 47 | return $this->wrapped->resolveSpiderNamespace($this->defaultNamespace . '\\' . 
$spiderClass); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/Shell/Resolver/FakeNamespaceResolver.php: -------------------------------------------------------------------------------- 1 | $spiderClass 25 | * 26 | * @return class-string 27 | */ 28 | public function resolveSpiderNamespace(string $spiderClass): string 29 | { 30 | return $spiderClass; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Shell/Resolver/NamespaceResolverInterface.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | public function resolveSpiderNamespace(string $spiderClass): string; 28 | } 29 | -------------------------------------------------------------------------------- /src/Shell/Resolver/StaticNamespaceResolver.php: -------------------------------------------------------------------------------- 1 | $spiderClass 23 | * 24 | * @throws \ReflectionException 25 | * @throws InvalidSpiderException 26 | * 27 | * @return class-string 28 | */ 29 | public function resolveSpiderNamespace(string $spiderClass): string 30 | { 31 | if (!\class_exists($spiderClass)) { 32 | throw new InvalidSpiderException("The spider class {$spiderClass} does not exist"); 33 | } 34 | 35 | if (!$this->isSpider($spiderClass)) { 36 | throw new InvalidSpiderException("The class {$spiderClass} is not a spider"); 37 | } 38 | 39 | return $spiderClass; 40 | } 41 | 42 | /** 43 | * @param class-string $spiderClass 44 | * 45 | * @throws \ReflectionException 46 | */ 47 | private function isSpider(string $spiderClass): bool 48 | { 49 | return (new \ReflectionClass($spiderClass))->implementsInterface(SpiderInterface::class); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/Shell/ShellCaster.php: -------------------------------------------------------------------------------- 1 | $response->getStatus(), 27 | 
Caster::PREFIX_VIRTUAL . '.uri' => $response->getUri(), 28 | ]; 29 | } 30 | 31 | public static function castCrawler(Crawler $crawler): array 32 | { 33 | return [ 34 | Caster::PREFIX_VIRTUAL . '.count' => $crawler->count(), 35 | Caster::PREFIX_VIRTUAL . '.html' => $crawler->outerHtml(), 36 | ]; 37 | } 38 | 39 | public static function castLink(Link $link): array 40 | { 41 | return [ 42 | Caster::PREFIX_PROTECTED . '.uri' => $link->getUri(), 43 | ]; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/Spider/AbstractSpider.php: -------------------------------------------------------------------------------- 1 | configuration = $loaderStrategy->load(); 30 | } 31 | 32 | /** 33 | * @psalm-return \Generator 34 | */ 35 | abstract public function parse(Response $response): \Generator; 36 | 37 | /** 38 | * @return array 39 | */ 40 | final public function getInitialRequests(): array 41 | { 42 | return $this->initialRequests(); 43 | } 44 | 45 | final public function withConfiguration(Configuration $configuration): void 46 | { 47 | $this->configuration = $configuration; 48 | } 49 | 50 | final public function withContext(array $context): void 51 | { 52 | $this->context = $context; 53 | } 54 | 55 | final public function loadConfiguration(): Configuration 56 | { 57 | return $this->configuration; 58 | } 59 | 60 | protected function request( 61 | string $method, 62 | string $url, 63 | string $parseMethod = 'parse', 64 | array $options = [], 65 | ): ParseResult { 66 | // @phpstan-ignore argument.type 67 | return ParseResult::request($method, $url, [$this, $parseMethod], $options); 68 | } 69 | 70 | protected function item(array|ItemInterface $item): ParseResult 71 | { 72 | if ($item instanceof ItemInterface) { 73 | return ParseResult::fromValue($item); 74 | } 75 | 76 | return ParseResult::item($item); 77 | } 78 | 79 | /** 80 | * @return array 81 | */ 82 | protected function initialRequests(): array 83 | { 84 | return 
\array_map(function (string $url) { 85 | return new Request('GET', $url, [$this, 'parse']); 86 | }, $this->configuration->startUrls); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/Spider/BasicSpider.php: -------------------------------------------------------------------------------- 1 | 31 | */ 32 | public array $startUrls = []; 33 | 34 | /** 35 | * @var list> 36 | */ 37 | public array $spiderMiddleware = []; 38 | 39 | /** 40 | * @var list> 41 | */ 42 | public array $downloaderMiddleware = [ 43 | RequestDeduplicationMiddleware::class, 44 | HttpErrorMiddleware::class, 45 | ]; 46 | 47 | /** 48 | * @var list> 49 | */ 50 | public array $itemProcessors = []; 51 | 52 | /** 53 | * @var list> 54 | */ 55 | public array $extensions = [ 56 | LoggerExtension::class, 57 | StatsCollectorExtension::class, 58 | ]; 59 | 60 | public int $concurrency = 5; 61 | 62 | public int $requestDelay = 1; 63 | 64 | public function __construct() 65 | { 66 | parent::__construct(new ArrayLoader([ 67 | 'startUrls' => $this->startUrls, 68 | 'downloaderMiddleware' => $this->downloaderMiddleware, 69 | 'spiderMiddleware' => $this->spiderMiddleware, 70 | 'itemProcessors' => $this->itemProcessors, 71 | 'extensions' => $this->extensions, 72 | 'concurrency' => $this->concurrency, 73 | 'requestDelay' => $this->requestDelay, 74 | ])); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/Spider/Configuration/ArrayLoader.php: -------------------------------------------------------------------------------- 1 | [], 29 | * spiderMiddleware: class-string[], 30 | * itemProcessors: class-string[], 31 | * extensions: class-string[], 32 | * concurrency: int, 33 | * requestDelay: int 34 | * } 35 | */ 36 | private array $config; 37 | 38 | public function __construct(array $configuration) 39 | { 40 | $resolver = new OptionsResolver(); 41 | 42 | $resolver->setDefaults([ 43 | 'startUrls' => [], 44 | 
'downloaderMiddleware' => [], 45 | 'itemProcessors' => [], 46 | 'spiderMiddleware' => [], 47 | 'extensions' => [], 48 | 'concurrency' => 5, 49 | 'requestDelay' => 0, 50 | ]); 51 | 52 | // @phpstan-ignore assign.propertyType 53 | $this->config = $resolver->resolve($configuration); 54 | } 55 | 56 | public function load(): Configuration 57 | { 58 | return new Configuration( 59 | $this->config['startUrls'], 60 | $this->config['downloaderMiddleware'], 61 | $this->config['itemProcessors'], 62 | $this->config['spiderMiddleware'], 63 | $this->config['extensions'], 64 | $this->config['concurrency'], 65 | $this->config['requestDelay'], 66 | ); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/Spider/Configuration/Configuration.php: -------------------------------------------------------------------------------- 1 | $startUrls 25 | * @param array> $downloaderMiddleware 26 | * @param array> $itemProcessors 27 | * @param array> $spiderMiddleware 28 | * @param array> $extensions 29 | */ 30 | public function __construct( 31 | public array $startUrls, 32 | public array $downloaderMiddleware, 33 | public array $itemProcessors, 34 | public array $spiderMiddleware, 35 | public array $extensions, 36 | public int $concurrency, 37 | public int $requestDelay, 38 | ) { 39 | } 40 | 41 | public function withOverrides(Overrides $overrides): self 42 | { 43 | $newValues = \array_merge([ 44 | 'startUrls' => $this->startUrls, 45 | 'downloaderMiddleware' => $this->downloaderMiddleware, 46 | 'spiderMiddleware' => $this->spiderMiddleware, 47 | 'extensions' => $this->extensions, 48 | 'itemProcessors' => $this->itemProcessors, 49 | 'concurrency' => $this->concurrency, 50 | 'requestDelay' => $this->requestDelay, 51 | ], $overrides->toArray()); 52 | 53 | return new self(...$newValues); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/Spider/Configuration/Overrides.php: 
-------------------------------------------------------------------------------- 1 | $startUrls 28 | * @param null|list> $downloaderMiddleware 29 | * @param null|list> $spiderMiddleware 30 | * @param null|list> $itemProcessors 31 | * @param null|list> $extensions 32 | */ 33 | public function __construct( 34 | public ?array $startUrls = null, 35 | public ?array $downloaderMiddleware = null, 36 | public ?array $spiderMiddleware = null, 37 | public ?array $itemProcessors = null, 38 | public ?array $extensions = null, 39 | public ?int $concurrency = null, 40 | public ?int $requestDelay = null, 41 | ) { 42 | } 43 | 44 | /** 45 | * @psalm-suppress MoreSpecificReturnType, LessSpecificReturnStatement 46 | * 47 | * @return array{ 48 | * startUrls?: string[], 49 | * downloaderMiddleware?: class-string[], 50 | * spiderMiddleware?: class-string[], 51 | * itemProcessors?: class-string[], 52 | * extensions?: class-string[], 53 | * concurrency?: int, 54 | * requestDelay?: int, 55 | * } 56 | */ 57 | public function toArray(): array 58 | { 59 | return \array_filter([ 60 | 'startUrls' => $this->startUrls, 61 | 'downloaderMiddleware' => $this->downloaderMiddleware, 62 | 'spiderMiddleware' => $this->spiderMiddleware, 63 | 'itemProcessors' => $this->itemProcessors, 64 | 'extensions' => $this->extensions, 65 | 'concurrency' => $this->concurrency, 66 | 'requestDelay' => $this->requestDelay, 67 | ], static fn ($value) => null !== $value); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/Spider/ConfigurationLoaderStrategy.php: -------------------------------------------------------------------------------- 1 | getRequest()->getMeta('depth', 1); 28 | $newDepth = $currentDepth + 1; 29 | 30 | if ($this->option('maxCrawlDepth') < $newDepth) { 31 | return $request->drop('Maximum crawl depth reached'); 32 | } 33 | 34 | return $request->withMeta('depth', $currentDepth + 1); 35 | } 36 | 37 | private function defaultOptions(): array 38 | { 39 
| return [ 40 | 'maxCrawlDepth' => 10, 41 | ]; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/Spider/Middleware/RequestMiddlewareInterface.php: -------------------------------------------------------------------------------- 1 | middleware instanceof ItemMiddlewareInterface) { 44 | return $this->middleware->handleItem($item, $response); 45 | } 46 | 47 | return $item; 48 | } 49 | 50 | public function handleRequest(Request $request, Response $response): Request 51 | { 52 | if ($this->middleware instanceof RequestMiddlewareInterface) { 53 | return $this->middleware->handleRequest($request, $response); 54 | } 55 | 56 | return $request; 57 | } 58 | 59 | public function handleResponse(Response $response): Response 60 | { 61 | if ($this->middleware instanceof ResponseMiddlewareInterface) { 62 | return $this->middleware->handleResponse($response); 63 | } 64 | 65 | return $response; 66 | } 67 | 68 | public function configure(array $options): void 69 | { 70 | $this->middleware->configure($options); 71 | } 72 | 73 | public function getMiddleware(): ItemMiddlewareInterface|RequestMiddlewareInterface|ResponseMiddlewareInterface 74 | { 75 | return $this->middleware; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/Spider/ParseResult.php: -------------------------------------------------------------------------------- 1 | value; 40 | } 41 | 42 | /** 43 | * @param callable(Response): \Generator $parseCallback 44 | */ 45 | public static function request( 46 | string $method, 47 | string $url, 48 | callable $parseCallback, 49 | array $options = [], 50 | ): self { 51 | return new self(new Request($method, $url, $parseCallback, $options)); 52 | } 53 | 54 | /** 55 | * @param \Closure(Request): void $ifRequest 56 | * @param \Closure(ItemInterface): void $ifItem 57 | */ 58 | public function apply(\Closure $ifRequest, \Closure $ifItem): void 59 | { 60 | if ($this->value 
instanceof Request) { 61 | $ifRequest($this->value); 62 | } else { 63 | $ifItem($this->value); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/Spider/Processor.php: -------------------------------------------------------------------------------- 1 | 28 | */ 29 | private array $middleware = []; 30 | 31 | public function __construct(private EventDispatcherInterface $eventDispatcher) 32 | { 33 | } 34 | 35 | public function withMiddleware(SpiderMiddlewareInterface ...$middleware): self 36 | { 37 | $this->middleware = $middleware; 38 | 39 | return $this; 40 | } 41 | 42 | public function handle(Response $response): \Generator 43 | { 44 | foreach ($this->middleware as $handler) { 45 | $response = $handler->handleResponse($response); 46 | 47 | if ($response->wasDropped()) { 48 | $this->eventDispatcher->dispatch( 49 | new ResponseDropped($response), 50 | ResponseDropped::NAME, 51 | ); 52 | 53 | return; 54 | } 55 | } 56 | 57 | /** @var list $results */ 58 | $results = $response->getRequest()->callback($response); 59 | 60 | foreach ($results as $result) { 61 | $value = $result->value(); 62 | $handleMethod = $value instanceof Request 63 | ? 
'handleRequest' 64 | : 'handleItem'; 65 | 66 | foreach ($this->middleware as $handler) { 67 | /** @var ItemInterface|Request $value */ 68 | $value = $handler->{$handleMethod}($value, $response); 69 | 70 | if ($value->wasDropped()) { 71 | if ($value instanceof Request) { 72 | $this->eventDispatcher->dispatch( 73 | new RequestDropped($value), 74 | RequestDropped::NAME, 75 | ); 76 | } else { 77 | $this->eventDispatcher->dispatch( 78 | new ItemDropped($value), 79 | ItemDropped::NAME, 80 | ); 81 | } 82 | 83 | break; 84 | } 85 | } 86 | 87 | if (!$value->wasDropped()) { 88 | yield ParseResult::fromValue($value); 89 | } 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/Spider/SpiderInterface.php: -------------------------------------------------------------------------------- 1 | 38 | */ 39 | public function getInitialRequests(): array; 40 | } 41 | -------------------------------------------------------------------------------- /src/Spider/SpiderMiddlewareInterface.php: -------------------------------------------------------------------------------- 1 | $options 26 | * 27 | * @return array{0: class-string, 1: array} 28 | */ 29 | public static function withOptions(array $options): array 30 | { 31 | return [static::class, $options]; 32 | } 33 | 34 | /** 35 | * @param array $options 36 | */ 37 | final public function configure(array $options): void 38 | { 39 | if ($this->optionsResolved) { 40 | return; 41 | } 42 | 43 | $resolver = new OptionsResolver(); 44 | 45 | $resolver->setDefaults($this->defaultOptions()); 46 | 47 | $this->resolvedOptions = $resolver->resolve($options); 48 | $this->optionsResolved = true; 49 | 50 | $this->onAfterConfigured(); 51 | } 52 | 53 | public function option(string $key): mixed 54 | { 55 | if (!$this->optionsResolved) { 56 | $this->configure([]); 57 | } 58 | 59 | return $this->resolvedOptions[$key] ?? 
null; 60 | } 61 | 62 | /** 63 | * @return array 64 | */ 65 | private function defaultOptions(): array 66 | { 67 | return []; 68 | } 69 | 70 | /** 71 | * Called after the `configure` method was called on the object the first 72 | * time. This is a good place to perform any one-time setup that should 73 | * happen before the run starts. 74 | */ 75 | private function onAfterConfigured(): void 76 | { 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/Support/ConfigurableInterface.php: -------------------------------------------------------------------------------- 1 | $options 20 | */ 21 | public function configure(array $options): void; 22 | } 23 | -------------------------------------------------------------------------------- /src/Support/Droppable.php: -------------------------------------------------------------------------------- 1 | dropped = true; 26 | $clone->dropReason = $reason; 27 | 28 | return $clone; 29 | } 30 | 31 | public function wasDropped(): bool 32 | { 33 | return $this->dropped; 34 | } 35 | 36 | public function getDropReason(): string 37 | { 38 | return $this->dropReason; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/Support/DroppableInterface.php: -------------------------------------------------------------------------------- 1 | meta[$key] ?? 
$default; 23 | } 24 | 25 | public function withMeta(string $key, mixed $value): static 26 | { 27 | $newThis = clone $this; 28 | $newThis->meta[$key] = $value; 29 | 30 | return $newThis; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Testing/Concerns/InteractsWithRequestsAndResponses.php: -------------------------------------------------------------------------------- 1 | makeRequest(); 33 | 34 | $processedRequest = $middleware->handleRequest($request); 35 | 36 | self::assertSame($jar, $processedRequest->getOptions()['cookies']); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/Downloader/Middleware/ExecuteJavascriptMiddlewareTest.php: -------------------------------------------------------------------------------- 1 | makeResponse( 32 | $this->makeRequest('http://localhost:8000/javascript'), 33 | ); 34 | $middleware = new ExecuteJavascriptMiddleware(new FakeLogger()); 35 | 36 | $processedResponse = $middleware->handleResponse($response); 37 | 38 | self::assertSame('Headline', $processedResponse->filter('#content h1')->text('')); 39 | self::assertSame('I was loaded via Javascript!', $processedResponse->filter('#content p')->text('')); 40 | } 41 | 42 | public function testDropResponseIfExceptionOccursWhileExecutingJavascript(): void 43 | { 44 | $throwingBrowsershot = new class() extends Browsershot { 45 | public function bodyHtml(): string 46 | { 47 | throw new \Exception('::exception-message::'); 48 | } 49 | }; 50 | $middleware = new ExecuteJavascriptMiddleware( 51 | new FakeLogger(), 52 | static fn (string $uri): Browsershot => $throwingBrowsershot->setUrl($uri), 53 | ); 54 | 55 | $processedResponse = $middleware->handleResponse($this->makeResponse()); 56 | 57 | self::assertTrue($processedResponse->wasDropped()); 58 | } 59 | 60 | public function testLogErrors(): void 61 | { 62 | $throwingBrowsershot = new class() extends Browsershot { 63 | public 
function bodyHtml(): string 64 | { 65 | throw new \Exception('::exception-message::'); 66 | } 67 | }; 68 | $logger = new FakeLogger(); 69 | $middleware = new ExecuteJavascriptMiddleware( 70 | $logger, 71 | static fn (string $uri): Browsershot => $throwingBrowsershot->setUrl($uri), 72 | ); 73 | 74 | $middleware->handleResponse($this->makeResponse()); 75 | 76 | self::assertTrue( 77 | $logger->messageWasLogged( 78 | 'info', 79 | '[ExecuteJavascriptMiddleware] Error while executing javascript', 80 | ), 81 | ); 82 | } 83 | 84 | public function testUsesTheProvidedUserAgentOption(): void 85 | { 86 | $mockBrowserShot = $this->createMock(Browsershot::class); 87 | $response = $this->makeResponse( 88 | $this->makeRequest('http://localhost:8000/javascript'), 89 | ); 90 | $middleware = new ExecuteJavascriptMiddleware( 91 | new FakeLogger(), 92 | static fn (string $uri): Browsershot => $mockBrowserShot, 93 | ); 94 | $middleware->configure(['userAgent' => 'custom']); 95 | 96 | $mockBrowserShot->expects(self::once()) 97 | ->method('userAgent') 98 | ->with(self::equalTo('custom')); 99 | 100 | $middleware->handleResponse($response); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /tests/Downloader/Middleware/RobotsTxtMiddlewareTest.php: -------------------------------------------------------------------------------- 1 | engine = new Engine( 50 | new ArrayRequestScheduler(new FakeClock()), 51 | new Downloader(new Client(), $dispatcher), 52 | new ItemPipeline($dispatcher), 53 | new Processor($dispatcher), 54 | $dispatcher, 55 | ); 56 | 57 | $middleware = new RobotsTxtMiddleware(); 58 | $middleware->configure(['fileName' => 'robots']); 59 | $this->middleware = DownloaderMiddlewareAdapter::fromMiddleware($middleware); 60 | } 61 | 62 | public function testOnlyRequestsRobotsTxtOnceForRequestsToSameDomain(): void 63 | { 64 | $parseCallback = static fn () => yield ParseResult::fromValue(self::makeRequest('http://localhost:8000/test2')); 
65 | $run = new Run( 66 | [new Request('GET', 'http://localhost:8000/test1', $parseCallback)], 67 | '::namespace::', 68 | downloaderMiddleware: [$this->middleware], 69 | ); 70 | 71 | $this->engine->start($run); 72 | 73 | $this->assertRouteWasCrawledTimes('/robots', 1); 74 | } 75 | 76 | public function testAllowsRequestIfAllowedByRobotsTxt(): void 77 | { 78 | $run = new Run( 79 | [self::makeRequest('http://localhost:8000/test1')], 80 | '::namespace::', 81 | downloaderMiddleware: [$this->middleware], 82 | ); 83 | 84 | $this->engine->start($run); 85 | 86 | $this->assertRouteWasCrawled('/test1'); 87 | } 88 | 89 | public function testDropRequestIfForbiddenByRobotsTxt(): void 90 | { 91 | $run = new Run( 92 | [self::makeRequest('http://localhost:8000/test2')], 93 | '::namespace::', 94 | downloaderMiddleware: [$this->middleware], 95 | ); 96 | 97 | $this->engine->start($run); 98 | 99 | $this->assertRouteWasNotCrawled('/test2'); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /tests/Downloader/Middleware/UserAgentMiddlewareTest.php: -------------------------------------------------------------------------------- 1 | configure([]); 34 | 35 | $request = $middleware->handleRequest($this->makeRequest()); 36 | 37 | self::assertTrue($request->hasHeader('User-Agent')); 38 | self::assertSame('roach-php', $request->getHeader('User-Agent')[0]); 39 | } 40 | 41 | public function testSetCustomUserAgentOnRequest(): void 42 | { 43 | $middleware = new UserAgentMiddleware(); 44 | $middleware->configure(['userAgent' => 'custom']); 45 | 46 | $request = $middleware->handleRequest($this->makeRequest()); 47 | 48 | self::assertTrue($request->hasHeader('User-Agent')); 49 | self::assertSame('custom', $request->getHeader('User-Agent')[0]); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/Downloader/Proxy/ArrayConfigurationLoaderTest.php: 
-------------------------------------------------------------------------------- 1 | [ 27 | 'https' => '::https-proxy-1::', 28 | 'http' => '::http-proxy-1::', 29 | 'no' => ['::no-1::'], 30 | ], 31 | '::host-2::' => [ 32 | 'https' => '::https-proxy-2::', 33 | 'http' => '::http-proxy-2::', 34 | 'no' => [], 35 | ], 36 | '::host-3::' => [ 37 | 'no' => ['::no-3::'], 38 | ], 39 | ]); 40 | 41 | $proxy = $loader->loadProxyConfiguration(); 42 | self::assertEquals( 43 | new Proxy([ 44 | '::host-1::' => new ProxyOptions( 45 | '::http-proxy-1::', 46 | '::https-proxy-1::', 47 | ['::no-1::'], 48 | ), 49 | '::host-2::' => new ProxyOptions( 50 | '::http-proxy-2::', 51 | '::https-proxy-2::', 52 | [], 53 | ), 54 | '::host-3::' => new ProxyOptions( 55 | null, 56 | null, 57 | ['::no-3::'], 58 | ), 59 | ]), 60 | $proxy, 61 | ); 62 | } 63 | 64 | public function testCreatesAWildcardProxyIfOnlyAURLIsProvided(): void 65 | { 66 | $loader = new ArrayConfigurationLoader('::proxy-url::'); 67 | 68 | $proxy = $loader->loadProxyConfiguration(); 69 | 70 | self::assertEquals( 71 | new Proxy([ 72 | '*' => ProxyOptions::allProtocols('::proxy-url::'), 73 | ]), 74 | $proxy, 75 | ); 76 | } 77 | 78 | public function testConfiguresTheSameURLForAllProtocolsIfOnlyAURLIsProvided(): void 79 | { 80 | $loader = new ArrayConfigurationLoader([ 81 | '::host::' => '::proxy-url::', 82 | ]); 83 | 84 | $proxy = $loader->loadProxyConfiguration(); 85 | 86 | self::assertEquals( 87 | new Proxy([ 88 | '::host::' => ProxyOptions::allProtocols('::proxy-url::'), 89 | ]), 90 | $proxy, 91 | ); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /tests/Downloader/Proxy/ProxyTest.php: -------------------------------------------------------------------------------- 1 | optionsFor($this->makeRequest()); 33 | 34 | self::assertTrue($options->equals(ProxyOptions::make())); 35 | } 36 | 37 | public function testReturnMatchingProxyOptionsForRequestIfConfigured(): void 38 | { 39 | $proxy = 
new Proxy([ 40 | 'domain-1.com' => ProxyOptions::make() 41 | ->allProtocols('::proxy-url-1::'), 42 | 'domain-2.com' => ProxyOptions::make() 43 | ->allProtocols('::proxy-url-2::'), 44 | ]); 45 | 46 | $options = $proxy->optionsFor( 47 | $this->makeRequest('https://domain-1.com'), 48 | ); 49 | self::assertTrue( 50 | $options->equals( 51 | ProxyOptions::make()->allProtocols('::proxy-url-1::'), 52 | ), 53 | ); 54 | 55 | $options = $proxy->optionsFor( 56 | $this->makeRequest('https://domain-2.com'), 57 | ); 58 | self::assertTrue( 59 | $options->equals( 60 | ProxyOptions::make()->allProtocols('::proxy-url-2::'), 61 | ), 62 | ); 63 | } 64 | 65 | public function testReturnsWildcardOptionsIfConfiguredAndDomainDoesntMatch(): void 66 | { 67 | $proxy = new Proxy([ 68 | 'domain-1.com' => ProxyOptions::make() 69 | ->allProtocols('::proxy-url-1::'), 70 | '*' => ProxyOptions::make() 71 | ->allProtocols('::proxy-url-2::'), 72 | ]); 73 | 74 | $options = $proxy->optionsFor( 75 | $this->makeRequest('https://domain-2.com'), 76 | ); 77 | self::assertTrue( 78 | $options->equals( 79 | ProxyOptions::make()->allProtocols('::proxy-url-2::'), 80 | ), 81 | ); 82 | } 83 | 84 | public function testPreferDomainConfigurationOverWildcard(): void 85 | { 86 | $proxy = new Proxy([ 87 | 'domain-1.com' => ProxyOptions::make() 88 | ->allProtocols('::proxy-url-1::'), 89 | '*' => ProxyOptions::make() 90 | ->allProtocols('::proxy-url-2::'), 91 | ]); 92 | 93 | $options = $proxy->optionsFor( 94 | $this->makeRequest('https://domain-1.com'), 95 | ); 96 | self::assertTrue( 97 | $options->equals( 98 | ProxyOptions::make()->allProtocols('::proxy-url-1::'), 99 | ), 100 | ); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /tests/Events/FakeDispatcherTest.php: -------------------------------------------------------------------------------- 1 | dispatcher = new FakeDispatcher(); 30 | } 31 | 32 | public function testAssertDispatchedPassesIfEventWasDispatched(): void 
33 | { 34 | $event = new FakeEvent(); 35 | $this->dispatcher->dispatch($event, 'event.name'); 36 | 37 | $this->dispatcher->assertDispatched('event.name'); 38 | } 39 | 40 | public function testAssertDispatchedFailsIfNoEventWasDispatched(): void 41 | { 42 | $this->expectException(AssertionFailedError::class); 43 | $this->dispatcher->assertDispatched('event.name'); 44 | } 45 | 46 | public function testAssertDispatchedFailsIfCallbackReturnsFalse(): void 47 | { 48 | $this->dispatcher->dispatch(new FakeEvent(), 'event.name'); 49 | 50 | $this->expectException(AssertionFailedError::class); 51 | $this->dispatcher->assertDispatched('event.name', static fn (FakeEvent $event) => false); 52 | } 53 | 54 | public function testAssertDispatchedPassesIfCallbackReturnsTrue(): void 55 | { 56 | $this->dispatcher->dispatch(new FakeEvent(), 'event.name'); 57 | 58 | $this->dispatcher->assertDispatched('event.name', static fn (FakeEvent $event) => true); 59 | } 60 | 61 | public function testAssertNotDispatched(): void 62 | { 63 | $event = new FakeEvent(); 64 | 65 | $this->dispatcher->assertNotDispatched('event.name'); 66 | 67 | $this->dispatcher->dispatch($event, 'event.name'); 68 | $this->expectException(AssertionFailedError::class); 69 | $this->dispatcher->assertNotDispatched('event.name'); 70 | } 71 | 72 | public function testRunEventListeners(): void 73 | { 74 | $called = false; 75 | $this->dispatcher->listen('event.name', static function () use (&$called): void { 76 | $called = true; 77 | }); 78 | 79 | $this->dispatcher->dispatch(new FakeEvent(), 'event.name'); 80 | 81 | self::assertTrue($called); 82 | } 83 | } 84 | 85 | final class FakeEvent 86 | { 87 | public function __construct(public array $data = []) 88 | { 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /tests/Extensions/ExtensionTestCase.php: -------------------------------------------------------------------------------- 1 | dispatcher = new FakeDispatcher(); 33 | 
$this->extension = $this->createExtension(); 34 | 35 | $this->dispatcher->addSubscriber($this->extension); 36 | } 37 | 38 | abstract protected function createExtension(): ExtensionInterface; 39 | 40 | protected function dispatch(Event $event, string $eventName): void 41 | { 42 | $this->dispatcher->dispatch($event, $eventName); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/Extensions/MaxRequestExtensionTest.php: -------------------------------------------------------------------------------- 1 | extension->configure(['limit' => $threshold]); 35 | 36 | for ($i = 0; $threshold - 1 > $i; ++$i) { 37 | $this->dispatch( 38 | new RequestSending($this->makeRequest()), 39 | RequestSending::NAME, 40 | ); 41 | } 42 | 43 | $event = new RequestScheduling($this->makeRequest()); 44 | $this->dispatch($event, RequestScheduling::NAME); 45 | 46 | self::assertFalse($event->request->wasDropped()); 47 | } 48 | 49 | /** 50 | * @dataProvider thresholdProvider 51 | */ 52 | public function testDropRequestAfterThresholdWasReached(int $threshold): void 53 | { 54 | $this->extension->configure(['limit' => $threshold]); 55 | 56 | for ($i = 0; $i < $threshold; ++$i) { 57 | $this->dispatch( 58 | new RequestSending($this->makeRequest()), 59 | RequestSending::NAME, 60 | ); 61 | } 62 | 63 | $event = new RequestScheduling($this->makeRequest()); 64 | $this->dispatch($event, RequestScheduling::NAME); 65 | 66 | self::assertTrue($event->request->wasDropped()); 67 | } 68 | 69 | public static function thresholdProvider(): iterable 70 | { 71 | yield [1]; 72 | 73 | yield [2]; 74 | 75 | yield [3]; 76 | 77 | yield [4]; 78 | } 79 | 80 | protected function createExtension(): ExtensionInterface 81 | { 82 | return new MaxRequestExtension(); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tests/Extensions/ScrapedItemCollectorExtensionTest.php: 
-------------------------------------------------------------------------------- 1 | extension->configure([]); 28 | 29 | self::assertEmpty($this->extension->getScrapedItems()); 30 | 31 | $item = new Item(['::key::' => '::value::']); 32 | $this->dispatch(new ItemScraped($item), ItemScraped::NAME); 33 | 34 | self::assertEquals([$item], $this->extension->getScrapedItems()); 35 | } 36 | 37 | protected function createExtension(): ScrapedItemCollectorExtension 38 | { 39 | return new ScrapedItemCollectorExtension(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/Fixtures/Extension.php: -------------------------------------------------------------------------------- 1 | '::default-option-value::', 32 | ]; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/Fixtures/ItemProcessor.php: -------------------------------------------------------------------------------- 1 | '::default-option-value::', 33 | ]; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tests/Fixtures/ItemSpiderMiddleware.php: -------------------------------------------------------------------------------- 1 | '::default-option-value::', 33 | ]; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tests/Fixtures/RequestSpiderMiddleware.php: -------------------------------------------------------------------------------- 1 | '::default-option-value::', 34 | ]; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/Fixtures/ResponseDownloaderMiddleware.php: -------------------------------------------------------------------------------- 1 | > $handledItemClasses 23 | */ 24 | public function __construct(private array $handledItemClasses) 25 | { 26 | } 27 | 28 | public function processItem(ItemInterface $item): ItemInterface 29 | { 30 | return 
$item->drop('::reason::'); 31 | } 32 | 33 | protected function getHandledItemClasses(): array 34 | { 35 | return $this->handledItemClasses; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/Fixtures/TestItem.php: -------------------------------------------------------------------------------- 1 | client = new FakeClient(); 34 | } 35 | 36 | public function testAssertRequestWasSent(): void 37 | { 38 | $requestA = $this->makeRequest('::url-a::'); 39 | $requestB = $this->makeRequest('::url-b::'); 40 | $requestC = $this->makeRequest('::url-c::'); 41 | 42 | $this->client->pool([$requestA, $requestB]); 43 | 44 | $this->client->assertRequestWasSent($requestA); 45 | $this->client->assertRequestWasSent($requestB); 46 | 47 | $this->expectException(AssertionFailedError::class); 48 | $this->client->assertRequestWasSent($requestC); 49 | } 50 | 51 | public function testAssertRequestWasNotSent(): void 52 | { 53 | $requestA = $this->makeRequest('::url-a::'); 54 | $requestB = $this->makeRequest('::url-b::'); 55 | $requestC = $this->makeRequest('::url-c::'); 56 | 57 | $this->client->pool([$requestC]); 58 | 59 | $this->client->assertRequestWasNotSent($requestA); 60 | $this->client->assertRequestWasNotSent($requestB); 61 | 62 | $this->expectException(AssertionFailedError::class); 63 | $this->client->assertRequestWasNotSent($requestC); 64 | } 65 | 66 | public function testCallOnFulfilledCallbackWithResponseForEachRequest(): void 67 | { 68 | $requests = [ 69 | $this->makeRequest('::url-a::')->withMeta('index', 0), 70 | $this->makeRequest('::url-b::')->withMeta('index', 1), 71 | $this->makeRequest('::url-c::')->withMeta('index', 2), 72 | ]; 73 | 74 | $this->client->pool($requests, static function (Response $response) use (&$requests): void { 75 | self::assertContains($response->getRequest(), $requests); 76 | 77 | // Remove request from array so it can't be used for 78 | // another response as well. 
79 | unset($requests[$response->getRequest()->getMeta('index')]); 80 | }); 81 | 82 | self::assertEmpty($requests); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tests/IntegrationTestCase.php: -------------------------------------------------------------------------------- 1 | skipIfServerNotRunning(); 28 | 29 | if (\file_exists(__DIR__ . '/Server/tmp/crawled.json')) { 30 | \unlink(__DIR__ . '/Server/tmp/crawled.json'); 31 | } 32 | } 33 | 34 | protected function skipIfServerNotRunning(): void 35 | { 36 | if (false === \file_get_contents("{$this->serverUrl}/ping")) { 37 | self::markTestSkipped('Skipping integration test. Server not running.'); 38 | } 39 | } 40 | 41 | protected function assertRouteWasCrawled(string $route): void 42 | { 43 | self::assertArrayHasKey($route, $this->getCrawledRoutes()); 44 | } 45 | 46 | protected function assertRouteWasCrawledTimes(string $route, int $times): void 47 | { 48 | $crawledRoutes = $this->getCrawledRoutes(); 49 | 50 | self::assertArrayHasKey($route, $crawledRoutes); 51 | self::assertSame($times, $crawledRoutes[$route]); 52 | } 53 | 54 | protected function assertRouteWasNotCrawled(string $route): void 55 | { 56 | self::assertArrayNotHasKey($route, $this->getCrawledRoutes()); 57 | } 58 | 59 | private function getCrawledRoutes(): array 60 | { 61 | $response = \file_get_contents("{$this->serverUrl}/crawled-routes"); 62 | 63 | if (!$response) { 64 | return []; 65 | } 66 | 67 | return \json_decode( 68 | $response, 69 | true, 70 | 512, 71 | \JSON_THROW_ON_ERROR, 72 | ); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tests/ItemPipeline/CustomItemProcessorTest.php: -------------------------------------------------------------------------------- 1 | shouldHandle(new TestItem('::foo::', '::bar::')), 31 | ); 32 | 33 | $processor = new TestCustomItemProcessor([TestItem2::class]); 34 | self::assertTrue( 35 | 
$processor->shouldHandle(new TestItem2()), 36 | ); 37 | } 38 | 39 | public function testDoesNotHandleItemsNotDefinedInTheChildClass(): void 40 | { 41 | $processor = new TestCustomItemProcessor([TestItem::class]); 42 | self::assertFalse( 43 | $processor->shouldHandle(new TestItem2()), 44 | ); 45 | 46 | $processor = new TestCustomItemProcessor([TestItem2::class]); 47 | self::assertFalse( 48 | $processor->shouldHandle(new TestItem('::foo::', '::bar::')), 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/ItemPipeline/ItemTest.php: -------------------------------------------------------------------------------- 1 | assertRunWasStarted(TestSpider::class); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tests/Scheduling/Timing/FakeClockTest.php: -------------------------------------------------------------------------------- 1 | clock = new FakeClock(); 30 | } 31 | 32 | public function testWaitUntilTargetTime(): void 33 | { 34 | $now = $this->clock->now(); 35 | 36 | $this->clock->sleepUntil($now->add(new \DateInterval('PT1S'))); 37 | $then1 = $this->clock->now(); 38 | self::assertSame(1, $now->diff($then1)->s); 39 | 40 | $this->clock->sleepUntil($then1->add(new \DateInterval('PT1S'))); 41 | $then2 = $this->clock->now(); 42 | self::assertSame(1, $then1->diff($then2)->s); 43 | self::assertSame(2, $now->diff($then2)->s); 44 | } 45 | 46 | public function testDontWaitIfTargetDateIsInPast(): void 47 | { 48 | $now = $this->clock->now(); 49 | 50 | $this->clock->sleepUntil($now->sub(new \DateInterval('PT2S'))); 51 | $then = $this->clock->now(); 52 | self::assertSame(0, $now->diff($then)->s); 53 | } 54 | 55 | public function testRecordTimePassedSleepUntil(): void 56 | { 57 | $clock = new FakeClock(); 58 | 59 | self::assertSame(0, $clock->timePassed()); 60 | 61 | $clock->sleepUntil($clock->now()->add(new \DateInterval('PT5S'))); 62 | self::assertSame(5, 
$clock->timePassed()); 63 | 64 | $clock->sleepUntil($clock->now()->add(new \DateInterval('PT2S'))); 65 | self::assertSame(7, $clock->timePassed()); 66 | 67 | $clock->sleepUntil($clock->now()->add(new \DateInterval('PT3S'))); 68 | self::assertSame(10, $clock->timePassed()); 69 | } 70 | 71 | public function testRecordTimePassedSleep(): void 72 | { 73 | $clock = new FakeClock(); 74 | 75 | self::assertSame(0, $clock->timePassed()); 76 | 77 | $clock->sleep(5); 78 | self::assertSame(5, $clock->timePassed()); 79 | 80 | $clock->sleep(2); 81 | self::assertSame(7, $clock->timePassed()); 82 | 83 | $clock->sleep(3); 84 | self::assertSame(10, $clock->timePassed()); 85 | } 86 | 87 | protected function createClock(): ClockInterface 88 | { 89 | return new FakeClock(); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /tests/Server/index.php: -------------------------------------------------------------------------------- 1 | add(static function (ServerRequestInterface $request, RequestHandlerInterface $handler): Response { 27 | /* Counts every handled request per URI path in the JSON file at LOG_PATH; '/ping' and '/crawled-routes' are passed through uncounted. */ $ignoredRoutes = ['/ping', '/crawled-routes']; 28 | $path = $request->getUri()->getPath(); 29 | 30 | if (\in_array($path, $ignoredRoutes, true)) { 31 | return $handler->handle($request); 32 | } 33 | 34 | if (!\file_exists(LOG_PATH)) { 35 | \file_put_contents(LOG_PATH, '{}'); 36 | } 37 | 38 | try { 39 | $logs = \json_decode( 40 | \file_get_contents(LOG_PATH), // @phpstan-ignore argument.type 41 | true, 42 | 512, 43 | \JSON_THROW_ON_ERROR, 44 | ); 45 | } catch (JsonException) { 46 | $logs = []; 47 | } 48 | 49 | if (!isset($logs[$path])) { 50 | $logs[$path] = 0; 51 | } 52 | 53 | ++$logs[$path]; 54 | /* Persist to LOG_PATH, the same file the counters were read from above, instead of a second hard-coded path. */ \file_put_contents(LOG_PATH,
\json_encode($logs, \JSON_THROW_ON_ERROR)); 55 | 56 | return $handler->handle($request); 57 | }); 58 | 59 | $app->get('/ping', static function (Request $request, Response $response, $args) { 60 | $response->getBody()->write('pong'); 61 | 62 | return $response; 63 | }); 64 | 65 | $app->get('/crawled-routes', static function (Request $request, Response $response, $args): Response { 66 | $stats = \file_get_contents(LOG_PATH); 67 | 68 | if (false === $stats) { 69 | $stats = '{}'; 70 | } 71 | 72 | $response->getBody()->write($stats); 73 | 74 | return $response 75 | ->withHeader('Content-Type', 'application/json'); 76 | }); 77 | 78 | $app->get('/robots', static function (Request $request, Response $response, $args): Response { 79 | $robots = <<<'PLAIN' 80 | User-agent: * 81 | Disallow: /test2 82 | PLAIN; 83 | 84 | $response->getBody()->write($robots); 85 | 86 | return $response->withAddedHeader('Content-type', 'text/plain'); 87 | }); 88 | 89 | $app->get('/test1', static function (Request $request, Response $response, $args) { 90 | $response->getBody()->write('

Such headline, wow

'); 91 | 92 | return $response; 93 | }); 94 | 95 | $app->get('/test2', static function (Request $request, Response $response, $args) { 96 | $response->getBody()->write(''); 97 | 98 | return $response; 99 | }); 100 | 101 | $app->get('/test3', static function (Request $request, Response $response, $args) { 102 | return $response; 103 | }); 104 | 105 | $app->get('/javascript', static function (Request $request, Response $response, $args) { 106 | $body = <<<'HTML' 107 |
Loading...
108 | 112 | HTML; 113 | 114 | $response->getBody()->write($body); 115 | 116 | return $response; 117 | }); 118 | 119 | $app->run(); 120 | -------------------------------------------------------------------------------- /tests/Server/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tests/Shell/Commands/RunSpiderCommandTest.php: -------------------------------------------------------------------------------- 1 | execute([ 39 | 'spider' => TestSpider::class, 40 | ]); 41 | 42 | $commandTester->assertCommandIsSuccessful(); 43 | $runner->assertRunWasStarted(TestSpider::class); 44 | } 45 | 46 | public function testPrintsAnErrorIfTheProvidedSpiderClassWasInvalid(): void 47 | { 48 | $commandTester = new CommandTester(new RunSpiderCommand()); 49 | 50 | $commandTester->execute([ 51 | 'spider' => '::not-a-spider::', 52 | ]); 53 | 54 | self::assertSame(Command::FAILURE, $commandTester->getStatusCode()); 55 | self::assertStringContainsString('Invalid spider:', $commandTester->getDisplay(true)); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tests/Shell/Resolver/FakeNamespaceResolverTest.php: -------------------------------------------------------------------------------- 1 | resolveSpiderNamespace($input); 32 | 33 | self::assertSame($input, $result); 34 | } 35 | 36 | /** 37 | * @return iterable> 38 | */ 39 | public static function inputStringProvider(): iterable 40 | { 41 | yield from [ 42 | ['::string-1::'], 43 | [TestSpider::class], 44 | ['::string-2::'], 45 | [RequestSpiderMiddleware::class], 46 | ]; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/Shell/Resolver/StaticNamespaceResolverTest.php: -------------------------------------------------------------------------------- 1 | 
resolveSpiderNamespace('RoachPHP\Tests\Fixtures\TestSpider'); 31 | 32 | self::assertSame('RoachPHP\Tests\Fixtures\TestSpider', $result); 33 | } 34 | 35 | public function testThrowsExceptionIfTheProvidedSpiderClassDoesNotExist(): void 36 | { 37 | $resolver = new StaticNamespaceResolver(); 38 | 39 | $this->expectException(InvalidSpiderException::class); 40 | $this->expectExceptionMessage('The spider class ::spider-class:: does not exist'); 41 | 42 | $resolver->resolveSpiderNamespace('::spider-class::'); 43 | } 44 | 45 | public function testThrowsExceptionIfTheProvidedClassIsNotASpider(): void 46 | { 47 | $resolver = new StaticNamespaceResolver(); 48 | 49 | $this->expectException(InvalidSpiderException::class); 50 | $this->expectExceptionMessage(\sprintf('The class %s is not a spider', RequestSpiderMiddleware::class)); 51 | 52 | $resolver->resolveSpiderNamespace(RequestSpiderMiddleware::class); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/Spider/Configuration/ArrayLoaderTest.php: -------------------------------------------------------------------------------- 1 | load(); 31 | 32 | $expected = new Configuration([], [], [], [], [], 5, 0); 33 | self::assertEquals($expected, $actual); 34 | } 35 | 36 | public function testMergePartialOptions(): void 37 | { 38 | $loader = new ArrayLoader([ 39 | 'startUrls' => ['::start-url::'], 40 | 'extensions' => [LoggerExtension::class], 41 | 'concurrency' => 2, 42 | ]); 43 | 44 | $actual = $loader->load(); 45 | 46 | $expected = new Configuration(['::start-url::'], [], [], [], [LoggerExtension::class], 2, 0); 47 | self::assertEquals($expected, $actual); 48 | } 49 | 50 | public function testMergeAllOptions(): void 51 | { 52 | $loader = new ArrayLoader([ 53 | 'startUrls' => ['::start-url::'], 54 | 'downloaderMiddleware' => ['::downloader-middleware::'], 55 | 'spiderMiddleware' => ['::spider-middleware::'], 56 | 'itemProcessors' => ['::item-processor::'], 57 | 'extensions' => 
[LoggerExtension::class], 58 | 'concurrency' => 2, 59 | 'requestDelay' => 2, 60 | ]); 61 | 62 | $actual = $loader->load(); 63 | 64 | $expected = new Configuration( 65 | ['::start-url::'], 66 | ['::downloader-middleware::'], // @phpstan-ignore argument.type 67 | ['::item-processor::'], // @phpstan-ignore argument.type 68 | ['::spider-middleware::'], // @phpstan-ignore argument.type 69 | [LoggerExtension::class], 70 | 2, 71 | 2, 72 | ); 73 | self::assertEquals($expected, $actual); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/Spider/ParseResultTest.php: -------------------------------------------------------------------------------- 1 | yield from []); 30 | 31 | $result->apply( 32 | static fn (Request $request) => self::assertEquals('::url::', (string) $request->getUri()), 33 | static fn () => self::fail('Should not have been called'), 34 | ); 35 | } 36 | 37 | public function testPassesItemToCallbackIfResultIsItem(): void 38 | { 39 | $result = ParseResult::item(['::key::' => '::value::']); 40 | 41 | $result->apply( 42 | static fn () => self::fail('Should not have been called'), 43 | static fn (ItemInterface $item) => self::assertSame('::value::', $item->get('::key::')), 44 | ); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tests/Spider/SpiderTestCase.php: -------------------------------------------------------------------------------- 1 | assertRouteWasCrawledTimes('/test1', 1); 47 | $this->assertRouteWasCrawledTimes('/test2', 1); 48 | } 49 | 50 | public function testOverrideInitialRequests(): void 51 | { 52 | $spider = new class() extends BasicSpider { 53 | // Don't want logging in this test 54 | public array $extensions = []; 55 | 56 | public function parse(Response $response): \Generator 57 | { 58 | yield from []; 59 | } 60 | 61 | protected function initialRequests(): array 62 | { 63 | return [new Request('GET', 'http://localhost:8000/test1', [$this, 
'parse'])]; 64 | } 65 | }; 66 | 67 | Roach::startSpider($spider::class); 68 | 69 | $this->assertRouteWasCrawledTimes('/test1', 1); 70 | } 71 | 72 | public function testCanAccessRunContextFromWithinSpider(): void 73 | { 74 | $spider = new class() extends BasicSpider { 75 | public array $extensions = []; 76 | 77 | public function parse(Response $response): \Generator 78 | { 79 | yield from []; 80 | } 81 | 82 | protected function initialRequests(): array 83 | { 84 | return [ 85 | new Request( 86 | 'GET', 87 | // Use initialRequest from passed request context as a heuristic 88 | // if context can be accessed. 89 | $this->context['initialRequest'], 90 | [$this, 'parse'], 91 | ), ]; 92 | } 93 | }; 94 | 95 | Roach::startSpider($spider::class, context: ['initialRequest' => 'http://localhost:8000/test1']); 96 | 97 | $this->assertRouteWasCrawledTimes('/test1', 1); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /tests/Support/DroppableTestCase.php: -------------------------------------------------------------------------------- 1 | createDroppable(); 27 | 28 | self::assertFalse($droppable->wasDropped()); 29 | 30 | $droppable = $droppable->drop('::reason::'); 31 | 32 | self::assertTrue($droppable->wasDropped()); 33 | } 34 | 35 | public function testGetReason(): void 36 | { 37 | $droppable = $this->createDroppable(); 38 | 39 | $dropped = $droppable->drop('::reason::'); 40 | 41 | self::assertSame('::reason::', $dropped->getDropReason()); 42 | } 43 | 44 | abstract protected function createDroppable(): DroppableInterface; 45 | } 46 | -------------------------------------------------------------------------------- /tests/Testing/FakeLoggerTest.php: -------------------------------------------------------------------------------- 1 | $context 30 | */ 31 | public function testCheckIfSpecificMessageWasLoggedAtLevel(string $level, string $message, array $context): void 32 | { 33 | $logger = new FakeLogger(); 34 | 35 | 
self::assertFalse($logger->messageWasLogged($level, $message)); 36 | 37 | $logger->{$level}($message, $context); 38 | 39 | self::assertTrue($logger->messageWasLogged($level, $message)); 40 | } 41 | 42 | /** 43 | * @dataProvider logMessageProvider 44 | * 45 | * @param array $context 46 | */ 47 | public function testCheckIfMessageWasLoggedWithContext(string $level, string $message, array $context): void 48 | { 49 | $logger = new FakeLogger(); 50 | 51 | $logger->{$level}($message, []); 52 | self::assertFalse($logger->messageWasLogged($level, $message, $context)); 53 | 54 | $logger->{$level}($message, $context); 55 | self::assertTrue($logger->messageWasLogged($level, $message, $context)); 56 | } 57 | 58 | /** 59 | * @return iterable}> 60 | */ 61 | public static function logMessageProvider(): iterable 62 | { 63 | yield from [ 64 | 'debug' => [ 65 | 'debug', '::debug-message::', ['::debug-context::'], 66 | ], 67 | 'info' => [ 68 | 'info', '::info-message::', ['::info-context::'], 69 | ], 70 | 'notice' => [ 71 | 'notice', '::notice-message::', ['::notice-context::'], 72 | ], 73 | 'warning' => [ 74 | 'warning', '::warning-message::', ['::warning-context::'], 75 | ], 76 | 'error' => [ 77 | 'error', '::error-message::', ['::error-context::'], 78 | ], 79 | 'critical' => [ 80 | 'critical', '::critical-message::', ['::critical-context::'], 81 | ], 82 | 'alert' => [ 83 | 'alert', '::alert-message::', ['::alert-context::'], 84 | ], 85 | 'emergency' => [ 86 | 'emergency', '::emergency-message::', ['::emergency-context::'], 87 | ], 88 | ]; 89 | } 90 | } 91 | --------------------------------------------------------------------------------