├── .github
├── CODEOWNERS
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── documentation.md
│ └── feature_request.md
├── dependabot.yml
└── workflows
│ ├── commitlint.yml
│ ├── fix-style.yml
│ ├── release-please.yml
│ └── run-tests.yml
├── .gitignore
├── .php-cs-fixer.php
├── .phpactor.json
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── README.md
├── composer.json
├── composer.lock
├── phpstan-baseline.neon
├── phpstan.neon
├── phpunit.xml
├── psalm.xml
├── roach
├── src
├── Core
│ ├── DefaultContainer.php
│ ├── Engine.php
│ ├── EngineInterface.php
│ ├── FakeRunner.php
│ ├── Run.php
│ ├── RunFactory.php
│ ├── Runner.php
│ ├── RunnerInterface.php
│ └── Version.php
├── Downloader
│ ├── Downloader.php
│ ├── DownloaderMiddlewareInterface.php
│ ├── Middleware
│ │ ├── CookieMiddleware.php
│ │ ├── DownloaderMiddlewareAdapter.php
│ │ ├── ExecuteJavascriptMiddleware.php
│ │ ├── FakeMiddleware.php
│ │ ├── HttpErrorMiddleware.php
│ │ ├── ProxyMiddleware.php
│ │ ├── RequestDeduplicationMiddleware.php
│ │ ├── RequestMiddlewareInterface.php
│ │ ├── ResponseMiddlewareInterface.php
│ │ ├── RobotsTxtMiddleware.php
│ │ └── UserAgentMiddleware.php
│ └── Proxy
│ │ ├── ArrayConfigurationLoader.php
│ │ ├── ConfigurationLoaderInterface.php
│ │ ├── Proxy.php
│ │ └── ProxyOptions.php
├── Events
│ ├── FakeDispatcher.php
│ ├── ItemDropped.php
│ ├── ItemScraped.php
│ ├── RequestDropped.php
│ ├── RequestScheduling.php
│ ├── RequestSending.php
│ ├── ResponseDropped.php
│ ├── ResponseReceived.php
│ ├── ResponseReceiving.php
│ ├── RunFinished.php
│ └── RunStarting.php
├── Extensions
│ ├── ExtensionInterface.php
│ ├── LoggerExtension.php
│ ├── MaxRequestExtension.php
│ ├── ScrapedItemCollectorExtension.php
│ └── StatsCollectorExtension.php
├── Http
│ ├── Client.php
│ ├── ClientInterface.php
│ ├── FakeClient.php
│ ├── MalformedUriException.php
│ ├── Query.php
│ ├── QueryParameterTypeMismatchException.php
│ ├── Request.php
│ ├── RequestException.php
│ ├── Response.php
│ ├── URL.php
│ └── UnknownQueryParameterException.php
├── ItemPipeline
│ ├── AbstractItem.php
│ ├── Item.php
│ ├── ItemInterface.php
│ ├── ItemPipeline.php
│ ├── ItemPipelineInterface.php
│ └── Processors
│ │ ├── ConditionalItemProcessor.php
│ │ ├── CustomItemProcessor.php
│ │ ├── FakeProcessor.php
│ │ └── ItemProcessorInterface.php
├── Roach.php
├── Scheduling
│ ├── ArrayRequestScheduler.php
│ ├── RequestSchedulerInterface.php
│ └── Timing
│ │ ├── ClockInterface.php
│ │ ├── FakeClock.php
│ │ └── SystemClock.php
├── Shell
│ ├── Commands
│ │ ├── FetchCommand.php
│ │ └── RunSpiderCommand.php
│ ├── InvalidSpiderException.php
│ ├── Repl.php
│ ├── Resolver
│ │ ├── DefaultNamespaceResolverDecorator.php
│ │ ├── FakeNamespaceResolver.php
│ │ ├── NamespaceResolverInterface.php
│ │ └── StaticNamespaceResolver.php
│ └── ShellCaster.php
├── Spider
│ ├── AbstractSpider.php
│ ├── BasicSpider.php
│ ├── Configuration
│ │ ├── ArrayLoader.php
│ │ ├── Configuration.php
│ │ └── Overrides.php
│ ├── ConfigurationLoaderStrategy.php
│ ├── Middleware
│ │ ├── FakeHandler.php
│ │ ├── ItemMiddlewareInterface.php
│ │ ├── MaximumCrawlDepthMiddleware.php
│ │ ├── RequestMiddlewareInterface.php
│ │ ├── ResponseMiddlewareInterface.php
│ │ └── SpiderMiddlewareAdapter.php
│ ├── ParseResult.php
│ ├── Processor.php
│ ├── SpiderInterface.php
│ └── SpiderMiddlewareInterface.php
├── Support
│ ├── Configurable.php
│ ├── ConfigurableInterface.php
│ ├── Droppable.php
│ ├── DroppableInterface.php
│ └── HasMetaData.php
└── Testing
│ ├── Concerns
│ └── InteractsWithRequestsAndResponses.php
│ └── FakeLogger.php
└── tests
├── Core
├── EngineTest.php
└── RunFactoryTest.php
├── Downloader
├── DownloaderMiddlewareAdapterTest.php
├── DownloaderTest.php
├── Middleware
│ ├── CookieMiddlewareTest.php
│ ├── ExecuteJavascriptMiddlewareTest.php
│ ├── FakeMiddlewareTest.php
│ ├── HttpErrorMiddlewareTest.php
│ ├── ProxyMiddlewareTest.php
│ ├── RequestDeduplicationMiddlewareTest.php
│ ├── RobotsTxtMiddlewareTest.php
│ └── UserAgentMiddlewareTest.php
└── Proxy
│ ├── ArrayConfigurationLoaderTest.php
│ ├── ProxyOptionsTest.php
│ └── ProxyTest.php
├── Events
└── FakeDispatcherTest.php
├── Extensions
├── ExtensionTestCase.php
├── LoggerExtensionTest.php
├── MaxRequestExtensionTest.php
├── ScrapedItemCollectorExtensionTest.php
└── StatsCollectorExtensionTest.php
├── Fixtures
├── Extension.php
├── ItemProcessor.php
├── ItemSpiderMiddleware.php
├── RequestDownloaderMiddleware.php
├── RequestSpiderMiddleware.php
├── ResponseDownloaderMiddleware.php
├── ResponseSpiderMiddleware.php
├── TestCustomItemProcessor.php
├── TestItem.php
├── TestItem2.php
├── TestSpider.php
└── TestSpider2.php
├── Http
├── ClientTest.php
├── FakeClientTest.php
├── QueryTest.php
├── RequestTest.php
├── ResponseTest.php
└── URLTest.php
├── IntegrationTestCase.php
├── ItemPipeline
├── AbstractItemTest.php
├── CustomItemProcessorTest.php
├── ItemPipelineTest.php
└── ItemTest.php
├── RoachTest.php
├── Scheduling
├── ArrayRequestSchedulerTest.php
└── Timing
│ └── FakeClockTest.php
├── Server
├── index.php
└── tmp
│ └── .gitignore
├── Shell
├── Commands
│ └── RunSpiderCommandTest.php
└── Resolver
│ ├── DefaultNamespaceResolverDecoratorTest.php
│ ├── FakeNamespaceResolverTest.php
│ └── StaticNamespaceResolverTest.php
├── Spider
├── Configuration
│ ├── ArrayLoaderTest.php
│ └── ConfigurationTest.php
├── Middleware
│ ├── FakeHandlerTest.php
│ ├── FakeProcessorTest.php
│ ├── MaximumCrawlDepthMiddlewareTest.php
│ └── SpiderMiddlewareAdapterTest.php
├── ParseResultTest.php
├── ProcessorTest.php
└── SpiderTestCase.php
├── Support
└── DroppableTestCase.php
└── Testing
├── FakeLoggerTest.php
└── FakeRunnerTest.php
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @ksassnowski
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: ksassnowski
4 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help improve Roach
4 | title: ''
5 | labels: bug
6 | assignees: ksassnowski
7 | ---
8 |
9 | **Describe the bug**
10 | A clear and concise description of what the bug is.
11 |
12 | **Reproduction**
13 | Please include a link to a minimal repository that reproduces the issue.
14 |
15 | **Expected behavior**
16 | If applicable, a clear and concise description of what you expected to happen.
17 |
18 | **Package versions (please complete the following information):**
19 |
20 | - core: [e.g. 1.0.0]
21 |
22 | **Additional context**
23 | Add any other context about the problem here.
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation
3 | about: Report an issue or suggest improvements for documentation
4 | title: ''
5 | labels: documentation
6 | assignees: ksassnowski
7 | ---
8 |
9 | **Description**
10 | A clear and concise description of what the issue is.
11 | How can it be improved?
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for Roach
4 | title: ''
5 | labels: enhancement
6 | assignees: ksassnowski
7 | ---
8 |
9 | **Description**
10 | A clear and concise description of why the feature is needed.
11 | What problem does it aim to fix?
12 | What benefits does it bring?
13 |
14 | **Proposed solution**
15 | A clear and concise description of how the feature would work.
16 | If applicable, provide an example of the API and how it would be used.
17 |
18 | **Considered alternatives**
19 | A clear and concise description of any alternative solutions or features you've considered.
20 |
21 | **Additional context**
22 | Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "composer"
4 | directory: "/"
5 | commit-message:
6 | prefix: "chore"
7 | include: "scope"
8 | schedule:
9 | interval: "weekly"
10 |
--------------------------------------------------------------------------------
/.github/workflows/commitlint.yml:
--------------------------------------------------------------------------------
1 | name: Lint Commit Messages
2 | on: [pull_request]
3 |
4 | jobs:
5 | commitlint:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - uses: actions/checkout@v4
9 | with:
10 | fetch-depth: 0
11 | - uses: wagoid/commitlint-github-action@v5
--------------------------------------------------------------------------------
/.github/workflows/fix-style.yml:
--------------------------------------------------------------------------------
1 | name: fix-style
2 |
3 | on: [push]
4 |
5 | jobs:
6 | cs-fix:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - name: Get branch names
11 | id: branch-name
12 | uses: tj-actions/branch-names@v8
13 |
14 | - name: Checkout code
15 | uses: actions/checkout@v4
16 | with:
17 | ref: ${{ github.head_ref }}
18 |
19 | - name: Setup PHP
20 | uses: shivammathur/setup-php@v2
21 | with:
22 | php-version: 8.3
23 |
24 | - name: Install dependencies
25 | run: composer install
26 |
27 | - name: Fix style
28 | run: ./vendor/bin/php-cs-fixer fix --allow-risky=yes --using-cache=no
29 |
30 | - name: Commit style fixes
31 | uses: stefanzweifel/git-auto-commit-action@v5
32 | with:
33 | commit_message: Apply php-cs-fixer changes
34 |
--------------------------------------------------------------------------------
/.github/workflows/release-please.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 | branches:
4 | - main
5 |
6 | permissions:
7 | contents: write
8 | pull-requests: write
9 |
10 | name: release-please
11 |
12 | jobs:
13 | release-please:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: google-github-actions/release-please-action@v3
17 | with:
18 | release-type: php
19 | package-name: roach-php/core
--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: ${{ matrix.os }}
8 | strategy:
9 | fail-fast: false
10 | matrix:
11 | os: [ubuntu-latest]
12 | php: ['8.2', '8.3', '8.4']
13 | dependency-version: [prefer-lowest, prefer-stable]
14 |
15 | name: P${{ matrix.php }} - ${{ matrix.dependency-version }} - ${{ matrix.os }}
16 |
17 | steps:
18 | - name: Checkout code
19 | uses: actions/checkout@v4
20 |
21 | - name: Disable AppArmor
22 | run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns
23 |
24 | - name: Install Puppeteer
25 | run: npm install puppeteer
26 |
27 | - name: Setup PHP
28 | uses: shivammathur/setup-php@v2
29 | with:
30 | php-version: ${{ matrix.php }}
31 | extensions: dom, curl
32 | coverage: none
33 |
34 | - name: "Validate composer.json and composer.lock"
35 | run: "composer validate --strict"
36 |
37 | - name: "Determine composer cache directory"
38 | id: "determine-composer-cache-directory"
39 | run: echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT
40 |
41 | - name: "Cache dependencies installed with composer"
42 | uses: "actions/cache@v4"
43 | with:
44 | path: "${{ steps.determine-composer-cache-directory.outputs.dir }}"
45 | key: "php-${{ matrix.php }}-composer-${{ matrix.dependency-version }}-${{ hashFiles('composer.lock') }}"
46 | restore-keys: "php-${{ matrix.php }}-composer-${{ matrix.dependency-version }}-"
47 |
48 | - name: Install dependencies
49 | run: composer update --${{ matrix.dependency-version }} --no-interaction --prefer-dist
50 |
51 | - name: Run phpstan
52 | run: composer analyze
53 |
54 | - name: Start server
55 | run: (php -S localhost:8000 -t ./tests/Server &) || /bin/true
56 |
57 | - name: Wait for server bootup
58 | run: sleep 3
59 |
60 | - name: Execute tests
61 | run: vendor/bin/phpunit
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /vendor/
2 | /.build/
3 | .phpunit.result.cache
4 | .phpunit.cache/
5 | /coverage/
6 | /package-lock.json
7 | /package.json
8 | /node_modules
9 |
--------------------------------------------------------------------------------
/.php-cs-fixer.php:
--------------------------------------------------------------------------------
1 | withHeader($header)
16 | ->withRules(Config\Rules::fromArray([
17 | 'php_unit_test_class_requires_covers' => false,
18 | 'class_attributes_separation' => [
19 | 'elements' => [
20 | 'const' => 'one',
21 | 'method' => 'one',
22 | 'property' => 'one',
23 | 'trait_import' => 'none',
24 | ],
25 | ],
26 | 'error_suppression' => [
27 | 'noise_remaining_usages' => false,
28 | ],
29 | ]));
30 |
31 | $config = Config\Factory::fromRuleSet($ruleSet);
32 |
33 | $config->getFinder()->in(__DIR__);
34 | $config->setCacheFile(__DIR__ . '/.build/php-cs-fixer/.php-cs-fixer.cache');
35 |
36 | return $config;
--------------------------------------------------------------------------------
/.phpactor.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "/phpactor.schema.json",
3 | "language_server_phpstan.enabled": false,
4 | "language_server_php_cs_fixer.enabled": false
5 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Roach
2 |
3 | This is the Contribution Guide for Roach PHP. Please read this document
4 | carefully before opening an issue or a pull request.
5 |
6 | ## Code of Conduct
7 |
8 | Before contributing to the project, please read our
9 | [Code of Conduct](./CODE_OF_CONDUCT.md).
10 |
11 | ## Reporting a bug
12 |
13 | Before you submit an issue, please search [the issue tracker][issues]. An issue
14 | for your problem might already exist and the discussion might inform you of
15 | workarounds readily available.
16 |
17 | You can file new issues by [selecting an issue template][new-issue] and filling
18 | out the necessary information.
19 |
20 | ## Proposing a Change
21 |
22 | If you intend to change the public API or make any non-trivial changes to the
23 | implementation, make sure to [create an issue][new-feature] first. This will let
24 | us discuss a proposal before you put significant effort into it.
25 |
26 | If you're only fixing a bug or a typo, it's fine to submit a pull request right
27 | away without creating an issue, but make sure it contains a clear and concise
28 | description of the bug.
29 |
30 | ## Working on Issues
31 |
32 | Before you start working on an issue make sure that it has been accepted
33 | (indicated by an [`accepted`][label-accepted] label) and that no one has
34 | claimed it yet. Otherwise, you may duplicate other people's efforts. If somebody
35 | claims an issue but doesn't follow up for more than two weeks, it’s fine to take
36 | it over, but you should still leave a comment. You should also leave a comment
37 | on any issue you're working on, to let others know.
38 |
39 | ## Semantic Versioning
40 |
41 | Roach follows [semantic versioning][semver].
42 |
43 | ## Making a Pull Request
44 |
45 | 1. Fork the roach-php/core repo.
46 | 2. In your forked repo, create a new branch for your changes:
47 | ```shell
48 | git checkout -b my-fix-branch main
49 | ```
50 | 3. Update the code. **Make sure that all your changes are covered by tests.**
51 | 4. Commit your changes using a **descriptive commit message** that follows the
52 | [Angular Commit Message Conventions][commit-format].
53 | ```shell
54 | git commit --all
55 | ```
56 | 5. Push your branch to GitHub:
57 | ```shell
58 | git push origin my-fix-branch
59 | ```
60 | 6. In GitHub, send a pull request to [the main branch][main].
61 |
62 | ### Addressing review feedback
63 |
64 | 1. Make required updates to the code.
65 | 2. Create a fixup commit and push it to your GitHub repo:
66 | ```shell
67 | git commit --all --fixup HEAD
68 | git push
69 | ```
70 |
71 | ## Attribution
72 |
73 | This Contribution Guide was adapted from the [Motion Canvas][motion-canvas]
74 | Contribution Guide.
75 |
76 | [semver]: https://semver.org/
77 | [semantic-release]: https://semantic-release.gitbook.io/semantic-release/support/faq#can-i-set-the-initial-release-version-of-my-package-to-0.0.1
78 | [main]: https://github.com/roach-php/core/tree/main
79 | [issues]: https://github.com/roach-php/core/issues
80 | [new-issue]: https://github.com/roach-php/core/issues/new/choose
81 | [new-feature]: https://github.com/roach-php/core/issues/new?template=feature_request.md
82 | [commit-format]: https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit
83 | [motion-canvas]: https://github.com/motion-canvas/motion-canvas/blob/main/CONTRIBUTING.md
84 | [label-accepted]: https://github.com/roach-php/core/labels/accepted
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | 🐴 Roach
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | A complete web scraping toolkit for PHP
20 |
21 | ## About
22 |
23 | Roach is a complete web scraping toolkit for PHP. It is heavily inspired by (read: a shameless clone of) the popular [Scrapy](https://docs.scrapy.org)
24 | package for Python.
25 |
26 | ## Installation
27 |
28 | Install the package via composer
29 |
30 | ```bash
31 | composer require roach-php/core
32 | ```
33 |
34 | ## Documentation
35 |
36 | The full documentation can be found [here](https://roach-php.dev).
37 |
38 | ## Contributing
39 |
40 | Please read our [Contribution Guide][contribution-guide] before opening issues
41 | or pull requests.
42 |
43 | ## Credits
44 |
45 | - [Kai Sassnowski](https://github.com/ksassnowski)
46 | - [All contributors](https://github.com/roach-php/core/contributors)
47 |
48 | ## License
49 |
50 | MIT
51 |
52 | [contribution-guide]: https://github.com/roach-php/core/blob/main/CONTRIBUTING.md
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "roach-php/core",
3 | "description": "A complete web scraping toolkit for PHP",
4 | "license": "MIT",
5 | "type": "library",
6 | "authors": [
7 | {
8 | "name": "Kai Sassnowski",
9 | "email": "me@kai-sassnowski.com"
10 | }
11 | ],
12 | "require": {
13 | "php": "~8.2.0 || ~8.3.0 || ~8.4.0",
14 | "guzzlehttp/guzzle": "^7.8.0",
15 | "jakeasmith/http_build_url": "^1.0.1",
16 | "league/container": "^4.2",
17 | "monolog/monolog": "^3.5",
18 | "nyholm/psr7": "^1.8.1",
19 | "nyholm/psr7-server": "^1.1",
20 | "psr/container": "^2.0.2",
21 | "psy/psysh": "^0.11.22 || ^0.12.0",
22 | "spatie/robots-txt": "^2.0.3",
23 | "symfony/console": "^7.0",
24 | "symfony/css-selector": "^7.0",
25 | "symfony/dom-crawler": "^7.0",
26 | "symfony/event-dispatcher": "^7.0",
27 | "symfony/options-resolver": "^7.0"
28 | },
29 | "require-dev": {
30 | "ergebnis/composer-normalize": "^2.45",
31 | "ergebnis/php-cs-fixer-config": "^6.45.0",
32 | "http-interop/http-factory-guzzle": "^1.2",
33 | "phpstan/phpstan": "^2.1",
34 | "phpunit/phpunit": "^10.4.2",
35 | "psr/http-message": "^1.1.0",
36 | "roave/security-advisories": "dev-latest",
37 | "slim/slim": "^4.12",
38 | "spatie/browsershot": "^5.0"
39 | },
40 | "suggest": {
41 | "spatie/browsershot": "Required to execute Javascript in spiders"
42 | },
43 | "autoload": {
44 | "psr-4": {
45 | "RoachPHP\\": "src/"
46 | }
47 | },
48 | "autoload-dev": {
49 | "psr-4": {
50 | "RoachPHP\\Tests\\": "tests/"
51 | }
52 | },
53 | "bin": [
54 | "roach"
55 | ],
56 | "config": {
57 | "allow-plugins": {
58 | "composer/package-versions-deprecated": true,
59 | "ergebnis/composer-normalize": true
60 | }
61 | },
62 | "scripts": {
63 | "post-install-cmd": [
64 | "composer normalize"
65 | ],
66 | "post-update-cmd": [
67 | "composer normalize"
68 | ],
69 | "analyze": [
70 | "vendor/bin/phpstan"
71 | ],
72 | "coding-standards": [
73 | "mkdir -p .build/php-cs-fixer",
74 | "php-cs-fixer fix --diff --verbose"
75 | ],
76 | "test-server": [
77 | "php -S localhost:8000 -t ./tests/Server"
78 | ],
79 | "test-watch": [
80 | "vendor/bin/phpunit-watcher watch"
81 | ]
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/phpstan-baseline.neon:
--------------------------------------------------------------------------------
1 | parameters:
2 | ignoreErrors:
3 | -
4 | message: '#^Parameter \#1 \$item of method RoachPHP\\Spider\\Middleware\\ItemMiddlewareInterface\:\:handleItem\(\) expects RoachPHP\\ItemPipeline\\ItemInterface, RoachPHP\\Http\\Request\|RoachPHP\\ItemPipeline\\ItemInterface given\.$#'
5 | identifier: argument.type
6 | count: 1
7 | path: src/Spider/Processor.php
8 |
9 | -
10 | message: '#^Parameter \#1 \$request of method RoachPHP\\Spider\\Middleware\\RequestMiddlewareInterface\:\:handleRequest\(\) expects RoachPHP\\Http\\Request, RoachPHP\\Http\\Request\|RoachPHP\\ItemPipeline\\ItemInterface given\.$#'
11 | identifier: argument.type
12 | count: 1
13 | path: src/Spider/Processor.php
14 |
--------------------------------------------------------------------------------
/phpstan.neon:
--------------------------------------------------------------------------------
1 | includes:
2 | - phpstan-baseline.neon
3 |
4 | parameters:
5 | paths:
6 | - src
7 |
8 | level: 9
9 |
10 | ignoreErrors:
11 | - identifier: missingType.iterableValue
12 |
13 | excludePaths: []
14 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | tests
6 |
7 |
8 |
9 |
10 |
11 | src
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/psalm.xml:
--------------------------------------------------------------------------------
1 |
2 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/roach:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env php
2 | setName(Version::getVersionString());
30 |
31 | $application->add(new Repl());
32 | $application->add(new RunSpiderCommand());
33 |
34 | $application->run();
35 |
--------------------------------------------------------------------------------
/src/Core/EngineInterface.php:
--------------------------------------------------------------------------------
1 |
30 | */
31 | public function collect(Run $run): array;
32 | }
33 |
--------------------------------------------------------------------------------
/src/Core/FakeRunner.php:
--------------------------------------------------------------------------------
1 | , array>
24 | */
25 | private array $runs = [];
26 |
/**
 * Records that a run was requested for the given spider; nothing else is
 * done with the spider here.
 *
 * @param class-string $spiderClass
 */
public function startSpider(
    string $spiderClass,
    ?Overrides $overrides = null,
    array $context = [],
): void {
    $this->recordRun($spiderClass, $overrides, $context);
}
31 |
/**
 * Records the requested run and reports no collected items.
 *
 * @param class-string $spiderClass
 *
 * @return array always an empty array
 */
public function collectSpider(
    string $spiderClass,
    ?Overrides $overrides = null,
    array $context = [],
): array {
    $this->recordRun($spiderClass, $overrides, $context);

    return [];
}
38 |
/**
 * Asserts that at least one run was recorded for the given spider.
 *
 * When a callback is given, it is invoked with the overrides and context of
 * each recorded run for that spider; the assertion passes as soon as one
 * invocation returns a truthy value and fails if none does.
 *
 * @param class-string $spider
 *
 * @psalm-param (callable(Overrides|null, array): bool)|null $callback
 */
public function assertRunWasStarted(string $spider, ?callable $callback = null): void
{
    Assert::assertArrayHasKey(
        $spider,
        $this->runs,
        "Expected run for spider {$spider} to exist but no runs were started instead.",
    );

    // Without a callback, the mere existence of a recorded run is enough.
    if (null === $callback) {
        return;
    }

    foreach ($this->runs[$spider] as $recordedRun) {
        $accepted = $callback($recordedRun['overrides'], $recordedRun['context']);

        if ($accepted) {
            return;
        }
    }

    Assert::fail("Found run for spider {$spider}, but passed callback returned false");
}
62 |
/**
 * Asserts that no run was recorded for the given spider.
 *
 * @param class-string $spider
 */
public function assertRunWasNotStarted(string $spider): void
{
    $failureMessage = "Unexpected run for spider {$spider} was started";

    Assert::assertArrayNotHasKey($spider, $this->runs, $failureMessage);
}
74 |
/**
 * Appends an entry holding the overrides and context to the list of runs
 * recorded for the given spider class.
 *
 * @param class-string $spiderClass
 */
private function recordRun(string $spiderClass, ?Overrides $overrides = null, array $context = []): void
{
    $entry = [
        'overrides' => $overrides,
        'context' => $context,
    ];

    $this->runs[$spiderClass][] = $entry;
}
85 | }
86 |
--------------------------------------------------------------------------------
/src/Core/Run.php:
--------------------------------------------------------------------------------
1 | $startRequests
29 | * @param array $downloaderMiddleware
30 | * @param array $itemProcessors
31 | * @param array $responseMiddleware
32 | * @param array $extensions
33 | */
34 | public function __construct(
35 | public array $startRequests,
36 | public string $namespace,
37 | public array $downloaderMiddleware = [],
38 | public array $itemProcessors = [],
39 | public array $responseMiddleware = [],
40 | public array $extensions = [],
41 | public int $concurrency = 25,
42 | public int $requestDelay = 0,
43 | ) {
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/Core/Runner.php:
--------------------------------------------------------------------------------
1 | engine->start($this->createRun($spiderClass, $overrides, $context));
31 | }
32 |
/**
 * Creates a run for the given spider class and returns what the engine
 * collects for it.
 */
public function collectSpider(
    string $spiderClass,
    ?Overrides $overrides = null,
    array $context = [],
): array {
    $run = $this->createRun($spiderClass, $overrides, $context);

    return $this->engine->collect($run);
}
37 |
/**
 * Resolves the spider from the container, hands it the context and builds
 * a Run for it through a RunFactory.
 */
private function createRun(string $spiderClass, ?Overrides $overrides, array $context): Run
{
    /** @var SpiderInterface $spider */
    $spider = $this->container->get($spiderClass);
    $spider->withContext($context);

    $runFactory = new RunFactory($this->container);

    return $runFactory->fromSpider($spider, $overrides);
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/Core/RunnerInterface.php:
--------------------------------------------------------------------------------
1 | $spiderClass
24 | */
25 | public function startSpider(
26 | string $spiderClass,
27 | ?Overrides $overrides = null,
28 | array $context = [],
29 | ): void;
30 |
31 | /**
32 | * @param class-string $spiderClass
33 | *
34 | * @return array
35 | */
36 | public function collectSpider(
37 | string $spiderClass,
38 | ?Overrides $overrides = null,
39 | array $context = [],
40 | ): array;
41 | }
42 |
--------------------------------------------------------------------------------
/src/Core/Version.php:
--------------------------------------------------------------------------------
1 | cookieJar = $cookieJar ?: new CookieJar();
30 | }
31 |
/**
 * Returns the result of adding this middleware's cookie jar to the request
 * under the `cookies` option.
 */
public function handleRequest(Request $request): Request
{
    $jar = $this->cookieJar;

    return $request->addOption('cookies', $jar);
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/DownloaderMiddlewareAdapter.php:
--------------------------------------------------------------------------------
1 | middleware instanceof RequestMiddlewareInterface) {
43 | return $this->middleware->handleRequest($request);
44 | }
45 |
46 | return $request;
47 | }
48 |
/**
 * Delegates to the wrapped middleware when it implements
 * ResponseMiddlewareInterface; otherwise returns the response untouched.
 */
public function handleResponse(Response $response): Response
{
    return $this->middleware instanceof ResponseMiddlewareInterface
        ? $this->middleware->handleResponse($response)
        : $response;
}
57 |
/**
 * Forwards the given options to the wrapped middleware's configure() method.
 */
public function configure(array $options): void
{
    $wrapped = $this->middleware;

    $wrapped->configure($options);
}
62 |
/**
 * Returns the middleware instance wrapped by this adapter.
 */
public function getMiddleware(): RequestMiddlewareInterface|ResponseMiddlewareInterface
{
    return $this->middleware;
}
67 | }
68 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/FakeMiddleware.php:
--------------------------------------------------------------------------------
1 | requestsHandled[] = $request;
52 |
53 | if (null !== $this->requestHandler) {
54 | return ($this->requestHandler)($request);
55 | }
56 |
57 | return $request;
58 | }
59 |
/**
 * Records the response, then returns the configured response handler's
 * result, or the response itself when no handler is set.
 */
public function handleResponse(Response $response): Response
{
    $this->responsesHandled[] = $response;

    if (null === $this->responseHandler) {
        return $response;
    }

    return ($this->responseHandler)($response);
}
70 |
/**
 * Asserts that the given request is among the recorded handled requests.
 */
public function assertRequestHandled(Request $request): void
{
    $recorded = $this->requestsHandled;

    Assert::assertContains($request, $recorded);
}
75 |
/**
 * Asserts that the given request is not among the recorded handled requests.
 */
public function assertRequestNotHandled(Request $request): void
{
    $recorded = $this->requestsHandled;

    Assert::assertNotContains($request, $recorded);
}
80 |
/**
 * Asserts that no handled requests have been recorded.
 */
public function assertNoRequestsHandled(): void
{
    $recorded = $this->requestsHandled;

    Assert::assertEmpty($recorded);
}
85 |
/**
 * Asserts that the given response is among the recorded handled responses.
 */
public function assertResponseHandled(Response $response): void
{
    $recorded = $this->responsesHandled;

    Assert::assertContains($response, $recorded);
}
90 |
/**
 * Asserts that the given response is not among the recorded handled responses.
 */
public function assertResponseNotHandled(Response $response): void
{
    $recorded = $this->responsesHandled;

    Assert::assertNotContains($response, $recorded);
}
95 |
/**
 * Asserts that no handled responses have been recorded.
 */
public function assertNoResponseHandled(): void
{
    $recorded = $this->responsesHandled;

    Assert::assertEmpty($recorded);
}
100 | }
101 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/HttpErrorMiddleware.php:
--------------------------------------------------------------------------------
1 | getStatus();
31 |
32 | if (200 <= $status && 300 > $status) {
33 | return $response;
34 | }
35 |
36 | /** @var array $allowedStatus */
37 | $allowedStatus = $this->option('handleStatus');
38 |
39 | if (\in_array($status, $allowedStatus, true)) {
40 | return $response;
41 | }
42 |
43 | $this->logger->info(
44 | '[HttpErrorMiddleware] Dropping unsuccessful response',
45 | [
46 | 'uri' => $response->getRequest()->getUri(),
47 | 'status' => $status,
48 | ],
49 | );
50 |
51 | return $response->drop('Unallowed HTTP status: ' . $status);
52 | }
53 |
/**
 * Default options for this middleware.
 *
 * `handleStatus` lists additional HTTP status codes whose responses are
 * returned as-is instead of being dropped; it is empty by default.
 */
private function defaultOptions(): array
{
    $defaults = ['handleStatus' => []];

    return $defaults;
}
60 | }
61 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/ProxyMiddleware.php:
--------------------------------------------------------------------------------
1 | proxy) {
39 | $this->logger->warning(
40 | '[ProxyMiddleware] No proxy configured for middleware',
41 | );
42 |
43 | return $request;
44 | }
45 |
46 | $options = $this->proxy->optionsFor($request);
47 |
48 | if ($options->isEmpty()) {
49 | return $request;
50 | }
51 |
52 | $this->logger->info(
53 | '[ProxyMiddleware] Using proxy for request',
54 | $options->toArray(),
55 | );
56 |
57 | return $request->addOption('proxy', $options->toArray());
58 | }
59 |
/**
 * Default options for this middleware.
 *
 * `loader` is an optional class name that is resolved from the container to
 * load the proxy configuration; when it is null, the `proxy` option is
 * passed to an ArrayConfigurationLoader instead.
 */
private function defaultOptions(): array
{
    $defaults = [];
    $defaults['proxy'] = [];
    $defaults['loader'] = null;

    return $defaults;
}
67 |
/**
 * Populate $this->proxy from the middleware options.
 *
 * A configured `loader` class takes precedence and is resolved through the
 * container. Without one, the `proxy` option is handed to an
 * ArrayConfigurationLoader.
 */
private function onAfterConfigured(): void
{
    /** @var null|class-string $loaderClass */
    $loaderClass = $this->option('loader');

    if (null === $loaderClass) {
        /** @var array<string, array{http?: string, https?: string, no?: array}|string>|string $proxyOption */
        $proxyOption = $this->option('proxy');
        $this->proxy = (new ArrayConfigurationLoader($proxyOption))->loadProxyConfiguration();

        return;
    }

    /** @var ConfigurationLoaderInterface $customLoader */
    $customLoader = $this->container->get($loaderClass);
    $this->proxy = $customLoader->loadProxyConfiguration();
}
84 | }
85 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/RequestDeduplicationMiddleware.php:
--------------------------------------------------------------------------------
1 |
26 | */
27 | private array $seenUris = [];
28 |
/**
 * @param LoggerInterface $logger stored on the instance via constructor promotion
 */
public function __construct(
    private LoggerInterface $logger,
) {
}
32 |
/**
 * Drop the request when its normalised URI has already been seen.
 *
 * Normalisation is driven by the `ignore_url_fragments`,
 * `ignore_trailing_slashes` and `ignore_query_string` options. A URI that
 * was not seen before is remembered and the request is returned unchanged.
 */
public function handleRequest(Request $request): Request
{
    $originalUri = $request->getUri();
    $flags = \HTTP_URL_REPLACE;
    $components = \parse_url($originalUri);

    if ($this->option('ignore_url_fragments')) {
        $flags |= \HTTP_URL_STRIP_FRAGMENT;
    }

    $stripSlashes = $this->option('ignore_trailing_slashes') && isset($components['path']);

    if ($stripSlashes) {
        $components['path'] = \mb_rtrim($components['path'], '/');
    }

    if ($this->option('ignore_query_string')) {
        $flags |= \HTTP_URL_STRIP_QUERY;
    }

    /** @phpstan-ignore argument.type */
    $normalizedUri = http_build_url($originalUri, $components, $flags);

    // First sighting: remember the normalised form and let the request pass.
    if (!\in_array($normalizedUri, $this->seenUris, true)) {
        $this->seenUris[] = $normalizedUri;

        return $request;
    }

    $this->logger->info(
        '[RequestDeduplicationMiddleware] Dropping duplicate request',
        ['uri' => $request->getUri()],
    );

    return $request->drop('Duplicate request');
}
67 |
/**
 * Default option values for this middleware.
 *
 * Only trailing slashes are ignored by default; fragments and query
 * strings are not.
 */
private function defaultOptions(): array
{
    $defaults = [
        'ignore_url_fragments' => false,
        'ignore_trailing_slashes' => true,
        'ignore_query_string' => false,
    ];

    return $defaults;
}
76 | }
77 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/RequestMiddlewareInterface.php:
--------------------------------------------------------------------------------
1 |
26 | */
27 | private array $robots = [];
28 |
/**
 * Drop the request when the robots file for its origin forbids the URI.
 *
 * One Robots instance is created per robots URL and reused for later
 * requests. The user agent is the first `User-Agent` header value of the
 * request, or an empty string when there is none.
 */
public function handleRequest(Request $request): Request
{
    /** @var string $userAgent */
    $userAgent = $request->getHeader('User-Agent')[0] ?? '';
    $uri = $request->getUri();
    $robotsUrl = $this->createRobotsUrl($uri);

    $this->robots[$robotsUrl] ??= Robots::create($userAgent, $robotsUrl);

    if ($this->robots[$robotsUrl]->mayIndex($uri, $userAgent)) {
        return $request;
    }

    return $request->drop("robots.txt forbids crawling {$uri} for user agent {$userAgent}");
}
48 |
/**
 * Build the URL of the robots file for the origin of $url.
 *
 * Combines the scheme, host and (when present) port of $url with the
 * configured `fileName` option.
 */
private function createRobotsUrl(string $url): string
{
    $scheme = \parse_url($url, \PHP_URL_SCHEME);
    $host = \parse_url($url, \PHP_URL_HOST);
    $port = \parse_url($url, \PHP_URL_PORT);

    $origin = $scheme . '://' . $host;

    // parse_url() yields null or false instead of an int when no usable port exists.
    if (\is_int($port)) {
        $origin .= ':' . $port;
    }

    /** @var string $fileName */
    $fileName = $this->option('fileName');

    return $origin . '/' . $fileName;
}
64 |
/**
 * Default option values for this middleware.
 *
 * `fileName` defaults to `robots.txt`.
 */
private function defaultOptions(): array
{
    $defaults = ['fileName' => 'robots.txt'];

    return $defaults;
}
71 | }
72 |
--------------------------------------------------------------------------------
/src/Downloader/Middleware/UserAgentMiddleware.php:
--------------------------------------------------------------------------------
1 | addHeader('User-Agent', $this->option('userAgent'));
27 | }
28 |
/**
 * Default option values for this middleware.
 *
 * `userAgent` defaults to `roach-php`.
 */
private function defaultOptions(): array
{
    $defaults = ['userAgent' => 'roach-php'];

    return $defaults;
}
35 | }
36 |
--------------------------------------------------------------------------------
/src/Downloader/Proxy/ArrayConfigurationLoader.php:
--------------------------------------------------------------------------------
1 | }|string>|string $params
20 | */
public function __construct(
    private readonly array|string $params,
) {
    // $params is either a single proxy URL or a per-domain array of proxy settings.
}
24 |
/**
 * Turn the configured parameters into a Proxy.
 *
 * A plain string is used for every protocol under the `*` key. An array is
 * read as domain => options, where options is either a proxy URL string or
 * an array with optional `http`, `https` and `no` entries.
 */
public function loadProxyConfiguration(): Proxy
{
    $params = $this->params;

    if (\is_string($params)) {
        return new Proxy(['*' => ProxyOptions::allProtocols($params)]);
    }

    /** @var array<string, ProxyOptions> $proxyList */
    $proxyList = [];

    foreach ($params as $domain => $options) {
        $proxyList[$domain] = \is_string($options)
            ? ProxyOptions::allProtocols($options)
            : new ProxyOptions(
                $options['http'] ?? null,
                $options['https'] ?? null,
                $options['no'] ?? [],
            );
    }

    return new Proxy($proxyList);
}
50 | }
51 |
--------------------------------------------------------------------------------
/src/Downloader/Proxy/ConfigurationLoaderInterface.php:
--------------------------------------------------------------------------------
1 | $proxyList
22 | */
public function __construct(
    private readonly array $proxyList = [],
) {
    // $proxyList maps a host name (or '*') to the ProxyOptions used for it.
}
26 |
/**
 * Look up the proxy options that apply to the request's host.
 *
 * An entry for the exact host wins over the `*` entry. Empty options are
 * returned when the request URL has no host or nothing matches.
 */
public function optionsFor(Request $request): ProxyOptions
{
    $host = $request->url->host;

    if (null === $host) {
        return ProxyOptions::make();
    }

    // Try the exact host first, then the wildcard entry.
    foreach ([$host, '*'] as $key) {
        if (\array_key_exists($key, $this->proxyList)) {
            return $this->proxyList[$key];
        }
    }

    return ProxyOptions::make();
}
45 | }
46 |
--------------------------------------------------------------------------------
/src/Downloader/Proxy/ProxyOptions.php:
--------------------------------------------------------------------------------
1 | $excludedDomains
20 | */
public function __construct(
    private readonly ?string $httpProxyURL = null,
    private readonly ?string $httpsProxyURL = null,
    private readonly array $excludedDomains = [],
) {
    // All three values are stored as readonly promoted properties.
}
27 |
/**
 * Create an instance with no proxy URLs and no excluded domains.
 */
public static function make(): self
{
    $empty = new self();

    return $empty;
}
32 |
/**
 * Create options that use the same proxy URL for HTTP and HTTPS,
 * with no excluded domains.
 */
public static function allProtocols(string $url): self
{
    return new self(
        httpProxyURL: $url,
        httpsProxyURL: $url,
        excludedDomains: [],
    );
}
40 |
/**
 * Return a copy that uses $url as the proxy for HTTP requests.
 *
 * The HTTPS proxy and the excluded domains are carried over.
 */
public function http(string $url): self
{
    return new self(
        httpProxyURL: $url,
        httpsProxyURL: $this->httpsProxyURL,
        excludedDomains: $this->excludedDomains,
    );
}
48 |
/**
 * Return a copy that uses $url as the proxy for HTTPS requests.
 *
 * The HTTP proxy and the excluded domains are carried over.
 */
public function https(string $url): self
{
    return new self(
        httpProxyURL: $this->httpProxyURL,
        httpsProxyURL: $url,
        excludedDomains: $this->excludedDomains,
    );
}
56 |
/**
 * Return a copy whose excluded domains are replaced by $domains.
 *
 * A single string is wrapped into a one-element array. Both proxy URLs
 * are carried over.
 *
 * @param array|string $domains domains or TLDs that should not use proxies
 */
public function exclude(array|string $domains): self
{
    $excluded = \is_array($domains) ? $domains : [$domains];

    return new self($this->httpProxyURL, $this->httpsProxyURL, $excluded);
}
70 |
/**
 * Whether neither proxy URL is set and no domain is excluded.
 */
public function isEmpty(): bool
{
    if (null !== $this->httpProxyURL) {
        return false;
    }

    if (null !== $this->httpsProxyURL) {
        return false;
    }

    return [] === $this->excludedDomains;
}
77 |
/**
 * Whether $other has identical proxy URLs and identical excluded domains.
 */
public function equals(self $other): bool
{
    $mine = [$this->httpProxyURL, $this->httpsProxyURL, $this->excludedDomains];
    $theirs = [$other->httpProxyURL, $other->httpsProxyURL, $other->excludedDomains];

    return $mine === $theirs;
}
84 |
/**
 * Export the options as an array, leaving out every entry that is empty.
 *
 * @return array{
 *     http?: string,
 *     https?: string,
 *     no?: array
 * }
 */
public function toArray(): array
{
    $options = [
        'http' => $this->httpProxyURL,
        'https' => $this->httpsProxyURL,
        'no' => $this->excludedDomains,
    ];

    // Without a callback, array_filter() removes every falsy value.
    return \array_filter($options);
}
100 | }
101 |
--------------------------------------------------------------------------------
/src/Events/FakeDispatcher.php:
--------------------------------------------------------------------------------
1 | >
23 | */
24 | private array $dispatchedEvents = [];
25 |
/**
 * Dispatch the event through the parent dispatcher and record it.
 *
 * Events are recorded under $eventName, falling back to the event's class
 * name when no name is given.
 */
public function dispatch(object $event, ?string $eventName = null): object
{
    $name = $eventName ?? $event::class;

    parent::dispatch($event, $name);

    $this->dispatchedEvents[$name][] = $event;

    return $event;
}
36 |
/**
 * Assert that an event was recorded under $eventName.
 *
 * When $callback is given, at least one recorded event must make it
 * return a truthy value, otherwise the assertion fails.
 */
public function assertDispatched(string $eventName, ?callable $callback = null): void
{
    Assert::assertArrayHasKey($eventName, $this->dispatchedEvents);

    if (null === $callback) {
        return;
    }

    foreach ($this->dispatchedEvents[$eventName] as $recorded) {
        if ($callback($recorded)) {
            return;
        }
    }

    Assert::fail('Event was not dispatched with correct payload');
}
51 |
/**
 * Assert that no event was recorded under $eventName.
 */
public function assertNotDispatched(string $eventName): void
{
    $recorded = $this->dispatchedEvents;

    Assert::assertArrayNotHasKey($eventName, $recorded);
}
56 |
/**
 * Register $listener for $eventName by delegating to addListener().
 */
public function listen(string $eventName, callable $listener): void
{
    $this->addListener(
        $eventName,
        $listener,
    );
}
61 | }
62 |
--------------------------------------------------------------------------------
/src/Events/ItemDropped.php:
--------------------------------------------------------------------------------
1 | ['onRunStarting', 100],
37 | RunFinished::NAME => ['onRunFinished', 100],
38 | RequestSending::NAME => ['onRequestSending', 100],
39 | RequestDropped::NAME => ['onRequestDropped', 100],
40 | ItemScraped::NAME => ['onItemScraped', 100],
41 | ItemDropped::NAME => ['onItemDropped', 100],
42 | ];
43 | }
44 |
/**
 * Log an info message when a run starts. The event itself is not inspected.
 */
public function onRunStarting(RunStarting $event): void
{
    $message = 'Run starting';

    $this->logger->info($message);
}
49 |
/**
 * Log an info message when a run finishes. The event itself is not inspected.
 */
public function onRunFinished(RunFinished $event): void
{
    $message = 'Run finished';

    $this->logger->info($message);
}
54 |
/**
 * Log the URI of the request carried by the event.
 */
public function onRequestSending(RequestSending $event): void
{
    $context = ['uri' => $event->request->getUri()];

    $this->logger->info('Dispatching request', $context);
}
61 |
/**
 * Log the URI and drop reason of the dropped request.
 */
public function onRequestDropped(RequestDropped $event): void
{
    $dropped = $event->request;
    $context = [
        'uri' => $dropped->getUri(),
        'reason' => $dropped->getDropReason(),
    ];

    $this->logger->info('Request dropped', $context);
}
71 |
/**
 * Log the scraped item, using its data as the log context.
 */
public function onItemScraped(ItemScraped $event): void
{
    $context = $event->item->all();

    $this->logger->info('Item scraped', $context);
}
76 |
/**
 * Log the data and drop reason of the dropped item.
 */
public function onItemDropped(ItemDropped $event): void
{
    $dropped = $event->item;
    $context = [
        'item' => $dropped->all(),
        'reason' => $dropped->getDropReason(),
    ];

    $this->logger->info('Item dropped', $context);
}
84 | }
85 |
--------------------------------------------------------------------------------
/src/Extensions/MaxRequestExtension.php:
--------------------------------------------------------------------------------
1 | ['onRequestSending', 10000],
30 | RequestScheduling::NAME => ['onRequestScheduling', 0],
31 | ];
32 | }
33 |
/**
 * Apply the request limit, then count the request if it was not dropped.
 */
public function onRequestSending(RequestSending $event): void
{
    $this->dropRequestIfLimitReached($event);

    if ($event->request->wasDropped()) {
        return;
    }

    ++$this->sentRequests;
}
42 |
/**
 * Apply the request limit to a request that is about to be scheduled.
 *
 * Unlike onRequestSending(), this does not change the sent-request counter.
 */
public function onRequestScheduling(RequestScheduling $event): void
{
    $this->dropRequestIfLimitReached(
        $event,
    );
}
47 |
/**
 * Replace the event's request with a dropped one once the number of sent
 * requests has reached the `limit` option.
 */
private function dropRequestIfLimitReached(RequestScheduling|RequestSending $event): void
{
    /** @var int $limit */
    $limit = $this->option('limit');

    if ($this->sentRequests < $limit) {
        return;
    }

    $event->request = $event->request->drop("Reached maximum request limit of {$limit}");
}
57 |
/**
 * Default option values for this extension.
 *
 * `limit` defaults to 10.
 */
private function defaultOptions(): array
{
    $defaults = ['limit' => 10];

    return $defaults;
}
64 | }
65 |
--------------------------------------------------------------------------------
/src/Extensions/ScrapedItemCollectorExtension.php:
--------------------------------------------------------------------------------
1 |
29 | */
30 | private array $scrapedItems = [];
31 |
/**
 * Subscribe onItemScraped() to the ItemScraped event with priority 0.
 */
public static function getSubscribedEvents(): array
{
    $subscriptions = [];
    $subscriptions[ItemScraped::NAME] = ['onItemScraped', 0];

    return $subscriptions;
}
38 |
/**
 * Append the event's item to the collected items.
 */
public function onItemScraped(ItemScraped $event): void
{
    $item = $event->item;

    $this->scrapedItems[] = $item;
}
43 |
/**
 * Return every item collected so far, in the order it was appended.
 *
 * @return array
 */
public function getScrapedItems(): array
{
    $collected = $this->scrapedItems;

    return $collected;
}
51 | }
52 |
--------------------------------------------------------------------------------
/src/Extensions/StatsCollectorExtension.php:
--------------------------------------------------------------------------------
1 | null,
43 | 'requests.sent' => 0,
44 | 'requests.dropped' => 0,
45 | 'items.scraped' => 0,
46 | 'items.dropped' => 0,
47 | ];
48 |
/**
 * @param LoggerInterface $logger receives the run statistics
 * @param ClockInterface  $clock  source of the timestamps used for the run duration
 */
public function __construct(
    private LoggerInterface $logger,
    private ClockInterface $clock,
) {
}
54 |
/**
 * Subscribe the handlers of this extension, all with priority 200.
 */
public static function getSubscribedEvents(): array
{
    $priority = 200;

    return [
        RunStarting::NAME => ['onRunStarting', $priority],
        RequestSending::NAME => ['onRequestSending', $priority],
        RequestDropped::NAME => ['onRequestDropped', $priority],
        ItemDropped::NAME => ['onItemDropped', $priority],
        ItemScraped::NAME => ['onItemScraped', $priority],
        RunFinished::NAME => ['onRunFinished', $priority],
    ];
}
66 |
/**
 * Remember the clock's current time as the start of the run.
 */
public function onRunStarting(): void
{
    $startedAt = $this->clock->now();

    $this->startTime = $startedAt;
}
71 |
/**
 * Store the run duration (when a start time was recorded) and log all stats.
 *
 * The duration is formatted as `HH:MM:SS`. Hours are not capped at 23:
 * a run that lasted two days and one hour is reported as `49:00:00`.
 */
public function onRunFinished(): void
{
    if (null !== $this->startTime) {
        $elapsed = $this->startTime->diff($this->clock->now());

        // DateInterval's %H only renders the 0-23 hour component and drops
        // whole days, so fold the day count into the hours ourselves.
        $hours = ((int) $elapsed->days) * 24 + $elapsed->h;

        $this->stats['duration'] = \sprintf('%02d:%s', $hours, $elapsed->format('%I:%S'));
    }

    $this->logger->info('Run statistics', $this->stats);
}
81 |
/**
 * Count the request as sent unless it has been dropped.
 */
public function onRequestSending(RequestSending $event): void
{
    if ($event->request->wasDropped()) {
        return;
    }

    ++$this->stats['requests.sent'];
}
88 |
/**
 * Increment the `requests.dropped` counter.
 */
public function onRequestDropped(): void
{
    $this->stats['requests.dropped'] += 1;
}
93 |
/**
 * Increment the `items.dropped` counter.
 */
public function onItemDropped(): void
{
    $this->stats['items.dropped'] += 1;
}
98 |
/**
 * Increment the `items.scraped` counter.
 */
public function onItemScraped(): void
{
    $this->stats['items.scraped'] += 1;
}
103 | }
104 |
--------------------------------------------------------------------------------
/src/Http/Client.php:
--------------------------------------------------------------------------------
1 | client = $client ?? new GuzzleClient();
29 | }
30 |
/**
 * Send all requests through a Guzzle pool and block until it has settled.
 *
 * Successful responses, and error responses carried by a
 * BadResponseException, are wrapped in a Response. Every other failure is
 * rethrown as a RequestException.
 *
 * NOTE(review): `concurrency` is passed as 0 — presumably "no limit" for
 * Guzzle's Pool; confirm against the Guzzle documentation.
 *
 * @param array $requests requests to send
 */
public function pool(
    array $requests,
    ?callable $onFulfilled = null,
    ?callable $onRejected = null,
): void {
    $promiseFactories = (function () use ($requests): \Generator {
        foreach ($requests as $request) {
            yield fn () => $this->client
                ->sendAsync($request->getPsrRequest(), $request->getOptions())
                ->then(
                    static fn (ResponseInterface $response) => new Response($response, $request),
                    static function (GuzzleException $reason) use ($request) {
                        // A response came back, so return a Response object
                        // that can be sent through the middleware stack.
                        if ($reason instanceof BadResponseException) {
                            return new Response($reason->getResponse(), $request);
                        }

                        // Anything else is wrapped in our own exception so it can
                        // be handled by any request exception middleware.
                        throw new RequestException($request, $reason);
                    },
                );
        }
    })();

    $poolOptions = [
        'concurrency' => 0,
        'fulfilled' => $onFulfilled,
        'rejected' => $onRejected,
    ];

    (new Pool($this->client, $promiseFactories, $poolOptions))
        ->promise()
        ->wait();
}
70 | }
71 |
--------------------------------------------------------------------------------
/src/Http/ClientInterface.php:
--------------------------------------------------------------------------------
1 | $requests
20 | * @param ?callable(Response): void $onFulfilled
21 | * @param ?callable(RequestException): void $onRejected
22 | */
23 | public function pool(
24 | array $requests,
25 | ?callable $onFulfilled = null,
26 | ?callable $onRejected = null,
27 | ): void;
28 | }
29 |
--------------------------------------------------------------------------------
/src/Http/FakeClient.php:
--------------------------------------------------------------------------------
1 |
26 | */
27 | private array $sentRequestUrls = [];
28 |
/**
 * Record the URI of every request and, when $onFulfilled is given, call it
 * with a Response built from an empty Guzzle response.
 *
 * $onRejected is never called.
 */
public function pool(array $requests, ?callable $onFulfilled = null, ?callable $onRejected = null): void
{
    foreach ($requests as $request) {
        $this->sentRequestUrls[] = $request->getUri();

        if (null === $onFulfilled) {
            continue;
        }

        $onFulfilled(new Response(new GuzzleResponse(), $request));
    }
}
41 |
/**
 * Assert that the request's URI is among the recorded sent URIs.
 */
public function assertRequestWasSent(Request $request): void
{
    $uri = $request->getUri();
    $failureMessage = "Expected request to [{$uri}] was not sent";

    Assert::assertContains($request->getUri(), $this->sentRequestUrls, $failureMessage);
}
52 |
/**
 * Assert that the request's URI is not among the recorded sent URIs.
 */
public function assertRequestWasNotSent(Request $request): void
{
    $uri = $request->getUri();
    $failureMessage = "Unexpected request sent to [{$uri}]";

    Assert::assertNotContains($uri, $this->sentRequestUrls, $failureMessage);
}
63 | }
64 |
--------------------------------------------------------------------------------
/src/Http/MalformedUriException.php:
--------------------------------------------------------------------------------
1 | request;
30 | }
31 |
/**
 * Return the Guzzle exception stored as the reason for this exception.
 */
public function getReason(): GuzzleException
{
    $reason = $this->reason;

    return $reason;
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/Http/Response.php:
--------------------------------------------------------------------------------
1 | crawler = new Crawler((string) $response->getBody(), $request->getUri());
38 | }
39 |
/**
 * Forward unknown method calls to the wrapped crawler.
 */
public function __call(string $method, array $args): mixed
{
    $crawler = $this->crawler;

    return $crawler->{$method}(...$args);
}
44 |
/**
 * Return the request stored on this response.
 */
public function getRequest(): Request
{
    $request = $this->request;

    return $request;
}
49 |
/**
 * Return the status code of the wrapped response.
 */
public function getStatus(): int
{
    $statusCode = $this->response->getStatusCode();

    return $statusCode;
}
54 |
/**
 * Return the body of the wrapped response cast to a string.
 */
public function getBody(): string
{
    $stream = $this->response->getBody();

    return (string) $stream;
}
59 |
/**
 * Replace the response body with $body and rebuild the crawler from it.
 *
 * Despite the `with` prefix this changes the current instance and returns
 * it; no copy is made.
 */
public function withBody(string $body): self
{
    $stream = Utils::streamFor($body);

    $this->response = $this->response->withBody($stream);
    $this->crawler = new Crawler($body, $this->request->getUri());

    return $this;
}
67 |
/**
 * Return the wrapped PSR response.
 */
public function getResponse(): ResponseInterface
{
    $psrResponse = $this->response;

    return $psrResponse;
}
72 | }
73 |
--------------------------------------------------------------------------------
/src/Http/URL.php:
--------------------------------------------------------------------------------
1 | $this->scheme,
69 | 'host' => $this->host,
70 | 'port' => $this->port,
71 | 'user' => $this->username,
72 | 'pass' => $this->password,
73 | 'path' => $this->path,
74 | 'query' => $this->query->toString(),
75 | 'fragment' => $this->fragment,
76 | ];
77 |
78 | return http_build_url(\array_filter($parts));
79 | }
80 |
/**
 * Checks if two URLs are equal.
 *
 * Two URLs are equal when scheme, host, port, username, password, path and
 * fragment are identical and their queries report equality through
 * Query::equals(). The existing contract states that the order of query
 * parameters does not matter.
 *
 * A string argument is parsed into a URL object first.
 *
 * @throws MalformedUriException thrown if the provided URL is a string and cannot be parsed to a valid URL object
 */
public function equals(self|string $other): bool
{
    $that = \is_string($other) ? self::parse($other) : $other;

    $mine = [
        $this->scheme,
        $this->host,
        $this->port,
        $this->username,
        $this->password,
        $this->path,
        $this->fragment,
    ];
    $theirs = [
        $that->scheme,
        $that->host,
        $that->port,
        $that->username,
        $that->password,
        $that->path,
        $that->fragment,
    ];

    return $mine === $theirs && $this->query->equals($that->query);
}
107 | }
108 |
--------------------------------------------------------------------------------
/src/Http/UnknownQueryParameterException.php:
--------------------------------------------------------------------------------
1 | getProperties(\ReflectionProperty::IS_PUBLIC);
26 |
27 | return \array_reduce(
28 | $properties,
29 | function (array $data, \ReflectionProperty $property): array {
30 | /** @psalm-suppress MixedAssignment */
31 | $data[$property->getName()] = $property->getValue($this);
32 |
33 | return $data;
34 | },
35 | [],
36 | );
37 | }
38 |
/**
 * Read the public property named $key from this item.
 *
 * $default is returned when the property does not exist, is not public, or
 * holds null. Falsy but meaningful values (`0`, `''`, `false`, `[]`) are
 * returned as stored, which matches how Item::get() treats its data.
 */
final public function get(string $key, mixed $default = null): mixed
{
    $reflectionClass = new \ReflectionClass($this);

    try {
        $property = $reflectionClass->getProperty($key);
    } catch (\ReflectionException) {
        return $default;
    }

    if (!$property->isPublic()) {
        return $default;
    }

    // Use ?? rather than ?: so that only null falls back to the default.
    return $property->getValue($this) ?? $default;
}
55 |
/**
 * Assign $value to the public property named $key and return this item.
 *
 * @throws \InvalidArgumentException if no public property with that name exists
 */
final public function set(string $key, mixed $value): ItemInterface
{
    $property = null;

    try {
        $candidate = (new \ReflectionClass($this))->getProperty($key);

        if ($candidate->isPublic()) {
            $property = $candidate;
        }
    } catch (\ReflectionException) {
        // Unknown property: reported through the shared error below.
    }

    if (null === $property) {
        throw new \InvalidArgumentException(
            \sprintf('No public property %s exists on class %s', $key, static::class),
        );
    }

    $property->setValue($this, $value);

    return $this;
}
78 |
/**
 * Whether this item declares a public property named $key.
 */
final public function has(string $key): bool
{
    try {
        return (new \ReflectionClass($this))
            ->getProperty($key)
            ->isPublic();
    } catch (\ReflectionException) {
        return false;
    }
}
91 |
/**
 * ArrayAccess counterpart of has().
 */
final public function offsetExists(mixed $offset): bool
{
    $exists = $this->has($offset);

    return $exists;
}
96 |
/**
 * ArrayAccess counterpart of get(), called without a default.
 *
 * @throws \InvalidArgumentException if $offset is not a string
 */
final public function offsetGet(mixed $offset): mixed
{
    // @phpstan-ignore function.alreadyNarrowedType
    if (\is_string($offset)) {
        return $this->get($offset);
    }

    throw new \InvalidArgumentException('Offset needs to be a string');
}
106 |
/**
 * ArrayAccess counterpart of set().
 *
 * @throws \InvalidArgumentException if $offset is not a string
 */
final public function offsetSet(mixed $offset, mixed $value): void
{
    if (\is_string($offset)) {
        $this->set($offset, $value);

        return;
    }

    throw new \InvalidArgumentException('Offset needs to be a string');
}
115 |
/**
 * Unsetting is not supported and always throws.
 *
 * @throws \RuntimeException on every call
 */
final public function offsetUnset(mixed $offset): void
{
    $message = 'Unsetting properties is not supported for custom item classes';

    throw new \RuntimeException($message);
}
120 | }
121 |
--------------------------------------------------------------------------------
/src/ItemPipeline/Item.php:
--------------------------------------------------------------------------------
1 | data;
29 | }
30 |
/**
 * Return the value stored under $key, or $default when the key is missing
 * or its value is null.
 */
public function get(string $key, mixed $default = null): mixed
{
    if (isset($this->data[$key])) {
        return $this->data[$key];
    }

    return $default;
}
35 |
/**
 * Store $value under $key and return this item.
 */
public function set(string $key, mixed $value): ItemInterface
{
    $entry = [$key => $value];

    foreach ($entry as $name => $stored) {
        $this->data[$name] = $stored;
    }

    return $this;
}
42 |
/**
 * Whether a non-null value is stored under $key.
 */
public function has(string $key): bool
{
    $value = $this->data[$key] ?? null;

    return null !== $value;
}
47 |
/**
 * ArrayAccess check: whether a non-null value is stored under $offset.
 */
public function offsetExists(mixed $offset): bool
{
    $value = $this->data[$offset] ?? null;

    return null !== $value;
}
52 |
/**
 * ArrayAccess read of the value stored under $offset.
 *
 * The data array is indexed directly, with no fallback for missing keys.
 */
public function offsetGet(mixed $offset): mixed
{
    $data = $this->data;

    /** @psalm-suppress MixedReturnStatement */
    return $data[$offset];
}
58 |
/**
 * ArrayAccess write: store $value under $offset in the data array.
 */
public function offsetSet(mixed $offset, mixed $value): void
{
    $stored = $value;

    /** @psalm-suppress PossiblyNullArrayOffset */
    $this->data[$offset] = $stored;
}
64 |
/**
 * ArrayAccess removal of the entry stored under $offset.
 */
public function offsetUnset(mixed $offset): void
{
    $key = $offset;

    unset($this->data[$key]);
}
69 | }
70 |
--------------------------------------------------------------------------------
/src/ItemPipeline/ItemInterface.php:
--------------------------------------------------------------------------------
1 |
20 | */
interface ItemInterface extends \ArrayAccess, DroppableInterface
{
    /**
     * Return the item's data as an array.
     */
    public function all(): array;

    /**
     * Return the value for $key, or $default when it is not available.
     */
    public function get(string $key, mixed $default = null): mixed;

    /**
     * Store $value under $key and return an item instance.
     */
    public function set(string $key, mixed $value): self;

    /**
     * Whether the item has an entry for $key.
     */
    public function has(string $key): bool;
}
31 |
--------------------------------------------------------------------------------
/src/ItemPipeline/ItemPipeline.php:
--------------------------------------------------------------------------------
1 |
26 | */
27 | private array $processors = [];
28 |
/**
 * @param EventDispatcherInterface $eventDispatcher used to announce dropped and scraped items
 */
public function __construct(
    private EventDispatcherInterface $eventDispatcher,
) {
}
32 |
/**
 * Replace the configured processors with the given ones and return this
 * pipeline.
 */
public function setProcessors(ItemProcessorInterface ...$processors): ItemPipelineInterface
{
    $replacement = $processors;

    $this->processors = $replacement;

    return $this;
}
39 |
/**
 * Run the item through the processors in order.
 *
 * A ConditionalItemProcessor is skipped when it declines the item. As soon
 * as a processor returns a dropped item, ItemDropped is dispatched and
 * processing stops. Otherwise ItemScraped is dispatched after the last
 * processor.
 */
public function sendItem(ItemInterface $item): ItemInterface
{
    foreach ($this->processors as $processor) {
        $declined = $processor instanceof ConditionalItemProcessor && !$processor->shouldHandle($item);

        if ($declined) {
            continue;
        }

        $item = $processor->processItem($item);

        if (!$item->wasDropped()) {
            continue;
        }

        $this->eventDispatcher->dispatch(new ItemDropped($item), ItemDropped::NAME);

        return $item;
    }

    $this->eventDispatcher->dispatch(new ItemScraped($item), ItemScraped::NAME);

    return $item;
}
66 | }
67 |
--------------------------------------------------------------------------------
/src/ItemPipeline/ItemPipelineInterface.php:
--------------------------------------------------------------------------------
1 | getHandledItemClasses(), true);
26 | }
27 |
/**
 * List the item classes this processor handles.
 *
 * NOTE(review): the element type of the original annotation was lost;
 * presumably these are class names of ItemInterface implementations —
 * confirm against the upstream source.
 *
 * @return array<int, class-string>
 */
abstract protected function getHandledItemClasses(): array;
32 | }
33 |
--------------------------------------------------------------------------------
/src/ItemPipeline/Processors/FakeProcessor.php:
--------------------------------------------------------------------------------
1 | calls[] = $item->all();
29 |
30 | return $item;
31 | }
32 |
/**
 * Assert that the data of $item is among the recorded calls.
 */
public function assertCalledWith(ItemInterface $item): void
{
    $expected = $item->all();

    Assert::assertContains($expected, $this->calls, 'Processor was not called with expected item');
}
41 |
/**
 * Assert that the data of $item is not among the recorded calls.
 */
public function assertNotCalledWith(ItemInterface $item): void
{
    $unexpected = $item->all();

    Assert::assertNotContains($unexpected, $this->calls, 'Processor got unexpected call with item');
}
50 |
/**
 * Assert that no call has been recorded at all.
 */
public function assertNotCalled(): void
{
    $failureMessage = \sprintf(
        'Expected processor to not have been called at all. Was called %s time(s)',
        \count($this->calls),
    );

    Assert::assertEmpty($this->calls, $failureMessage);
}
58 | }
59 |
--------------------------------------------------------------------------------
/src/ItemPipeline/Processors/ItemProcessorInterface.php:
--------------------------------------------------------------------------------
1 | $spiderClass
53 | */
public static function startSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): void
{
    // Delegates to the active runner (a fake one if set, the resolved one otherwise).
    $runner = self::getRunner();

    $runner->startSpider($spiderClass, $overrides, $context);
}
58 |
/**
 * Start the spider run and return whatever the runner collected.
 *
 * @psalm-param class-string $spiderClass
 *
 * @return array the scraped items as returned by the runner
 */
public static function collectSpider(string $spiderClass, ?Overrides $overrides = null, array $context = []): array
{
    $runner = self::getRunner();

    return $runner->collectSpider($spiderClass, $overrides, $context);
}
70 |
/**
 * Fetch the entry registered for $class from the container.
 *
 * @template T
 *
 * @psalm-param class-string<T> $class
 *
 * @psalm-suppress MixedInferredReturnType
 *
 * @return T
 */
public static function resolve(string $class): mixed
{
    $container = self::getContainer();

    /** @psalm-suppress MixedReturnStatement */
    return $container->get($class);
}
85 |
/**
 * Return the shared container, creating a DefaultContainer on first use.
 */
private static function getContainer(): ContainerInterface
{
    return self::$container ??= new DefaultContainer();
}
94 |
/**
 * Return the fake runner when one is set, otherwise the runner resolved
 * from the container.
 */
private static function getRunner(): RunnerInterface
{
    if (self::$runnerFake) {
        return self::$runnerFake;
    }

    return self::resolve(RunnerInterface::class);
}
99 | }
100 |
--------------------------------------------------------------------------------
/src/Scheduling/ArrayRequestScheduler.php:
--------------------------------------------------------------------------------
1 |
25 | */
26 | private array $requests = [];
27 |
28 | private \DateTimeImmutable $nextBatchReadyAt;
29 |
/**
 * The first batch is ready immediately: its ready time is the clock's
 * current time at construction.
 */
public function __construct(
    private ClockInterface $clock,
) {
    $this->nextBatchReadyAt = $this->clock->now();
}
34 |
/**
 * Append the request to the end of the pending list.
 */
public function schedule(Request $request): void
{
    $pending = $request;

    $this->requests[] = $pending;
}
39 |
/**
 * Whether no requests are pending.
 */
public function empty(): bool
{
    return [] === $this->requests;
}
44 |
/**
 * Wait until the next batch is due, then return up to $batchSize requests.
 *
 * The ready time of the following batch is moved forward before the
 * requests are taken off the pending list.
 *
 * @return array
 */
public function nextRequests(int $batchSize): array
{
    $readyAt = $this->nextBatchReadyAt;

    $this->clock->sleepUntil($readyAt);
    $this->updateNextBatchTime();

    return $this->getNextRequests($batchSize);
}
56 |
/**
 * Return up to $batchSize requests right away, without waiting and
 * without touching the next batch time.
 */
public function forceNextRequests(int $batchSize): array
{
    $batch = $this->getNextRequests($batchSize);

    return $batch;
}
61 |
/**
 * Set the delay, in seconds, used for the next batch time, and return
 * this scheduler.
 */
public function setDelay(int $delay): RequestSchedulerInterface
{
    $seconds = $delay;

    $this->delay = $seconds;

    return $this;
}
68 |
/**
 * The namespace is ignored by this scheduler; the scheduler is returned
 * unchanged.
 */
public function setNamespace(string $namespace): RequestSchedulerInterface
{
    $scheduler = $this;

    return $scheduler;
}
73 |
/**
 * Set the next batch time to the clock's current time plus the delay.
 */
private function updateNextBatchTime(): void
{
    $delay = new \DateInterval("PT{$this->delay}S");
    $now = $this->clock->now();

    $this->nextBatchReadyAt = $now->add($delay);
}
78 |
/**
 * Remove up to $batchSize requests from the front of the pending list and
 * return them.
 *
 * @psalm-suppress MixedReturnTypeCoercion
 *
 * @return array
 */
private function getNextRequests(int $batchSize): array
{
    $batch = \array_splice($this->requests, 0, $batchSize);

    return $batch;
}
88 | }
89 |
--------------------------------------------------------------------------------
/src/Scheduling/RequestSchedulerInterface.php:
--------------------------------------------------------------------------------
1 |
29 | */
30 | public function nextRequests(int $batchSize): array;
31 |
32 | /**
33 | * Immediately return the next number of requests as defined by $batchSize
34 | * regardless of the configured delay.
35 | *
36 | * @return array
37 | */
38 | public function forceNextRequests(int $batchSize): array;
39 |
40 | public function empty(): bool;
41 |
42 | public function setDelay(int $delay): self;
43 |
44 | public function setNamespace(string $namespace): self;
45 | }
46 |
--------------------------------------------------------------------------------
/src/Scheduling/Timing/ClockInterface.php:
--------------------------------------------------------------------------------
1 | now = new \DateTimeImmutable();
25 | }
26 |
/**
 * Return the fake clock's current time.
 */
public function now(): \DateTimeImmutable
{
    $current = $this->now;

    return $current;
}
31 |
/**
 * Advance the fake clock by $seconds, delegating to sleepUntil().
 */
public function sleep(int $seconds): void
{
    $wakeUpAt = $this->now->add(new \DateInterval("PT{$seconds}S"));

    $this->sleepUntil($wakeUpAt);
}
38 |
/**
 * Move the fake clock forward to $date and add the skipped time to the
 * total reported by timePassed().
 *
 * A date before the current fake time is ignored. No real waiting happens.
 */
public function sleepUntil(\DateTimeImmutable $date): void
{
    if ($date < $this->now) {
        return;
    }

    $gap = $this->now->diff($date);

    // DateInterval::$s is only the 0-59 seconds component, so minutes,
    // hours and whole days have to be added in to get the full gap.
    $this->secondsPassed += (((int) $gap->days) * 24 + $gap->h) * 3600
        + $gap->i * 60
        + $gap->s;
    $this->now = $date;
}
48 |
/**
 * Return the number of seconds accumulated by sleepUntil() so far.
 */
public function timePassed(): int
{
    $seconds = $this->secondsPassed;

    return $seconds;
}
53 | }
54 |
--------------------------------------------------------------------------------
/src/Scheduling/Timing/SystemClock.php:
--------------------------------------------------------------------------------
1 | now()->getTimestamp();
34 | $target = $date->getTimestamp();
35 |
36 | if ($target <= $now) {
37 | return;
38 | }
39 |
40 | /** @psalm-suppress UnusedFunctionCall */
41 | \time_sleep_until($target);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/Shell/Commands/FetchCommand.php:
--------------------------------------------------------------------------------
1 | addArgument('url', InputArgument::REQUIRED);
34 | }
35 |
36 | protected function execute(InputInterface $input, OutputInterface $output): int
37 | {
38 | $client = new Client();
39 |
40 | /**
41 | * @psalm-suppress MixedAssignment
42 | *
43 | * @var string
44 | */
45 | $url = $input->getArgument('url');
46 | $request = new Request('GET', $url, static fn () => yield from []);
47 | $response = new Response(
48 | $client->send($request->getPsrRequest()),
49 | $request,
50 | );
51 |
52 | $output->writeln(
53 | <<
55 | Available variables:
56 | \$response: <{$response->getStatus()} '{$url}'>
57 | \$html: Raw HTML contents of response
58 | Commands:
59 | fetch Fetch URL and update the \$response and \$html objects
60 |
61 | TEXT
62 | );
63 |
64 | /** @var Shell $app */
65 | $app = $this->getApplication();
66 | $app->setScopeVariables([
67 | 'response' => $response,
68 | 'html' => $response->getBody(),
69 | ]);
70 |
71 | return 0;
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/Shell/Commands/RunSpiderCommand.php:
--------------------------------------------------------------------------------
1 | addArgument('spider', InputArgument::REQUIRED, 'The spider class to execute');
35 | }
36 |
/**
 * Resolves the "spider" argument to a spider class and starts it.
 *
 * Prints a message and returns FAILURE when the resolver rejects the name.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $namespaceResolver = Roach::resolve(NamespaceResolverInterface::class);

    try {
        /** @phpstan-ignore argument.type */
        $resolvedSpider = $namespaceResolver->resolveSpiderNamespace($input->getArgument('spider'));
    } catch (InvalidSpiderException $invalidSpider) {
        $message = \sprintf('Invalid spider: %s', $invalidSpider->getMessage());
        $output->writeln($message);

        return self::FAILURE;
    }

    Roach::startSpider($resolvedSpider);

    return self::SUCCESS;
}
54 | }
55 |
--------------------------------------------------------------------------------
/src/Shell/InvalidSpiderException.php:
--------------------------------------------------------------------------------
1 | addArgument('url', InputArgument::REQUIRED, 'The URL to fetch');
40 | }
41 |
/**
 * Builds a shell with the project's casters and the fetch command, runs one
 * initial fetch for the "url" argument and then starts the shell.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $input->setOption('ansi', true);

    /** @psalm-suppress MixedAssignment */
    $url = $input->getArgument('url');

    $shellConfig = Configuration::fromInput($input);
    $shellConfig->addCasters([
        Crawler::class => 'RoachPHP\Shell\ShellCaster::castCrawler',
        Link::class => 'RoachPHP\Shell\ShellCaster::castLink',
        Response::class => 'RoachPHP\Shell\ShellCaster::castResponse',
    ]);
    $shellConfig->addCommands([new FetchCommand()]);

    $shell = new Shell($shellConfig);

    // Run the fetch command once before the shell itself is started.
    $fetchCommand = $shell->find('fetch');
    $fetchInput = new ArrayInput(['url' => $url]);
    $fetchCommand->run($fetchInput, $output);

    $shell->run();

    return 0;
}
66 | }
67 |
--------------------------------------------------------------------------------
/src/Shell/Resolver/DefaultNamespaceResolverDecorator.php:
--------------------------------------------------------------------------------
1 | defaultNamespace = \mb_trim($defaultNamespace, " \t\n\r\0\x0B\\");
28 | }
29 |
/**
 * Prefixes the spider name with the default namespace unless it starts with
 * a backslash, already starts with the default namespace, or names an
 * existing class; the result is handed to the wrapped resolver.
 *
 * @throws InvalidSpiderException
 *
 * @return class-string
 */
public function resolveSpiderNamespace(string $spiderClass): string
{
    $candidate = \mb_trim($spiderClass);

    $needsNoPrefix = \str_starts_with($candidate, '\\')
        || \str_starts_with($candidate, $this->defaultNamespace)
        || \class_exists($candidate);

    if (!$needsNoPrefix) {
        $candidate = $this->defaultNamespace . '\\' . $candidate;
    }

    return $this->wrapped->resolveSpiderNamespace($candidate);
}
49 | }
50 |
--------------------------------------------------------------------------------
/src/Shell/Resolver/FakeNamespaceResolver.php:
--------------------------------------------------------------------------------
1 | $spiderClass
25 | *
26 | * @return class-string
27 | */
public function resolveSpiderNamespace(string $spiderClass): string
{
    // No lookup and no validation: the given name is returned untouched.
    $resolved = $spiderClass;

    return $resolved;
}
32 | }
33 |
--------------------------------------------------------------------------------
/src/Shell/Resolver/NamespaceResolverInterface.php:
--------------------------------------------------------------------------------
1 |
26 | */
27 | public function resolveSpiderNamespace(string $spiderClass): string;
28 | }
29 |
--------------------------------------------------------------------------------
/src/Shell/Resolver/StaticNamespaceResolver.php:
--------------------------------------------------------------------------------
1 | $spiderClass
23 | *
24 | * @throws \ReflectionException
25 | * @throws InvalidSpiderException
26 | *
27 | * @return class-string
28 | */
public function resolveSpiderNamespace(string $spiderClass): string
{
    // Arms are checked top to bottom, so isSpider() only runs for classes
    // that exist.
    $problem = match (true) {
        !\class_exists($spiderClass) => "The spider class {$spiderClass} does not exist",
        !$this->isSpider($spiderClass) => "The class {$spiderClass} is not a spider",
        default => null,
    };

    if (null !== $problem) {
        throw new InvalidSpiderException($problem);
    }

    return $spiderClass;
}
41 |
/**
 * Tells whether the given class implements SpiderInterface.
 *
 * @param class-string $spiderClass
 *
 * @throws \ReflectionException
 */
private function isSpider(string $spiderClass): bool
{
    $reflection = new \ReflectionClass($spiderClass);

    return $reflection->implementsInterface(SpiderInterface::class);
}
51 | }
52 |
--------------------------------------------------------------------------------
/src/Shell/ShellCaster.php:
--------------------------------------------------------------------------------
1 | $response->getStatus(),
27 | Caster::PREFIX_VIRTUAL . '.uri' => $response->getUri(),
28 | ];
29 | }
30 |
/**
 * Describes a Crawler by its node count and outer HTML, both exposed as
 * virtual properties.
 */
public static function castCrawler(Crawler $crawler): array
{
    $countKey = Caster::PREFIX_VIRTUAL . '.count';
    $htmlKey = Caster::PREFIX_VIRTUAL . '.html';

    return [
        $countKey => $crawler->count(),
        $htmlKey => $crawler->outerHtml(),
    ];
}
38 |
/**
 * Describes a Link by its URI, exposed under the protected-property prefix.
 */
public static function castLink(Link $link): array
{
    $uriKey = Caster::PREFIX_PROTECTED . '.uri';

    return [$uriKey => $link->getUri()];
}
45 | }
46 |
--------------------------------------------------------------------------------
/src/Spider/AbstractSpider.php:
--------------------------------------------------------------------------------
1 | configuration = $loaderStrategy->load();
30 | }
31 |
32 | /**
33 | * @psalm-return \Generator
34 | */
35 | abstract public function parse(Response $response): \Generator;
36 |
/**
 * Final public accessor that returns whatever initialRequests() produces.
 *
 * @return array
 */
final public function getInitialRequests(): array
{
    $requests = $this->initialRequests();

    return $requests;
}
44 |
/**
 * Replaces the spider's configuration in place (nothing is returned).
 */
final public function withConfiguration(Configuration $configuration): void
{
    $this->configuration = $configuration;
}
49 |
/**
 * Replaces the spider's context array in place (nothing is returned).
 */
final public function withContext(array $context): void
{
    $this->context = $context;
}
54 |
/**
 * Returns the configuration currently held by the spider.
 */
final public function loadConfiguration(): Configuration
{
    $configuration = $this->configuration;

    return $configuration;
}
59 |
/**
 * Creates a request result whose parse callback is the method named
 * $parseMethod on this spider.
 */
protected function request(
    string $method,
    string $url,
    string $parseMethod = 'parse',
    array $options = [],
): ParseResult {
    $parseCallback = [$this, $parseMethod];

    // @phpstan-ignore argument.type
    return ParseResult::request($method, $url, $parseCallback, $options);
}
69 |
/**
 * Wraps an item in a ParseResult: ItemInterface instances go through
 * fromValue(), plain arrays through item().
 */
protected function item(array|ItemInterface $item): ParseResult
{
    return $item instanceof ItemInterface
        ? ParseResult::fromValue($item)
        : ParseResult::item($item);
}
78 |
/**
 * Builds one GET request per configured start URL, each using parse() as
 * its callback.
 *
 * @return array
 */
protected function initialRequests(): array
{
    return \array_map(
        fn (string $url) => new Request('GET', $url, [$this, 'parse']),
        $this->configuration->startUrls,
    );
}
88 | }
89 |
--------------------------------------------------------------------------------
/src/Spider/BasicSpider.php:
--------------------------------------------------------------------------------
1 |
31 | */
32 | public array $startUrls = [];
33 |
34 | /**
35 | * @var list<class-string>
36 | */
37 | public array $spiderMiddleware = [];
38 |
39 | /**
40 | * @var list<class-string>
41 | */
42 | public array $downloaderMiddleware = [
43 | RequestDeduplicationMiddleware::class,
44 | HttpErrorMiddleware::class,
45 | ];
46 |
47 | /**
48 | * @var list<class-string>
49 | */
50 | public array $itemProcessors = [];
51 |
52 | /**
53 | * @var list<class-string>
54 | */
55 | public array $extensions = [
56 | LoggerExtension::class,
57 | StatsCollectorExtension::class,
58 | ];
59 |
60 | public int $concurrency = 5;
61 |
62 | public int $requestDelay = 1;
63 |
/**
 * Passes the current values of the public configuration properties to the
 * parent constructor via an ArrayLoader.
 */
public function __construct()
{
    $settings = [
        'startUrls' => $this->startUrls,
        'downloaderMiddleware' => $this->downloaderMiddleware,
        'spiderMiddleware' => $this->spiderMiddleware,
        'itemProcessors' => $this->itemProcessors,
        'extensions' => $this->extensions,
        'concurrency' => $this->concurrency,
        'requestDelay' => $this->requestDelay,
    ];

    parent::__construct(new ArrayLoader($settings));
}
76 | }
77 |
--------------------------------------------------------------------------------
/src/Spider/Configuration/ArrayLoader.php:
--------------------------------------------------------------------------------
1 | [],
29 | * spiderMiddleware: class-string[],
30 | * itemProcessors: class-string[],
31 | * extensions: class-string[],
32 | * concurrency: int,
33 | * requestDelay: int
34 | * }
35 | */
36 | private array $config;
37 |
/**
 * Resolves the given configuration against the built-in defaults and
 * stores the result.
 */
public function __construct(array $configuration)
{
    $defaults = [
        'startUrls' => [],
        'downloaderMiddleware' => [],
        'itemProcessors' => [],
        'spiderMiddleware' => [],
        'extensions' => [],
        'concurrency' => 5,
        'requestDelay' => 0,
    ];

    $optionsResolver = new OptionsResolver();
    $optionsResolver->setDefaults($defaults);

    // @phpstan-ignore assign.propertyType
    $this->config = $optionsResolver->resolve($configuration);
}
55 |
/**
 * Creates a Configuration object from the resolved option values.
 */
public function load(): Configuration
{
    $resolved = $this->config;

    return new Configuration(
        $resolved['startUrls'],
        $resolved['downloaderMiddleware'],
        $resolved['itemProcessors'],
        $resolved['spiderMiddleware'],
        $resolved['extensions'],
        $resolved['concurrency'],
        $resolved['requestDelay'],
    );
}
68 | }
69 |
--------------------------------------------------------------------------------
/src/Spider/Configuration/Configuration.php:
--------------------------------------------------------------------------------
1 | $startUrls
25 | * @param array<class-string> $downloaderMiddleware
26 | * @param array<class-string> $itemProcessors
27 | * @param array<class-string> $spiderMiddleware
28 | * @param array<class-string> $extensions
29 | */
30 | public function __construct(
31 | public array $startUrls,
32 | public array $downloaderMiddleware,
33 | public array $itemProcessors,
34 | public array $spiderMiddleware,
35 | public array $extensions,
36 | public int $concurrency,
37 | public int $requestDelay,
38 | ) {
39 | }
40 |
/**
 * Returns a new Configuration in which every value present in $overrides
 * replaces the corresponding current value.
 */
public function withOverrides(Overrides $overrides): self
{
    $current = [
        'startUrls' => $this->startUrls,
        'downloaderMiddleware' => $this->downloaderMiddleware,
        'spiderMiddleware' => $this->spiderMiddleware,
        'extensions' => $this->extensions,
        'itemProcessors' => $this->itemProcessors,
        'concurrency' => $this->concurrency,
        'requestDelay' => $this->requestDelay,
    ];

    // With string keys the later entries win, exactly like array_merge().
    $merged = [...$current, ...$overrides->toArray()];

    // String keys are unpacked as named constructor arguments.
    return new self(...$merged);
}
55 | }
56 |
--------------------------------------------------------------------------------
/src/Spider/Configuration/Overrides.php:
--------------------------------------------------------------------------------
1 | $startUrls
28 | * @param null|list<class-string> $downloaderMiddleware
29 | * @param null|list<class-string> $spiderMiddleware
30 | * @param null|list<class-string> $itemProcessors
31 | * @param null|list<class-string> $extensions
32 | */
33 | public function __construct(
34 | public ?array $startUrls = null,
35 | public ?array $downloaderMiddleware = null,
36 | public ?array $spiderMiddleware = null,
37 | public ?array $itemProcessors = null,
38 | public ?array $extensions = null,
39 | public ?int $concurrency = null,
40 | public ?int $requestDelay = null,
41 | ) {
42 | }
43 |
/**
 * Returns only the overrides that were actually set (i.e. are not null),
 * keyed by option name.
 *
 * @psalm-suppress MoreSpecificReturnType, LessSpecificReturnStatement
 *
 * @return array{
 *     startUrls?: string[],
 *     downloaderMiddleware?: class-string[],
 *     spiderMiddleware?: class-string[],
 *     itemProcessors?: class-string[],
 *     extensions?: class-string[],
 *     concurrency?: int,
 *     requestDelay?: int,
 * }
 */
public function toArray(): array
{
    $candidates = [
        'startUrls' => $this->startUrls,
        'downloaderMiddleware' => $this->downloaderMiddleware,
        'spiderMiddleware' => $this->spiderMiddleware,
        'itemProcessors' => $this->itemProcessors,
        'extensions' => $this->extensions,
        'concurrency' => $this->concurrency,
        'requestDelay' => $this->requestDelay,
    ];

    $overrides = [];

    foreach ($candidates as $name => $value) {
        if (null !== $value) {
            $overrides[$name] = $value;
        }
    }

    return $overrides;
}
69 | }
70 |
--------------------------------------------------------------------------------
/src/Spider/ConfigurationLoaderStrategy.php:
--------------------------------------------------------------------------------
1 | getRequest()->getMeta('depth', 1);
28 | $newDepth = $currentDepth + 1;
29 |
30 | if ($this->option('maxCrawlDepth') < $newDepth) {
31 | return $request->drop('Maximum crawl depth reached');
32 | }
33 |
34 | return $request->withMeta('depth', $currentDepth + 1);
35 | }
36 |
/**
 * Default option values for this middleware.
 */
private function defaultOptions(): array
{
    $defaults = ['maxCrawlDepth' => 10];

    return $defaults;
}
43 | }
44 |
--------------------------------------------------------------------------------
/src/Spider/Middleware/RequestMiddlewareInterface.php:
--------------------------------------------------------------------------------
1 | middleware instanceof ItemMiddlewareInterface) {
44 | return $this->middleware->handleItem($item, $response);
45 | }
46 |
47 | return $item;
48 | }
49 |
/**
 * Delegates to the wrapped middleware when it handles requests; otherwise
 * returns the request unchanged.
 */
public function handleRequest(Request $request, Response $response): Request
{
    if (!$this->middleware instanceof RequestMiddlewareInterface) {
        return $request;
    }

    return $this->middleware->handleRequest($request, $response);
}
58 |
/**
 * Delegates to the wrapped middleware when it handles responses; otherwise
 * returns the response unchanged.
 */
public function handleResponse(Response $response): Response
{
    if (!$this->middleware instanceof ResponseMiddlewareInterface) {
        return $response;
    }

    return $this->middleware->handleResponse($response);
}
67 |
/**
 * Forwards the options to the wrapped middleware.
 */
public function configure(array $options): void
{
    $wrapped = $this->middleware;
    $wrapped->configure($options);
}
72 |
/**
 * Returns the wrapped middleware instance.
 */
public function getMiddleware(): ItemMiddlewareInterface|RequestMiddlewareInterface|ResponseMiddlewareInterface
{
    $wrapped = $this->middleware;

    return $wrapped;
}
77 | }
78 |
--------------------------------------------------------------------------------
/src/Spider/ParseResult.php:
--------------------------------------------------------------------------------
1 | value;
40 | }
41 |
/**
 * Creates a result wrapping a newly built Request.
 *
 * @param callable(Response): \Generator $parseCallback
 */
public static function request(
    string $method,
    string $url,
    callable $parseCallback,
    array $options = [],
): self {
    $request = new Request($method, $url, $parseCallback, $options);

    return new self($request);
}
53 |
/**
 * Calls $ifRequest with the wrapped value when it is a Request, and
 * $ifItem with it otherwise.
 *
 * @param \Closure(Request): void $ifRequest
 * @param \Closure(ItemInterface): void $ifItem
 */
public function apply(\Closure $ifRequest, \Closure $ifItem): void
{
    $callback = $this->value instanceof Request ? $ifRequest : $ifItem;

    $callback($this->value);
}
66 | }
67 |
--------------------------------------------------------------------------------
/src/Spider/Processor.php:
--------------------------------------------------------------------------------
1 |
28 | */
29 | private array $middleware = [];
30 |
31 | public function __construct(private EventDispatcherInterface $eventDispatcher)
32 | {
33 | }
34 |
/**
 * Replaces the middleware stack with the given handlers and returns the
 * same processor instance for chaining.
 */
public function withMiddleware(SpiderMiddlewareInterface ...$middleware): self
{
    $this->middleware = $middleware;

    return $this;
}
41 |
/**
 * Runs a response through the middleware stack, invokes the request's
 * callback and yields every resulting request or item that was not dropped.
 *
 * A dropped response is announced via ResponseDropped and ends processing
 * before the callback runs. Dropped requests/items are announced via
 * RequestDropped/ItemDropped and are not yielded.
 */
public function handle(Response $response): \Generator
{
    foreach ($this->middleware as $middleware) {
        $response = $middleware->handleResponse($response);

        if (!$response->wasDropped()) {
            continue;
        }

        $this->eventDispatcher->dispatch(
            new ResponseDropped($response),
            ResponseDropped::NAME,
        );

        return;
    }

    /** @var iterable<ParseResult> $results */
    $results = $response->getRequest()->callback($response);

    foreach ($results as $result) {
        $value = $result->value();
        // Chosen once per result, from the value the callback produced.
        $handleMethod = $value instanceof Request ? 'handleRequest' : 'handleItem';

        foreach ($this->middleware as $middleware) {
            /** @var ItemInterface|Request $value */
            $value = $middleware->{$handleMethod}($value, $response);

            if (!$value->wasDropped()) {
                continue;
            }

            // The first middleware that drops the value ends the chain.
            [$event, $eventName] = $value instanceof Request
                ? [new RequestDropped($value), RequestDropped::NAME]
                : [new ItemDropped($value), ItemDropped::NAME];

            $this->eventDispatcher->dispatch($event, $eventName);

            break;
        }

        if ($value->wasDropped()) {
            continue;
        }

        yield ParseResult::fromValue($value);
    }
}
92 | }
93 |
--------------------------------------------------------------------------------
/src/Spider/SpiderInterface.php:
--------------------------------------------------------------------------------
1 |
38 | */
39 | public function getInitialRequests(): array;
40 | }
41 |
--------------------------------------------------------------------------------
/src/Spider/SpiderMiddlewareInterface.php:
--------------------------------------------------------------------------------
1 | $options
26 | *
27 | * @return array{0: class-string, 1: array}
28 | */
public static function withOptions(array $options): array
{
    // static::class is the class the method was called on (late static
    // binding), paired with the options to configure it with.
    $classWithOptions = [static::class, $options];

    return $classWithOptions;
}
33 |
/**
 * Resolves $options against defaultOptions() and stores the result.
 *
 * Only the first call has an effect; once options are resolved, further
 * calls return immediately.
 *
 * @param array $options
 */
final public function configure(array $options): void
{
    if ($this->optionsResolved) {
        return;
    }

    $optionsResolver = new OptionsResolver();
    $optionsResolver->setDefaults($this->defaultOptions());

    $this->resolvedOptions = $optionsResolver->resolve($options);
    $this->optionsResolved = true;

    // One-time hook, runs after the resolved options are available.
    $this->onAfterConfigured();
}
52 |
/**
 * Returns the resolved value of the given option, or null when the key is
 * not set. Resolves the defaults first if configure() has not run yet.
 */
public function option(string $key): mixed
{
    if (!$this->optionsResolved) {
        $this->configure([]);
    }

    $value = $this->resolvedOptions[$key] ?? null;

    return $value;
}
61 |
/**
 * Default option values passed to the options resolver in configure().
 * This base implementation defines none.
 *
 * @return array
 */
private function defaultOptions(): array
{
    $defaults = [];

    return $defaults;
}
69 |
/**
 * Hook invoked by `configure` right after the options were resolved for
 * the first time. Intended for one-time setup that has to happen before
 * the run starts; this base implementation does nothing.
 */
private function onAfterConfigured(): void
{
    // Intentionally empty.
}
78 | }
79 |
--------------------------------------------------------------------------------
/src/Support/ConfigurableInterface.php:
--------------------------------------------------------------------------------
1 | $options
20 | */
21 | public function configure(array $options): void;
22 | }
23 |
--------------------------------------------------------------------------------
/src/Support/Droppable.php:
--------------------------------------------------------------------------------
1 | dropped = true;
26 | $clone->dropReason = $reason;
27 |
28 | return $clone;
29 | }
30 |
/**
 * Tells whether this object has been marked as dropped.
 */
public function wasDropped(): bool
{
    $dropped = $this->dropped;

    return $dropped;
}
35 |
/**
 * Returns the stored drop reason.
 */
public function getDropReason(): string
{
    $reason = $this->dropReason;

    return $reason;
}
40 | }
41 |
--------------------------------------------------------------------------------
/src/Support/DroppableInterface.php:
--------------------------------------------------------------------------------
1 | meta[$key] ?? $default;
23 | }
24 |
/**
 * Returns a clone of this object with the given meta entry set; the
 * current instance is left unchanged.
 */
public function withMeta(string $key, mixed $value): static
{
    $copy = clone $this;
    $copy->meta[$key] = $value;

    return $copy;
}
32 | }
33 |
--------------------------------------------------------------------------------
/src/Testing/Concerns/InteractsWithRequestsAndResponses.php:
--------------------------------------------------------------------------------
1 | makeRequest();
33 |
34 | $processedRequest = $middleware->handleRequest($request);
35 |
36 | self::assertSame($jar, $processedRequest->getOptions()['cookies']);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/tests/Downloader/Middleware/ExecuteJavascriptMiddlewareTest.php:
--------------------------------------------------------------------------------
1 | makeResponse(
32 | $this->makeRequest('http://localhost:8000/javascript'),
33 | );
34 | $middleware = new ExecuteJavascriptMiddleware(new FakeLogger());
35 |
36 | $processedResponse = $middleware->handleResponse($response);
37 |
38 | self::assertSame('Headline', $processedResponse->filter('#content h1')->text(''));
39 | self::assertSame('I was loaded via Javascript!', $processedResponse->filter('#content p')->text(''));
40 | }
41 |
public function testDropResponseIfExceptionOccursWhileExecutingJavascript(): void
{
    // Browsershot double whose bodyHtml() always throws.
    $failingBrowsershot = new class() extends Browsershot {
        public function bodyHtml(): string
        {
            throw new \Exception('::exception-message::');
        }
    };
    $browsershotFactory = static fn (string $uri): Browsershot => $failingBrowsershot->setUrl($uri);
    $middleware = new ExecuteJavascriptMiddleware(new FakeLogger(), $browsershotFactory);

    $result = $middleware->handleResponse($this->makeResponse());

    self::assertTrue($result->wasDropped());
}
59 |
public function testLogErrors(): void
{
    $logger = new FakeLogger();
    // Browsershot double whose bodyHtml() always throws.
    $failingBrowsershot = new class() extends Browsershot {
        public function bodyHtml(): string
        {
            throw new \Exception('::exception-message::');
        }
    };
    $browsershotFactory = static fn (string $uri): Browsershot => $failingBrowsershot->setUrl($uri);
    $middleware = new ExecuteJavascriptMiddleware($logger, $browsershotFactory);

    $middleware->handleResponse($this->makeResponse());

    $wasLogged = $logger->messageWasLogged(
        'info',
        '[ExecuteJavascriptMiddleware] Error while executing javascript',
    );
    self::assertTrue($wasLogged);
}
83 |
public function testUsesTheProvidedUserAgentOption(): void
{
    $browsershot = $this->createMock(Browsershot::class);
    $request = $this->makeRequest('http://localhost:8000/javascript');
    $response = $this->makeResponse($request);
    $middleware = new ExecuteJavascriptMiddleware(
        new FakeLogger(),
        static fn (string $uri): Browsershot => $browsershot,
    );
    $middleware->configure(['userAgent' => 'custom']);

    // Expect exactly one userAgent() call carrying the configured value.
    $browsershot
        ->expects(self::once())
        ->method('userAgent')
        ->with(self::equalTo('custom'));

    $middleware->handleResponse($response);
}
102 | }
103 |
--------------------------------------------------------------------------------
/tests/Downloader/Middleware/RobotsTxtMiddlewareTest.php:
--------------------------------------------------------------------------------
1 | engine = new Engine(
50 | new ArrayRequestScheduler(new FakeClock()),
51 | new Downloader(new Client(), $dispatcher),
52 | new ItemPipeline($dispatcher),
53 | new Processor($dispatcher),
54 | $dispatcher,
55 | );
56 |
57 | $middleware = new RobotsTxtMiddleware();
58 | $middleware->configure(['fileName' => 'robots']);
59 | $this->middleware = DownloaderMiddlewareAdapter::fromMiddleware($middleware);
60 | }
61 |
public function testOnlyRequestsRobotsTxtOnceForRequestsToSameDomain(): void
{
    // The first response yields a second request to the same host.
    $parseCallback = static fn () => yield ParseResult::fromValue(self::makeRequest('http://localhost:8000/test2'));
    $initialRequest = new Request('GET', 'http://localhost:8000/test1', $parseCallback);
    $run = new Run(
        [$initialRequest],
        '::namespace::',
        downloaderMiddleware: [$this->middleware],
    );

    $this->engine->start($run);

    $this->assertRouteWasCrawledTimes('/robots', 1);
}
75 |
public function testAllowsRequestIfAllowedByRobotsTxt(): void
{
    $initialRequests = [self::makeRequest('http://localhost:8000/test1')];
    $run = new Run(
        $initialRequests,
        '::namespace::',
        downloaderMiddleware: [$this->middleware],
    );

    $this->engine->start($run);

    $this->assertRouteWasCrawled('/test1');
}
88 |
public function testDropRequestIfForbiddenByRobotsTxt(): void
{
    $initialRequests = [self::makeRequest('http://localhost:8000/test2')];
    $run = new Run(
        $initialRequests,
        '::namespace::',
        downloaderMiddleware: [$this->middleware],
    );

    $this->engine->start($run);

    $this->assertRouteWasNotCrawled('/test2');
}
101 | }
102 |
--------------------------------------------------------------------------------
/tests/Downloader/Middleware/UserAgentMiddlewareTest.php:
--------------------------------------------------------------------------------
1 | configure([]);
34 |
35 | $request = $middleware->handleRequest($this->makeRequest());
36 |
37 | self::assertTrue($request->hasHeader('User-Agent'));
38 | self::assertSame('roach-php', $request->getHeader('User-Agent')[0]);
39 | }
40 |
public function testSetCustomUserAgentOnRequest(): void
{
    $middleware = new UserAgentMiddleware();
    $middleware->configure(['userAgent' => 'custom']);

    $processedRequest = $middleware->handleRequest($this->makeRequest());

    self::assertTrue($processedRequest->hasHeader('User-Agent'));

    $headerValues = $processedRequest->getHeader('User-Agent');
    self::assertSame('custom', $headerValues[0]);
}
51 | }
52 |
--------------------------------------------------------------------------------
/tests/Downloader/Proxy/ArrayConfigurationLoaderTest.php:
--------------------------------------------------------------------------------
1 | [
27 | 'https' => '::https-proxy-1::',
28 | 'http' => '::http-proxy-1::',
29 | 'no' => ['::no-1::'],
30 | ],
31 | '::host-2::' => [
32 | 'https' => '::https-proxy-2::',
33 | 'http' => '::http-proxy-2::',
34 | 'no' => [],
35 | ],
36 | '::host-3::' => [
37 | 'no' => ['::no-3::'],
38 | ],
39 | ]);
40 |
41 | $proxy = $loader->loadProxyConfiguration();
42 | self::assertEquals(
43 | new Proxy([
44 | '::host-1::' => new ProxyOptions(
45 | '::http-proxy-1::',
46 | '::https-proxy-1::',
47 | ['::no-1::'],
48 | ),
49 | '::host-2::' => new ProxyOptions(
50 | '::http-proxy-2::',
51 | '::https-proxy-2::',
52 | [],
53 | ),
54 | '::host-3::' => new ProxyOptions(
55 | null,
56 | null,
57 | ['::no-3::'],
58 | ),
59 | ]),
60 | $proxy,
61 | );
62 | }
63 |
public function testCreatesAWildcardProxyIfOnlyAURLIsProvided(): void
{
    $loader = new ArrayConfigurationLoader('::proxy-url::');

    $actual = $loader->loadProxyConfiguration();

    // A bare URL is expected to end up under the "*" key.
    $expected = new Proxy([
        '*' => ProxyOptions::allProtocols('::proxy-url::'),
    ]);
    self::assertEquals($expected, $actual);
}
77 |
public function testConfiguresTheSameURLForAllProtocolsIfOnlyAURLIsProvided(): void
{
    $loader = new ArrayConfigurationLoader(['::host::' => '::proxy-url::']);

    $actual = $loader->loadProxyConfiguration();

    $expected = new Proxy([
        '::host::' => ProxyOptions::allProtocols('::proxy-url::'),
    ]);
    self::assertEquals($expected, $actual);
}
93 | }
94 |
--------------------------------------------------------------------------------
/tests/Downloader/Proxy/ProxyTest.php:
--------------------------------------------------------------------------------
1 | optionsFor($this->makeRequest());
33 |
34 | self::assertTrue($options->equals(ProxyOptions::make()));
35 | }
36 |
public function testReturnMatchingProxyOptionsForRequestIfConfigured(): void
{
    $proxy = new Proxy([
        'domain-1.com' => ProxyOptions::make()
            ->allProtocols('::proxy-url-1::'),
        'domain-2.com' => ProxyOptions::make()
            ->allProtocols('::proxy-url-2::'),
    ]);

    // Each domain must resolve to the options configured for it.
    $expectedProxyUrls = [
        'https://domain-1.com' => '::proxy-url-1::',
        'https://domain-2.com' => '::proxy-url-2::',
    ];

    foreach ($expectedProxyUrls as $requestUrl => $proxyUrl) {
        $options = $proxy->optionsFor($this->makeRequest($requestUrl));

        self::assertTrue(
            $options->equals(
                ProxyOptions::make()->allProtocols($proxyUrl),
            ),
        );
    }
}
64 |
public function testReturnsWildcardOptionsIfConfiguredAndDomainDoesntMatch(): void
{
    $proxy = new Proxy([
        'domain-1.com' => ProxyOptions::make()
            ->allProtocols('::proxy-url-1::'),
        '*' => ProxyOptions::make()
            ->allProtocols('::proxy-url-2::'),
    ]);
    $unmatchedRequest = $this->makeRequest('https://domain-2.com');

    $options = $proxy->optionsFor($unmatchedRequest);

    $wildcardOptions = ProxyOptions::make()->allProtocols('::proxy-url-2::');
    self::assertTrue($options->equals($wildcardOptions));
}
83 |
public function testPreferDomainConfigurationOverWildcard(): void
{
    $proxy = new Proxy([
        'domain-1.com' => ProxyOptions::make()
            ->allProtocols('::proxy-url-1::'),
        '*' => ProxyOptions::make()
            ->allProtocols('::proxy-url-2::'),
    ]);
    $matchingRequest = $this->makeRequest('https://domain-1.com');

    $options = $proxy->optionsFor($matchingRequest);

    $domainOptions = ProxyOptions::make()->allProtocols('::proxy-url-1::');
    self::assertTrue($options->equals($domainOptions));
}
102 | }
103 |
--------------------------------------------------------------------------------
/tests/Events/FakeDispatcherTest.php:
--------------------------------------------------------------------------------
1 | dispatcher = new FakeDispatcher();
30 | }
31 |
public function testAssertDispatchedPassesIfEventWasDispatched(): void
{
    $this->dispatcher->dispatch(new FakeEvent(), 'event.name');

    // Must not raise an assertion failure.
    $this->dispatcher->assertDispatched('event.name');
}
39 |
public function testAssertDispatchedFailsIfNoEventWasDispatched(): void
{
    // Nothing was dispatched, so the assertion itself has to fail.
    $this->expectException(AssertionFailedError::class);

    $this->dispatcher->assertDispatched('event.name');
}
45 |
public function testAssertDispatchedFailsIfCallbackReturnsFalse(): void
{
    $this->dispatcher->dispatch(new FakeEvent(), 'event.name');
    $rejectEverything = static fn (FakeEvent $event) => false;

    $this->expectException(AssertionFailedError::class);

    $this->dispatcher->assertDispatched('event.name', $rejectEverything);
}
53 |
public function testAssertDispatchedPassesIfCallbackReturnsTrue(): void
{
    $this->dispatcher->dispatch(new FakeEvent(), 'event.name');
    $acceptEverything = static fn (FakeEvent $event) => true;

    $this->dispatcher->assertDispatched('event.name', $acceptEverything);
}
60 |
public function testAssertNotDispatched(): void
{
    // Passes while nothing has been dispatched ...
    $this->dispatcher->assertNotDispatched('event.name');

    // ... and fails once the event went through the dispatcher.
    $this->dispatcher->dispatch(new FakeEvent(), 'event.name');
    $this->expectException(AssertionFailedError::class);
    $this->dispatcher->assertNotDispatched('event.name');
}
71 |
public function testRunEventListeners(): void
{
    $listenerRan = false;
    $listener = static function () use (&$listenerRan): void {
        $listenerRan = true;
    };
    $this->dispatcher->listen('event.name', $listener);

    $this->dispatcher->dispatch(new FakeEvent(), 'event.name');

    self::assertTrue($listenerRan);
}
83 | }
84 |
85 | final class FakeEvent // minimal event stub dispatched by the tests above
86 | {
87 | public function __construct(
88 | public array $data = [],
89 | ) {}
90 | }
91 |
--------------------------------------------------------------------------------
/tests/Extensions/ExtensionTestCase.php:
--------------------------------------------------------------------------------
1 | dispatcher = new FakeDispatcher();
33 | $this->extension = $this->createExtension();
34 |
35 | $this->dispatcher->addSubscriber($this->extension);
36 | }
37 |
38 | abstract protected function createExtension(): ExtensionInterface; // returns the extension that is stored in $this->extension and subscribed to the dispatcher
39 |
40 | protected function dispatch(Event $event, string $eventName): void
41 | {
42 | $this->dispatcher->dispatch($event, $eventName); // forwarded unchanged to the dispatcher the extension is subscribed to
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/tests/Extensions/MaxRequestExtensionTest.php:
--------------------------------------------------------------------------------
1 | extension->configure(['limit' => $threshold]);
35 |
36 | for ($i = 0; $threshold - 1 > $i; ++$i) {
37 | $this->dispatch(
38 | new RequestSending($this->makeRequest()),
39 | RequestSending::NAME,
40 | );
41 | }
42 |
43 | $event = new RequestScheduling($this->makeRequest());
44 | $this->dispatch($event, RequestScheduling::NAME);
45 |
46 | self::assertFalse($event->request->wasDropped());
47 | }
48 |
49 | /**
50 | * @dataProvider thresholdProvider
51 | */
52 | public function testDropRequestAfterThresholdWasReached(int $threshold): void
53 | {
54 | $this->extension->configure(['limit' => $threshold]);
55 |
56 | // Announce exactly $threshold outgoing requests.
57 | for ($sent = 1; $sent <= $threshold; ++$sent) {
58 | $sending = new RequestSending($this->makeRequest());
59 | $this->dispatch($sending, RequestSending::NAME);
60 | }
61 |
62 | // One more request is scheduled after those $threshold sends.
63 | $scheduling = new RequestScheduling($this->makeRequest());
64 | $this->dispatch($scheduling, RequestScheduling::NAME);
65 |
66 | self::assertTrue($scheduling->request->wasDropped());
67 | }
68 |
69 | public static function thresholdProvider(): iterable
70 | {
71 | // Every data set carries a single request limit.
72 | yield from [
73 | [1],
74 | [2],
75 | [3],
76 | [4],
77 | ];
78 | }
79 |
80 | protected function createExtension(): ExtensionInterface
81 | {
82 | return new MaxRequestExtension(); // the extension exercised by the tests in this class
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/tests/Extensions/ScrapedItemCollectorExtensionTest.php:
--------------------------------------------------------------------------------
1 | extension->configure([]);
28 |
29 | self::assertEmpty($this->extension->getScrapedItems());
30 |
31 | $item = new Item(['::key::' => '::value::']);
32 | $this->dispatch(new ItemScraped($item), ItemScraped::NAME);
33 |
34 | self::assertEquals([$item], $this->extension->getScrapedItems());
35 | }
36 |
37 | protected function createExtension(): ScrapedItemCollectorExtension
38 | {
39 | return new ScrapedItemCollectorExtension(); // fresh collector, constructed without arguments
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/tests/Fixtures/Extension.php:
--------------------------------------------------------------------------------
1 | '::default-option-value::',
32 | ];
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/tests/Fixtures/ItemProcessor.php:
--------------------------------------------------------------------------------
1 | '::default-option-value::',
33 | ];
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/tests/Fixtures/ItemSpiderMiddleware.php:
--------------------------------------------------------------------------------
1 | '::default-option-value::',
33 | ];
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/tests/Fixtures/RequestSpiderMiddleware.php:
--------------------------------------------------------------------------------
1 | '::default-option-value::',
34 | ];
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/tests/Fixtures/ResponseDownloaderMiddleware.php:
--------------------------------------------------------------------------------
1 | > $handledItemClasses
23 | */
24 | public function __construct(private array $handledItemClasses) // promoted property, handed back verbatim by getHandledItemClasses()
25 | {
26 | }
27 |
28 | public function processItem(ItemInterface $item): ItemInterface
29 | {
30 | return $item->drop('::reason::'); // every item passed in is dropped with this fixed reason
31 | }
32 |
33 | protected function getHandledItemClasses(): array
34 | {
35 | return $this->handledItemClasses; // exactly the list given to the constructor
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/tests/Fixtures/TestItem.php:
--------------------------------------------------------------------------------
1 | client = new FakeClient();
34 | }
35 |
36 | public function testAssertRequestWasSent(): void
37 | {
38 | $sent = [$this->makeRequest('::url-a::'), $this->makeRequest('::url-b::')];
39 | $neverSent = $this->makeRequest('::url-c::');
40 |
41 | $this->client->pool($sent);
42 |
43 | foreach ($sent as $request) {
44 | $this->client->assertRequestWasSent($request);
45 | }
46 |
47 | $this->expectException(AssertionFailedError::class);
48 | $this->client->assertRequestWasSent($neverSent);
49 | }
50 |
51 | public function testAssertRequestWasNotSent(): void
52 | {
53 | $neverSent = [$this->makeRequest('::url-a::'), $this->makeRequest('::url-b::')];
54 | $sent = $this->makeRequest('::url-c::');
55 |
56 | $this->client->pool([$sent]);
57 |
58 | foreach ($neverSent as $request) {
59 | $this->client->assertRequestWasNotSent($request);
60 | }
61 |
62 | $this->expectException(AssertionFailedError::class);
63 | $this->client->assertRequestWasNotSent($sent);
64 | }
65 |
66 | public function testCallOnFulfilledCallbackWithResponseForEachRequest(): void
67 | {
68 | $requests = [
69 | $this->makeRequest('::url-a::')->withMeta('index', 0),
70 | $this->makeRequest('::url-b::')->withMeta('index', 1),
71 | $this->makeRequest('::url-c::')->withMeta('index', 2),
72 | ];
73 |
74 | $this->client->pool($requests, static function (Response $response) use (&$requests): void {
75 | self::assertContains($response->getRequest(), $requests);
76 |
77 | // Remove the request from the array so it can't be matched by
78 | // another response as well (the 'index' meta is its array key).
79 | unset($requests[$response->getRequest()->getMeta('index')]);
80 | });
81 |
82 | self::assertEmpty($requests);
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/tests/IntegrationTestCase.php:
--------------------------------------------------------------------------------
1 | skipIfServerNotRunning();
28 |
29 | if (\file_exists(__DIR__ . '/Server/tmp/crawled.json')) {
30 | \unlink(__DIR__ . '/Server/tmp/crawled.json');
31 | }
32 | }
33 |
34 | protected function skipIfServerNotRunning(): void
35 | {
36 | if (false === \file_get_contents("{$this->serverUrl}/ping")) { // false: the request to /ping failed
37 | self::markTestSkipped('Skipping integration test. Server not running.');
38 | }
39 | }
40 |
41 | protected function assertRouteWasCrawled(string $route): void
42 | {
43 | self::assertArrayHasKey($route, $this->getCrawledRoutes()); // the decoded /crawled-routes payload is keyed by route
44 | }
45 |
46 | protected function assertRouteWasCrawledTimes(string $route, int $times): void
47 | {
48 | $hitsByRoute = $this->getCrawledRoutes();
49 | // The route has to be present before its counter can be compared.
50 | self::assertArrayHasKey($route, $hitsByRoute);
51 | self::assertSame($times, $hitsByRoute[$route]);
52 | }
53 |
54 | protected function assertRouteWasNotCrawled(string $route): void
55 | {
56 | self::assertArrayNotHasKey($route, $this->getCrawledRoutes()); // passes when the route has no entry in the decoded payload
57 | }
58 |
59 | private function getCrawledRoutes(): array
60 | {
61 | $payload = \file_get_contents("{$this->serverUrl}/crawled-routes");
62 |
63 | // A failed request (false) or an empty body both count as "no routes".
64 | if (!$payload) {
65 | return [];
66 | }
67 |
68 | return \json_decode(
69 | $payload,
70 | associative: true,
71 | flags: \JSON_THROW_ON_ERROR,
72 | );
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/tests/ItemPipeline/CustomItemProcessorTest.php:
--------------------------------------------------------------------------------
1 | shouldHandle(new TestItem('::foo::', '::bar::')),
31 | );
32 |
33 | $processor = new TestCustomItemProcessor([TestItem2::class]);
34 | self::assertTrue(
35 | $processor->shouldHandle(new TestItem2()),
36 | );
37 | }
38 |
39 | public function testDoesNotHandleItemsNotDefinedInTheChildClass(): void
40 | {
41 | // Configured for TestItem only: a TestItem2 must be rejected.
42 | $onlyTestItem = new TestCustomItemProcessor([TestItem::class]);
43 | $unrelated = new TestItem2();
44 | self::assertFalse($onlyTestItem->shouldHandle($unrelated));
45 |
46 | // And the other way around.
47 | $onlyTestItem2 = new TestCustomItemProcessor([TestItem2::class]);
48 | $unrelated = new TestItem('::foo::', '::bar::');
49 | self::assertFalse($onlyTestItem2->shouldHandle($unrelated));
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/tests/ItemPipeline/ItemTest.php:
--------------------------------------------------------------------------------
1 | assertRunWasStarted(TestSpider::class);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/tests/Scheduling/Timing/FakeClockTest.php:
--------------------------------------------------------------------------------
1 | clock = new FakeClock();
30 | }
31 |
32 | public function testWaitUntilTargetTime(): void
33 | {
34 | $start = $this->clock->now();
35 | $oneSecond = new \DateInterval('PT1S');
36 | $this->clock->sleepUntil($start->add($oneSecond));
37 | $afterFirst = $this->clock->now();
38 | self::assertSame(1, $start->diff($afterFirst)->s);
39 |
40 | $this->clock->sleepUntil($afterFirst->add($oneSecond));
41 | $afterSecond = $this->clock->now();
42 | self::assertSame(1, $afterFirst->diff($afterSecond)->s);
43 | self::assertSame(2, $start->diff($afterSecond)->s);
44 | }
45 |
46 | public function testDontWaitIfTargetDateIsInPast(): void
47 | {
48 | $before = $this->clock->now();
49 | $twoSecondsAgo = $before->sub(new \DateInterval('PT2S'));
50 | $this->clock->sleepUntil($twoSecondsAgo);
51 | $after = $this->clock->now();
52 | self::assertSame(0, $before->diff($after)->s);
53 | }
54 |
55 | public function testRecordTimePassedSleepUntil(): void
56 | {
57 | $clock = new FakeClock();
58 | self::assertSame(0, $clock->timePassed());
59 |
60 | // Interval to advance by => total seconds expected afterwards.
61 | $steps = [['PT5S', 5], ['PT2S', 7], ['PT3S', 10]];
62 |
63 | foreach ($steps as [$interval, $expectedTotal]) {
64 | $target = $clock->now()->add(new \DateInterval($interval));
65 | $clock->sleepUntil($target);
66 |
67 | self::assertSame($expectedTotal, $clock->timePassed());
68 | }
69 | }
70 |
71 | public function testRecordTimePassedSleep(): void
72 | {
73 | $clock = new FakeClock();
74 |
75 | self::assertSame(0, $clock->timePassed());
76 |
77 | // Seconds to sleep => total seconds expected afterwards;
78 | // timePassed() accumulates across consecutive sleep() calls.
79 | $steps = [[5, 5], [2, 7], [3, 10]];
80 |
81 | foreach ($steps as [$seconds, $expectedTotal]) {
82 | $clock->sleep($seconds);
83 | self::assertSame($expectedTotal, $clock->timePassed());
84 | }
85 | }
86 |
87 | protected function createClock(): ClockInterface
88 | {
89 | return new FakeClock(); // a new instance on every call
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/tests/Server/index.php:
--------------------------------------------------------------------------------
1 | add(static function (ServerRequestInterface $request, RequestHandlerInterface $handler): Response {
27 | $ignoredRoutes = ['/ping', '/crawled-routes'];
28 | $path = $request->getUri()->getPath();
29 |
30 | if (\in_array($path, $ignoredRoutes, true)) {
31 | return $handler->handle($request);
32 | }
33 |
34 | if (!\file_exists(LOG_PATH)) {
35 | \file_put_contents(LOG_PATH, '{}');
36 | }
37 |
38 | try {
39 | $logs = \json_decode(
40 | \file_get_contents(LOG_PATH), // @phpstan-ignore argument.type
41 | true,
42 | 512,
43 | \JSON_THROW_ON_ERROR,
44 | );
45 | } catch (JsonException) {
46 | $logs = [];
47 | }
48 |
49 | if (!isset($logs[$path])) {
50 | $logs[$path] = 0;
51 | }
52 |
53 | ++$logs[$path];
54 | \file_put_contents(__DIR__ . '/tmp/crawled.json', \json_encode($logs, \JSON_THROW_ON_ERROR));
55 |
56 | return $handler->handle($request);
57 | });
58 |
59 | $app->get('/ping', static function (Request $request, Response $response, $args) { // one of the two routes the counting middleware above ignores
60 | $response->getBody()->write('pong');
61 |
62 | return $response;
63 | });
64 |
65 | $app->get('/crawled-routes', static function (Request $request, Response $response, $args): Response {
66 | // Only read the log when it exists: file_get_contents() raises an E_WARNING for a missing file.
67 | $stats = \is_file(LOG_PATH) ? \file_get_contents(LOG_PATH) : false;
68 |
69 | if (false === $stats) { // no log yet (or unreadable): report an empty JSON object
70 | $stats = '{}';
71 | }
72 |
73 | $response->getBody()->write($stats);
74 |
75 | return $response->withHeader('Content-Type', 'application/json');
76 | });
77 |
78 | $app->get('/robots', static function (Request $request, Response $response, $args): Response { // serves the robots rules below as plain text
79 | $robots = <<<'PLAIN'
80 | User-agent: *
81 | Disallow: /test2
82 | PLAIN;
83 |
84 | $response->getBody()->write($robots); // nowdoc: written verbatim, no interpolation
85 |
86 | return $response->withAddedHeader('Content-type', 'text/plain');
87 | });
88 |
89 | $app->get('/test1', static function (Request $request, Response $response, $args) { // route with a fixed body; not in the middleware's ignore list, so hits are counted
90 | $response->getBody()->write('Such headline, wow
');
91 |
92 | return $response;
93 | });
94 |
95 | $app->get('/test2', static function (Request $request, Response $response, $args) { // the path the /robots route lists as disallowed
96 | $response->getBody()->write('');
97 |
98 | return $response;
99 | });
100 |
101 | $app->get('/test3', static function (Request $request, Response $response, $args) { // hands back the response without writing a body
102 | return $response;
103 | });
104 |
105 | $app->get('/javascript', static function (Request $request, Response $response, $args) { // responds with the nowdoc markup below
106 | $body = <<<'HTML'
107 | Loading...
108 |
112 | HTML;
113 |
114 | $response->getBody()->write($body);
115 |
116 | return $response;
117 | });
118 |
119 | $app->run(); // called last, after the middleware and every route above have been registered
120 |
--------------------------------------------------------------------------------
/tests/Server/tmp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/tests/Shell/Commands/RunSpiderCommandTest.php:
--------------------------------------------------------------------------------
1 | execute([
39 | 'spider' => TestSpider::class,
40 | ]);
41 |
42 | $commandTester->assertCommandIsSuccessful();
43 | $runner->assertRunWasStarted(TestSpider::class);
44 | }
45 |
46 | public function testPrintsAnErrorIfTheProvidedSpiderClassWasInvalid(): void
47 | {
48 | $tester = new CommandTester(new RunSpiderCommand());
49 |
50 | // '::not-a-spider::' is passed as the "spider" argument.
51 | $tester->execute(['spider' => '::not-a-spider::']);
52 |
53 | // The command has to fail and say why.
54 | self::assertSame(Command::FAILURE, $tester->getStatusCode());
55 | self::assertStringContainsString('Invalid spider:', $tester->getDisplay(true));
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/tests/Shell/Resolver/FakeNamespaceResolverTest.php:
--------------------------------------------------------------------------------
1 | resolveSpiderNamespace($input);
32 |
33 | self::assertSame($input, $result);
34 | }
35 |
36 | /**
37 | * @return iterable<int, array{string}>
38 | */
39 | public static function inputStringProvider(): iterable
40 | {
41 | yield from [ // each data set is a single input string; arbitrary strings and class names are mixed
42 | ['::string-1::'],
43 | [TestSpider::class],
44 | ['::string-2::'],
45 | [RequestSpiderMiddleware::class],
46 | ];
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/tests/Shell/Resolver/StaticNamespaceResolverTest.php:
--------------------------------------------------------------------------------
1 | resolveSpiderNamespace('RoachPHP\Tests\Fixtures\TestSpider');
31 |
32 | self::assertSame('RoachPHP\Tests\Fixtures\TestSpider', $result);
33 | }
34 |
35 | public function testThrowsExceptionIfTheProvidedSpiderClassDoesNotExist(): void
36 | {
37 | $namespaceResolver = new StaticNamespaceResolver();
38 |
39 | // Both expectations are registered before the failing call.
40 | $this->expectException(InvalidSpiderException::class);
41 | $this->expectExceptionMessage('The spider class ::spider-class:: does not exist');
42 | $namespaceResolver->resolveSpiderNamespace('::spider-class::');
43 | }
44 |
45 | public function testThrowsExceptionIfTheProvidedClassIsNotASpider(): void
46 | {
47 | $namespaceResolver = new StaticNamespaceResolver();
48 | $notASpider = RequestSpiderMiddleware::class;
49 | $this->expectException(InvalidSpiderException::class);
50 | $this->expectExceptionMessage(\sprintf('The class %s is not a spider', $notASpider));
51 |
52 | $namespaceResolver->resolveSpiderNamespace($notASpider);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/tests/Spider/Configuration/ArrayLoaderTest.php:
--------------------------------------------------------------------------------
1 | load();
31 |
32 | $expected = new Configuration([], [], [], [], [], 5, 0);
33 | self::assertEquals($expected, $actual);
34 | }
35 |
36 | public function testMergePartialOptions(): void
37 | {
38 | $overrides = [
39 | 'startUrls' => ['::start-url::'],
40 | 'extensions' => [LoggerExtension::class],
41 | 'concurrency' => 2,
42 | ];
43 |
44 | $loaded = (new ArrayLoader($overrides))->load();
45 | // Keys that were not overridden appear as [] / 0 in the expected configuration.
46 | $expected = new Configuration(['::start-url::'], [], [], [], [LoggerExtension::class], 2, 0);
47 | self::assertEquals($expected, $loaded);
48 | }
49 |
50 | public function testMergeAllOptions(): void
51 | {
52 | $overrides = [
53 | 'startUrls' => ['::start-url::'],
54 | 'downloaderMiddleware' => ['::downloader-middleware::'],
55 | 'spiderMiddleware' => ['::spider-middleware::'],
56 | 'itemProcessors' => ['::item-processor::'],
57 | 'extensions' => [LoggerExtension::class],
58 | 'concurrency' => 2,
59 | 'requestDelay' => 2,
60 | ];
61 |
62 | $loaded = (new ArrayLoader($overrides))->load();
63 | // Positional order below: item processors come before spider middleware.
64 | $expected = new Configuration(
65 | ['::start-url::'],
66 | ['::downloader-middleware::'], // @phpstan-ignore argument.type
67 | ['::item-processor::'], // @phpstan-ignore argument.type
68 | ['::spider-middleware::'], // @phpstan-ignore argument.type
69 | [LoggerExtension::class],
70 | 2,
71 | 2,
72 | );
73 | self::assertEquals($expected, $loaded);
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/tests/Spider/ParseResultTest.php:
--------------------------------------------------------------------------------
1 | yield from []);
30 |
31 | $result->apply(
32 | static fn (Request $request) => self::assertEquals('::url::', (string) $request->getUri()),
33 | static fn () => self::fail('Should not have been called'),
34 | );
35 | }
36 |
37 | public function testPassesItemToCallbackIfResultIsItem(): void
38 | {
39 | $onRequest = static fn () => self::fail('Should not have been called');
40 | $onItem = static fn (ItemInterface $item) => self::assertSame('::value::', $item->get('::key::'));
41 |
42 | // For an item result only the second callback may run.
43 | $result = ParseResult::item(['::key::' => '::value::']);
44 | $result->apply($onRequest, $onItem);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/tests/Spider/SpiderTestCase.php:
--------------------------------------------------------------------------------
1 | assertRouteWasCrawledTimes('/test1', 1);
47 | $this->assertRouteWasCrawledTimes('/test2', 1);
48 | }
49 |
50 | public function testOverrideInitialRequests(): void
51 | {
52 | $spider = new class() extends BasicSpider {
53 | // Don't want logging in this test
54 | public array $extensions = [];
55 |
56 | public function parse(Response $response): \Generator
57 | {
58 | yield from []; // yields nothing for the response it receives
59 | }
60 |
61 | protected function initialRequests(): array
62 | {
63 | return [new Request('GET', 'http://localhost:8000/test1', [$this, 'parse'])];
64 | }
65 | };
66 |
67 | Roach::startSpider($spider::class); // started by class name; the instance above only provides that name
68 |
69 | $this->assertRouteWasCrawledTimes('/test1', 1); // the single request from initialRequests() hit /test1 once
70 | }
71 |
72 | public function testCanAccessRunContextFromWithinSpider(): void
73 | {
74 | $spider = new class() extends BasicSpider {
75 | public array $extensions = [];
76 |
77 | public function parse(Response $response): \Generator
78 | {
79 | yield from []; // yields nothing for the response it receives
80 | }
81 |
82 | protected function initialRequests(): array
83 | {
84 | return [
85 | new Request(
86 | 'GET',
87 | // Use the 'initialRequest' entry of the run context as the URL: the
88 | // crawl below can only hit /test1 if the context is readable here.
89 | $this->context['initialRequest'],
90 | [$this, 'parse'],
91 | ), ];
92 | }
93 | };
94 |
95 | Roach::startSpider($spider::class, context: ['initialRequest' => 'http://localhost:8000/test1']);
96 |
97 | $this->assertRouteWasCrawledTimes('/test1', 1);
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/tests/Support/DroppableTestCase.php:
--------------------------------------------------------------------------------
1 | createDroppable();
27 |
28 | self::assertFalse($droppable->wasDropped());
29 |
30 | $droppable = $droppable->drop('::reason::');
31 |
32 | self::assertTrue($droppable->wasDropped());
33 | }
34 |
35 | public function testGetReason(): void
36 | {
37 | // Only the value returned by drop() is inspected afterwards.
38 | $dropped = $this->createDroppable()->drop('::reason::');
39 |
40 | // The reason handed to drop() has to come back unchanged.
41 | self::assertSame('::reason::', $dropped->getDropReason());
42 | }
43 |
44 | abstract protected function createDroppable(): DroppableInterface; // supplies the droppable each test above calls drop() on
45 | }
46 |
--------------------------------------------------------------------------------
/tests/Testing/FakeLoggerTest.php:
--------------------------------------------------------------------------------
1 | $context
30 | */
31 | public function testCheckIfSpecificMessageWasLoggedAtLevel(string $level, string $message, array $context): void
32 | {
33 | $fakeLogger = new FakeLogger();
34 | // Nothing has been logged on the fresh logger yet.
35 | self::assertFalse($fakeLogger->messageWasLogged($level, $message));
36 |
37 | // $level doubles as the name of the logger method to call.
38 | $fakeLogger->$level($message, $context);
39 | self::assertTrue($fakeLogger->messageWasLogged($level, $message));
40 | }
41 |
42 | /**
43 | * @dataProvider logMessageProvider
44 | *
45 | * @param array<int, string> $context
46 | */
47 | public function testCheckIfMessageWasLoggedWithContext(string $level, string $message, array $context): void
48 | {
49 | $fakeLogger = new FakeLogger();
50 | // Same level and message but an empty context: must not count as a match.
51 | $fakeLogger->$level($message, []);
52 | self::assertFalse($fakeLogger->messageWasLogged($level, $message, $context));
53 |
54 | $fakeLogger->$level($message, $context);
55 | self::assertTrue($fakeLogger->messageWasLogged($level, $message, $context));
56 | }
57 |
58 | /**
59 | * @return iterable<string, array{string, string, array<int, string>}>
60 | */
61 | public static function logMessageProvider(): iterable
62 | {
63 | yield from [ // one data set per level, keyed by level name: [level, message, context]
64 | 'debug' => [
65 | 'debug', '::debug-message::', ['::debug-context::'],
66 | ],
67 | 'info' => [
68 | 'info', '::info-message::', ['::info-context::'],
69 | ],
70 | 'notice' => [
71 | 'notice', '::notice-message::', ['::notice-context::'],
72 | ],
73 | 'warning' => [
74 | 'warning', '::warning-message::', ['::warning-context::'],
75 | ],
76 | 'error' => [
77 | 'error', '::error-message::', ['::error-context::'],
78 | ],
79 | 'critical' => [
80 | 'critical', '::critical-message::', ['::critical-context::'],
81 | ],
82 | 'alert' => [
83 | 'alert', '::alert-message::', ['::alert-context::'],
84 | ],
85 | 'emergency' => [
86 | 'emergency', '::emergency-message::', ['::emergency-context::'],
87 | ],
88 | ];
89 | }
90 | }
91 |
--------------------------------------------------------------------------------