├── .editorconfig
├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── test.yaml
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── UPGRADING.md
├── composer.json
├── phpstan-baseline.neon
├── phpstan.neon
├── phpunit.xml.dist
├── pint.json
├── playground.php
├── rector.php
├── src
    ├── Core.php
    ├── DataTransferObjects
    │   └── FeedEntry.php
    ├── PHPScraper.php
    ├── UsesBrowserKit.php
    ├── UsesContent.php
    ├── UsesFeeds.php
    ├── UsesFileParsers.php
    ├── UsesUrls.php
    └── UsesXPathFilters.php
└── tests
    ├── BaseHrefTest.php
    ├── CanonicalTest.php
    ├── CoreTest.php
    ├── CustomSelectorTest.php
    ├── DownloadTest.php
    ├── FeedRssTest.php
    ├── FeedSearchIndexTest.php
    ├── FeedSitemapTest.php
    ├── HeadingTest.php
    ├── ImageTest.php
    ├── KeywordTest.php
    ├── LinkTest.php
    ├── ListsTest.php
    ├── MetaAuthorTest.php
    ├── MetaCharsetTest.php
    ├── MetaContentTypeTest.php
    ├── MetaCsrfTokenTest.php
    ├── MetaDescriptionTest.php
    ├── MetaImageTest.php
    ├── MetaKeywordsTest.php
    ├── MetaViewportTest.php
    ├── NavigationTest.php
    ├── NotFoundTest.php
    ├── OpenGraphTest.php
    ├── OutlineTest.php
    ├── ParagraphsTest.php
    ├── ParserCsvTest.php
    ├── ParserJsonTest.php
    ├── ParserXmlTest.php
    ├── RedirectTest.php
    ├── TitleTest.php
    ├── TwitterCardTest.php
    └── UrlTest.php


/.editorconfig:
--------------------------------------------------------------------------------
 1 | ; This file is for unifying the coding style for different editors and IDEs.
 2 | ; More information at http://editorconfig.org
 3 | 
 4 | root = true
 5 | 
 6 | [*]
 7 | charset = utf-8
 8 | indent_size = 4
 9 | indent_style = space
10 | end_of_line = lf
11 | insert_final_newline = true
12 | trim_trailing_whitespace = true
13 | 
14 | [*.md]
15 | trim_trailing_whitespace = false
16 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: spekulatius
2 | custom: https://phpscraper.de/misc/sponsors.html


--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
 1 | on: [pull_request]
 2 | 
 3 | jobs:
 4 |   phpunit:
 5 |     name: PHPUnit
 6 |     runs-on: ubuntu-latest
 7 |     strategy:
 8 |       matrix:
 9 |         php-version: ['8.1', '8.2', '8.3']
10 |     steps:
11 |       - uses: actions/checkout@v4
12 |       - uses: shivammathur/setup-php@v2
13 |         with:
14 |           php-version: ${{ matrix.php-version }}
15 |           coverage: none
16 |           extensions: intl curl
17 |       - run: composer update --no-interaction --no-progress --prefer-dist --ansi
18 |       - run: composer test:unit
19 | 
20 |   phpstan:
21 |     name: PHPStan
22 |     runs-on: ubuntu-latest
23 |     steps:
24 |       - name: Checkout
25 |         uses: actions/checkout@v4
26 | 
27 |       - name: Install PHP
28 |         uses: shivammathur/setup-php@v2
29 |         with:
30 |           php-version: '8.1'
31 |           coverage: none
32 |         env:
33 |           COMPOSER_TOKEN: ${{ github.token }}
34 |           update: true
35 | 
36 |       - name: Install dependencies
37 |         run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader
38 | 
39 |       - name: PHPStan tests
40 |         run: composer test:types
41 | 
42 |   rector:
43 |     name: Rector
44 |     runs-on: ubuntu-latest
45 |     steps:
46 |       - name: Checkout
47 |         uses: actions/checkout@v4
48 | 
49 |       - name: Install PHP
50 |         uses: shivammathur/setup-php@v2
51 |         with:
52 |           php-version: '8.1'
53 |           coverage: none
54 |         env:
55 |           COMPOSER_TOKEN: ${{ github.token }}
56 |           update: true
57 | 
58 |       - name: Install dependencies
59 |         run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader
60 | 
61 |       - name: Rector tests
62 |         run: composer test:refactor
63 | 
64 |   pint:
65 |     name: Pint
66 |     runs-on: ubuntu-latest
67 |     steps:
68 |       - name: Checkout
69 |         uses: actions/checkout@v4
70 | 
71 |       - name: Install PHP
72 |         uses: shivammathur/setup-php@v2
73 |         with:
74 |           php-version: '8.1'
75 |           coverage: none
76 |           tools: cs2pr
77 |         env:
78 |           COMPOSER_TOKEN: ${{ github.token }}
79 |           update: true
80 | 
81 |       - name: Install dependencies
82 |         run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader
83 | 
84 |       - name: Run Pint
85 |         run: composer exec -- pint --test --format=checkstyle | cs2pr
86 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | vendor
 2 | .idea
 3 | .php_cs.cache
 4 | .phpunit.result.cache
 5 | composer.lock
 6 | yarn-error.log
 7 | websites/.yarn/
 8 | websites/.yarnrc.yml
 9 | .vscode
10 | .history
11 | .notes
12 | .tmp/


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # PHPScraper CHANGELOG
  2 | 
  3 | All notable changes to this project will be documented in this file.
  4 | 
  5 | Parts regarding the [documentation website](https://phpscraper.de), the [test pages](https://github.com/spekulatius/phpscraper-test-pages) and individual documentation changes are omitted for better readability.
  6 | 
  7 | This project adheres to [Semantic Versioning](http://semver.org/).
  8 | 
  9 | ## 3.0.0 (2024-04-09)
 10 | 
 11 | - [#204](https://github.com/spekulatius/PHPScraper/pull/204): Upgrading Symfony dependencies to allow ^7.0
 12 | - [#201](https://github.com/spekulatius/PHPScraper/pull/201): Pint
 13 | - [#200](https://github.com/spekulatius/PHPScraper/pull/200): Upgrade from league/uri 6.x to league/uri 7.x, replacing deprecated function use with new recommended ones
 14 | - [#199](https://github.com/spekulatius/PHPScraper/pull/199): Add CI job names
 15 | - [#196](https://github.com/spekulatius/PHPScraper/pull/196): Upgrading repo tools
 16 | - [#195](https://github.com/spekulatius/PHPScraper/pull/195): Add Pint
 17 | - [#194](https://github.com/spekulatius/PHPScraper/pull/194): Fix HTTPClient config
 18 | - [#192](https://github.com/spekulatius/PHPScraper/pull/192): Fix few problems reported by PHPStan
 19 | - [#190](https://github.com/spekulatius/PHPScraper/pull/190): Fix typos and a critical error
 20 | - [#188](https://github.com/spekulatius/PHPScraper/pull/188): Move phpstan to local temp path to ensure Windows users can run it
 21 | 
 22 | ## 2.0.0 (2023-06-01)
 23 | 
 24 | - [#187](https://github.com/spekulatius/PHPScraper/issues/187): Prepare v2: Improve typing, bringing PHPStan to --level=9. For details check the [UPGRADING guide](https://github.com/spekulatius/PHPScraper/blob/master/UPGRADING.md#from-1x-to-2x).
 25 | - [#188](https://github.com/spekulatius/PHPScraper/issues/188): Support PHPStan for Windows Users
 26 | - [#185](https://github.com/spekulatius/PHPScraper/issues/185): Adding PHP 8.3 to test pipeline
 27 | - [#184](https://github.com/spekulatius/PHPScraper/issues/184): Adding PHPStan GitHub Action. Thank you @nadar!
 28 | - [#183](https://github.com/spekulatius/PHPScraper/issues/183): Switch from Goutte to BrowserKit
 29 | - [#182](https://github.com/spekulatius/PHPScraper/issues/182): Drop PHP 7.3 and 7.4
 30 | - [#174](https://github.com/spekulatius/PHPScraper/issues/174): Fix local testing
 31 | - [#173](https://github.com/spekulatius/PHPScraper/issues/173): Fix README example
 32 | - [#171](https://github.com/spekulatius/PHPScraper/issues/171): Various PHPStan improvements
 33 | - [#169](https://github.com/spekulatius/PHPScraper/issues/169): Adding `<meta charset=...>` extraction
 34 | 
 35 | ## 1.0.2 (2022-12-15)
 36 | 
 37 | - [#167](https://github.com/spekulatius/PHPScraper/issues/167): Updating CHANGELOG.md
 38 | - [#166](https://github.com/spekulatius/PHPScraper/issues/166): Minor tidy ups in comments
 39 | - [#165](https://github.com/spekulatius/PHPScraper/issues/165): Adding PHP 8.2 to test workflow
 40 | - [#160](https://github.com/spekulatius/PHPScraper/issues/160): Allow complete interface for HttpClient instead of only one class.
 41 | 
 42 | ## 1.0.1 (2022-12-02)
 43 | 
 44 | - [#156](https://github.com/spekulatius/PHPScraper/issues/156): Tidy up: Make file naming more intuitive and fix comments
 45 | - [#154](https://github.com/spekulatius/PHPScraper/issues/154): Expose GoutteClient as an accessible property
 46 | 
 47 | ## 1.0.0 (2022-11-24)
 48 | 
 49 | - [#151](https://github.com/spekulatius/PHPScraper/issues/151): Migrate website into separate repo.
 50 | - [#150](https://github.com/spekulatius/PHPScraper/issues/150): Switch namespaces. See [UPGRADING](https://github.com/spekulatius/PHPScraper/blob/master/UPGRADING.md) for more details.
 51 | - [#147](https://github.com/spekulatius/PHPScraper/issues/147): Prepare for v1.0
 52 | 
 53 | ## 0.13.0 (2022-11-21)
 54 | 
 55 | - [#146](https://github.com/spekulatius/PHPScraper/issues/146): Implement plain text file/URL parsing.
 56 | 
 57 | ## 0.12.0 (2022-11-10)
 58 | 
 59 | - [#142](https://github.com/spekulatius/PHPScraper/issues/142): Implement feed parsing.
 60 | - [#145](https://github.com/spekulatius/PHPScraper/issues/145): Re-enable previously deactivated tests
 61 | 
 62 | ## 0.11.0 (2022-11-01)
 63 | 
 64 | - [#137](https://github.com/spekulatius/PHPScraper/issues/137): Fix download bug and improve testing
 65 | 
 66 | ## 0.10.0 (2022-11-01)
 67 | 
 68 | - [#136](https://github.com/spekulatius/PHPScraper/issues/136): Expand set of URL-related methods
 69 | 
 70 | ## 0.9.0 (2022-10-28)
 71 | 
 72 | - [#79](https://github.com/spekulatius/PHPScraper/issues/79): Replace URL lib. Sub-domain support dropped.
 73 | 
 74 | ## 0.8.0 (2022-10-27)
 75 | 
 76 | - Maintenance: [Split Core lib](https://github.com/spekulatius/PHPScraper/commit/2ca34caae75e634442daf9c4f886060e41ba8911) for better understandability.
 77 | 
 78 | ## 0.7.0 (2022-10-14)
 79 | 
 80 | - [Generalize Configuration API](https://github.com/spekulatius/PHPScraper/commit/e19baeb19658fbc4846c24eb597876f54c6012a3) for better usability.
 81 | - [Proxy Support](https://github.com/spekulatius/PHPScraper/commit/326bdff4430a326bdb08f6af8452f148250c7784)
 82 | 
 83 | ## 0.6.0 (2022-07-14)
 84 | 
 85 | - [#77](https://github.com/spekulatius/PHPScraper/issues/77): Upgrade to allow Symfony 6
 86 | 
 87 | ## 0.5.0 (2022-08-16)
 88 | 
 89 | - Add [`rel`-interpretation](https://github.com/spekulatius/PHPScraper/commit/47d6f8a0f6adf49de31b691b98ea472a4a382b9f) to link methods.
 90 | - Add support to BYO-HTML: [`setContent`](https://github.com/spekulatius/PHPScraper/commit/9c50d145f280732e26ecf83c8d2978c07466dfcd).
 91 | - Improve typing support
 92 | - [Add Lists](https://github.com/spekulatius/PHPScraper/commit/0aac52853ab394d9f38b004e401c5fbec328e017)
 93 | 
 94 | ## 0.4.0 (2022-08-16)
 95 | 
 96 | - Add [keyword scoring](https://github.com/spekulatius/PHPScraper/commit/e91bce24e4b53d9a1ef19b3f1ded97627eb2076e) in.
 97 | 
 98 | ## 0.3.0 (2022-06-20)
 99 | 
100 | - Add [keyword extraction](https://github.com/spekulatius/PHPScraper/commit/9d20004ead5b9e8350a03fa6fc4de1477b19bd4c) lib in.
101 | 
102 | ## 0.2.0 (2022-06-20)
103 | 
104 | - Adding [support for `internalLinks` & `externalLinks`](https://github.com/spekulatius/PHPScraper/commit/193f422f206b7a10586463fff4a7f9dcc9e896f9).
105 | 
106 | ## 0.1.0 (2022-05-04)
107 | 
108 | - Start testing using PHPUnit.
109 | - Drop keeping own copy of current URL.
110 | - Initial commit with basic functionality.


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribution Guide
 2 | 
 3 | This page contains guidelines for contributing to this project. Please
 4 | review these guidelines before submitting any pull requests.
 5 | 
 6 | ## Pull Requests
 7 | 
 8 | The pull request process differs for new features and bugs. Before sending a
 9 | pull request for a new feature, you should first create an issue with
10 | `[Proposal]` in the title. The proposal should describe the new feature, as well
11 | as implementation ideas. The proposal will then be reviewed and either approved
12 | or denied. Once a proposal is approved, a pull request may be created
13 | implementing the new feature. Pull requests which do not follow this guideline
14 | will be closed immediately.
15 | 
16 | Pull requests for bugs may be sent without creating any proposal issue. If you
17 | believe that you know of a solution for a bug that has been filed on GitHub,
18 | please leave a comment detailing your proposed fix.
19 | 
20 | ### Feature Requests
21 | 
22 | If you have an idea for a new feature you would like to see added, you may
23 | create an issue on GitHub with `[Request]` in the title. The feature request
24 | will then be reviewed.
25 | 
26 | ## Coding Guidelines
27 | 
28 | This project follows the PSR-0, PSR-1, and PSR-2 coding standards.
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <a href="https://github.com/spekulatius/PHPScraper">
  3 |     <picture style="width: 100%;" alt="PHP Scraper: a web utility for PHP">
  4 |       <source srcset="https://github.com/spekulatius/phpscraper-docs/blob/master/.vuepress/public/logo-dark.png" media="(prefers-color-scheme:dark)">
  5 |       <img src="https://github.com/spekulatius/phpscraper-docs/blob/master/.vuepress/public/logo-light.png" alt="PHP Scraper: a web utility for PHP">
  6 |     </picture>
  7 |   </a>
  8 |   <p align="center">
  9 |     <a href="https://github.com/spekulatius/PHPScraper/actions/workflows/test.yaml">
 10 |       <img src="https://github.com/spekulatius/PHPScraper/actions/workflows/test.yaml/badge.svg" alt="Unit Tests">
 11 |     </a>
 12 |     <a href="https://packagist.org/packages/spekulatius/PHPScraper">
 13 |       <img src="https://poser.pugx.org/spekulatius/PHPScraper/d/total.svg" alt="Total Downloads">
 14 |     </a>
 15 |     <a href="https://packagist.org/packages/spekulatius/PHPScraper">
 16 |       <img src="https://poser.pugx.org/spekulatius/PHPScraper/v/stable.svg" alt="Latest Version">
 17 |     </a>
 18 |     <a href="https://packagist.org/packages/spekulatius/PHPScraper">
 19 |       <img src="https://poser.pugx.org/spekulatius/PHPScraper/license.svg" alt="License">
 20 |     </a>
 21 |   </p>
 22 |   <p align="center">
 23 |     <strong>For full documentation, visit <a href="https://phpscraper.de">phpscraper.de</a></strong>.
 24 |   </p>
 25 | </p>
 26 | 
 27 | PHPScraper is a versatile web-utility for PHP. Its primary objective is to streamline the process of extracting information from websites, allowing you to focus on accomplishing tasks without getting caught up in the complexities of selectors, data structure preparation, and conversion.
 28 | 
 29 | Under the hood, it uses
 30 | 
 31 | - [BrowserKit](https://symfony.com/doc/current/components/browser_kit.html) (formerly [Goutte](https://github.com/FriendsOfPHP/Goutte)) to access the web
 32 | - [League/URI](https://github.com/thephpleague/uri) to process URLs
 33 | - [donatello-za/rake-php-plus](https://github.com/donatello-za/rake-php-plus) to extract and analyze keywords
 34 | 
 35 | See [composer.json](https://github.com/spekulatius/PHPScraper/blob/master/composer.json) for more details.
 36 | 
 37 | 
 38 | :timer_clock: PHPScraper in 5 Minutes explained
 39 | -----------------------------------------------
 40 | 
 41 | Here are a few impressions of the way the library works. More examples are on the [project website](https://phpscraper.de/examples/scrape-website-title.html).
 42 | 
 43 | ### Basics: Flexible Calling as an Attribute or Method
 44 | 
 45 | All scraping functionality can be accessed either as a function call or a property call. For example, the title can be accessed in two ways:
 46 | 
 47 | ```php
 48 | // Prep
 49 | $web = new \Spekulatius\PHPScraper\PHPScraper;
 50 | $web->go('https://google.com');
 51 | 
 52 | // Returns "Google"
 53 | echo $web->title;
 54 | 
 55 | // Also returns "Google"
 56 | echo $web->title();
 57 | ```
 58 | 
 59 | ### :battery: Batteries included: Meta data, Links, Images, Headings, Content, Keywords, ...
 60 | 
 61 | Many common use cases are covered already. You can find prepared extractors for various HTML tags, including interesting attributes. You can filter and combine these to your needs. In some cases there is an option to get a simple or detailed version, here in the case of `linksWithDetails`:
 62 | 
 63 | ```PHP
 64 | $web = new \Spekulatius\PHPScraper\PHPScraper;
 65 | 
 66 | // Contains:
 67 | // <a href="https://placekitten.com/456/500" rel="ugc">
 68 | //   <img src="https://placekitten.com/456/400">
 69 | //   <img src="https://placekitten.com/456/300">
 70 | // </a>
 71 | $web->go('https://test-pages.phpscraper.de/links/image-urls.html');
 72 | 
 73 | // Get the first link on the page and print the result
 74 | print_r($web->linksWithDetails[0]);
 75 | // [
 76 | //     'url' => 'https://placekitten.com/456/500',
 77 | //     'protocol' => 'https',
 78 | //     'text' => '',
 79 | //     'title' => null,
 80 | //     'target' => null,
 81 | //     'rel' => 'ugc',
 82 | //     'image' => [
 83 | //         'https://placekitten.com/456/400',
 84 | //         'https://placekitten.com/456/300'
 85 | //     ],
 86 | //     'isNofollow' => false,
 87 | //     'isUGC' => true,
 88 | //     'isSponsored' => false,
 89 | //     'isMe' => false,
 90 | //     'isNoopener' => false,
 91 | //     'isNoreferrer' => false,
 92 | // ]
 93 | ```
 94 | 
 95 | If there aren't any matching elements (here links) on the page, an empty array will be returned. If a method normally returns a string it might return `null`. Details such as `follow_redirects`, etc. are optional configuration parameters (see below).
 96 | 
 97 | Most of the DOM should be covered using these methods:
 98 | 
 99 | - several [meta-tags](https://phpscraper.de/examples/scrape-meta-tags.html) and other [`<head>`-information](https://phpscraper.de/examples/scrape-header-tags.html)
100 | - [Social-Media information](https://phpscraper.de/examples/scrape-social-media-meta-tags.html) like Twitter Card and Facebook Open Graph
101 | - Content: [Headings](https://phpscraper.de/examples/headings.html), [Outline](https://phpscraper.de/examples/outline.html), [Texts](https://phpscraper.de/examples/paragraphs.html) and [Lists](https://phpscraper.de/examples/lists.html)
102 | - [Images](https://phpscraper.de/examples/scrape-images.html)
103 | - [Links](https://phpscraper.de/examples/scrape-links.html)
104 | - [Keywords](https://phpscraper.de/examples/extract-keywords.html)
105 | 
106 |  **A full list of methods with example code can be found on [phpscraper.de](https://phpscraper.de). Further examples are in the [tests](https://github.com/spekulatius/PHPScraper/tree/master/tests).**
107 | 
108 | 
109 | ### Download Files
110 | 
111 | Besides processing the content on the page itself, you can download files using `fetchAsset`:
112 | 
113 | ```php
114 | // Absolute URL
115 | $csvString = $web->fetchAsset('https://test-pages.phpscraper.de/test.csv');
116 | 
117 | // Relative URL after navigation
118 | $csvString = $web
119 |   ->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html')
120 |   ->fetchAsset('/test.csv');
121 | ```
122 | 
123 | You will only need to write the content into a file or cloud storage.
124 | 
125 | 
126 | ### Process the RSS feeds, `sitemap.xml`, etc.
127 | 
128 | PHPScraper can assist in collecting feeds such as [RSS feeds, `sitemap.xml`-entries and static search indexes](https://phpscraper.de/examples/scrape-feeds.html). This can be useful when deciding on the next page to crawl or building up a list of pages on a website.
129 | 
130 | Here we are processing the sitemap into a set of [`FeedEntry`-DTOs](https://github.com/spekulatius/PHPScraper/blob/master/src/DataTransferObjects/FeedEntry.php):
131 | 
132 | ```php
133 | (new \Spekulatius\PHPScraper\PHPScraper)
134 |     ->go('https://phpscraper.de')
135 |     ->sitemap
136 | 
137 | // array(131) {
138 | //   [0]=>
139 | //   object(Spekulatius\PHPScraper\DataTransferObjects\FeedEntry)#165 (3) {
140 | //     ["title"]=>
141 | //     string(0) ""
142 | //     ["description"]=>
143 | //     string(0) ""
144 | //     ["link"]=>
145 | //     string(22) "https://phpscraper.de/"
146 | //   }
147 | //   [1]=>
148 | // ...
149 | ```
150 | 
151 | Whenever post-processing is applied, you can fall back to the underlying `*Raw`-methods.
152 | 
153 | 
154 | ### Process CSV-, XML- and JSON files and URLs
155 | 
156 | PHPScraper comes out of the box with file / URL processing methods for CSV-, XML- and JSON:
157 | 
158 | - `parseJson`
159 | - `parseXml`
160 | - `parseCsv`
161 | - `parseCsvWithHeader` (generates an associative array using the first row)
162 | 
163 | Each method can process both strings as well as URLs:
164 | 
165 | ```php
166 | // Parse JSON into array:
167 | $json = $web->parseJson('[{"title": "PHP Scraper: a web utility for PHP", "url": "https://phpscraper.de"}]');
168 | // [
169 | //     'title' => 'PHP Scraper: a web utility for PHP',
170 | //     'url' => 'https://phpscraper.de'
171 | // ]
172 | 
173 | // Fetch and parse CSV into a simple array:
174 | $csv = $web->parseCsv('https://test-pages.phpscraper.de/test.csv');
175 | // [
176 | //     ['date', 'value'],
177 | //     ['1945-02-06', 4.20],
178 | //     ['1952-03-11', 42],
179 | // ]
180 | 
181 | // Fetch and parse CSV with first row as header into an associative array structure:
182 | $csv = $web->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv');
183 | // [
184 | //     ['date' => '1945-02-06', 'value' => 4.20],
185 | //     ['date' => '1952-03-11', 'value' => 42],
186 | // ]
187 | ```
188 | 
189 | Additional CSV parsing parameters such as separator, enclosure and escape are possible.
190 | 
191 | 
192 | ### There is more!
193 | 
194 | There are plenty of examples on the [PHPScraper website](https://phpscraper.de) and in the [tests](https://github.com/spekulatius/PHPScraper/tree/master/tests).
195 | 
196 | Check the [`playground.php`](https://github.com/spekulatius/PHPScraper/blob/master/playground.php) if you prefer learning by doing. You get it up and running with:
197 | 
198 | ```bash
199 | $ git clone git@github.com:spekulatius/PHPScraper.git && cd PHPScraper && composer update
200 | ```
201 | 
202 | :muscle: Roadmap
203 | ----------------
204 | 
205 | The future development is organized into [milestones](https://github.com/spekulatius/PHPScraper/milestones?direction=asc&sort=title). Releases follow [semver](https://semver.org/).
206 | 
207 | ### v1: [Building the first stable version](https://github.com/spekulatius/PHPScraper/milestone/4?closed=1)
208 | 
209 | - Improve documentation and examples.
210 | - Organize code better (move websites into separate repos, etc.)
211 | - Add support for feeds and some typical file types.
212 | 
213 | ### v2: Service Upgrade:
214 | 
215 | - Switch from Goutte to [Symfony BrowserKit](https://symfony.com/doc/current/components/browser_kit.html). Goutte has been archived.
216 | 
217 | ### v3: [Expand the functionality and cover more 'types'](https://github.com/spekulatius/PHPScraper/milestone/5)
218 | 
219 | - Expand to parse a wider range of types, elements, embeds, etc.
220 | - Improve performance with caching and concurrent fetching of assets
221 | - Minor improvements for parsing methods
222 | 
223 | ### v4: [Expand to provide more guidance on building custom scrapers on top of PHPScraper](https://github.com/spekulatius/PHPScraper/milestone/6)
224 | 
225 | TBC.
226 | 
227 | 
228 | :heart_eyes: Sponsors
229 | ---------------------
230 | 
231 | PHPScraper is sponsored by:
232 | 
233 | <a href="https://bringyourownideas.com" target="_blank" rel="noopener noreferrer"><img src="https://bringyourownideas.com/images/byoi-logo.jpg" height="100px"></a>
234 | 
235 | With your support, PHPScraper can become the *PHP swiss army knife for the web*. If you find PHPScraper useful to your work, please consider a [sponsorship](https://github.com/sponsors/spekulatius) or [donation](https://www.buymeacoffee.com/spekulatius). Thank you :muscle:
236 | 
237 | 
238 | :gear: Configuration (optional)
239 | -------------------------------
240 | 
241 | If needed, you can use the following configuration options:
242 | 
243 | ### User Agent
244 | 
245 | You can set the browser agent using `setConfig`:
246 | 
247 | ```php
248 | $web->setConfig([
249 |   'agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0'
250 | ]);
251 | ```
252 | 
253 | It defaults to `Mozilla/5.0 (compatible; PHP Scraper/1.x; +https://phpscraper.de)`.
254 | 
255 | ### Proxy Support
256 | 
257 | You can configure proxy support with `setConfig`:
258 | 
259 | ```php
260 | $web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
261 | ```
262 | 
263 | ### Timeout
264 | 
265 | You can set the `timeout` using `setConfig`:
266 | 
267 | ```php
268 | $web->setConfig(['timeout' => 15]);
269 | ```
270 | 
271 | Setting the timeout to zero will disable it.
272 | 
273 | ### Disabling SSL
274 | 
275 | While not recommended, it might be required to disable SSL checks. You can do so using:
276 | 
277 | ```php
278 | $web->setConfig(['disable_ssl' => true]);
279 | ```
280 | 
281 | You can call `setConfig` multiple times. It stores the config and merges it with previous settings. This should be kept in mind in the unlikely use-case when unsetting values.
282 | 
283 | 
284 | :rocket: Installation with Composer
285 | -----------------------------------
286 | 
287 | ```bash
288 | composer require spekulatius/phpscraper
289 | ```
290 | 
291 | After the installation, the package will be picked up by the Composer autoloader. If you are using a common PHP application or framework such as Laravel or Symfony you can start scraping now :rocket:
292 | 
293 | If not, or if you are building a standalone scraper, please include the autoloader in `vendor/` at the top of your file:
294 | 
295 | ```php
296 | <?php
297 | 
298 | require __DIR__ . '/vendor/autoload.php';
299 | 
300 | // ...
301 | ```
302 | 
303 | You can now use any of the examples on the documentation website or from the [`tests/`-folder](https://github.com/spekulatius/PHPScraper/tree/master/tests).
304 | 
305 | Please consider supporting PHPScraper with a star or [sponsorship](https://github.com/sponsors/spekulatius):
306 | 
307 | ```bash
308 | composer thanks
309 | ```
310 | 
311 | Thank you :muscle:
312 | 
313 | 
314 | :white_check_mark: Testing
315 | --------------------------
316 | 
317 | The library comes with a PHPUnit test suite. To run the tests, run the following command from the project folder:
318 | 
319 | ```bash
320 | composer test
321 | ```
322 | 
323 | You can find the tests [here](https://github.com/spekulatius/PHPScraper/tree/master/tests). The test pages are [publicly available](https://github.com/spekulatius/phpscraper-test-pages).
324 | 
325 | ## MISC: [Issues](https://github.com/spekulatius/PHPScraper/issues), [Ideas](https://github.com/spekulatius/PHPScraper/milestones), [Contributing](https://github.com/spekulatius/PHPScraper/blob/master/CONTRIBUTING.md), [CHANGELOG](https://github.com/spekulatius/PHPScraper/blob/master/CHANGELOG.md), [UPGRADING](https://github.com/spekulatius/PHPScraper/blob/master/UPGRADING.md), [LICENSE](https://github.com/spekulatius/PHPScraper/blob/master/LICENSE.md)
326 | 


--------------------------------------------------------------------------------
/UPGRADING.md:
--------------------------------------------------------------------------------
 1 | # Upgrading PHPScraper
 2 | 
 3 | This document will help you upgrade PHPScraper from an earlier version to a later one.
 4 | 
 5 | ## From `0.x` to `1.x`
 6 | 
 7 | - The namespace has been adjusted from `\spekulatius` to `\Spekulatius\PHPScraper`. Any `use` statements or other class references need to be updated accordingly:
 8 | 
 9 |   ```diff
10 |   -use spekulatius\phpscraper;
11 |   +use Spekulatius\PHPScraper\PHPScraper;
12 |   ```
13 | 
14 |   or
15 | 
16 |   ```diff
17 |   -$web = new \spekulatius\phpscraper;
18 |   +$web = new \Spekulatius\PHPScraper\PHPScraper;
19 |   ```
20 | 
21 | ## From `1.x` to `2.x`
22 | 
23 | - Support for PHP 7.x was dropped. PHP 8.0 is the minimum for v2.
24 | - The publicly accessible function `parseXML` was renamed to `parseXml`.
25 | - The codebase has been analysed with PHPStan and hardened manually. Due to this, some return types have changed. See [v2 pull request](https://github.com/spekulatius/PHPScraper/pull/187/files) for details.
26 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "spekulatius/phpscraper",
 3 |     "description": "PHPScraper, built with simplicity in mind. See tests/ for more examples.",
 4 |     "keywords": [
 5 |         "PHP scraper",
 6 |         "PHP scraping",
 7 |         "PHP crawler",
 8 |         "xpath scraper",
 9 |         "web scraping",
10 |         "PHP library",
11 |         "web-access"
12 |     ],
13 |     "homepage": "https://phpscraper.de",
14 |     "type": "library",
15 |     "license": "GPL-3.0-or-later",
16 |     "authors": [
17 |         {
18 |             "name": "Peter Thaleikis",
19 |             "homepage": "https://peterthaleikis.com"
20 |         }
21 |     ],
22 |     "require": {
23 |         "php": "^8.1",
24 |         "ext-intl": "*",
25 |         "symfony/dom-crawler": "^5.4 || ^6.0 || ^7.0",
26 |         "donatello-za/rake-php-plus": "^1.0.15",
27 |         "league/uri": "^7.0",
28 |         "symfony/browser-kit": "^6.0 || ^7.0",
29 |         "symfony/http-client": "^6.0 || ^7.0",
30 |         "symfony/css-selector": "^6.0 || ^7.0"
31 |     },
32 |     "require-dev": {
33 |         "symfony/thanks": "^1.0.0",
34 |         "phpunit/phpunit": "^8.0.0|^9.0.0",
35 |         "illuminate/collections": "^8.0.0|^9.0.0",
36 |         "laravel/pint": "^1.0",
37 |         "phpstan/phpstan": "^1.0",
38 |         "rector/rector": "^0.19",
39 |         "symfony/var-dumper": "^6.0"
40 |     },
41 |     "autoload": {
42 |         "psr-4": {
43 |             "Spekulatius\\PHPScraper\\": "src/"
44 |         }
45 |     },
46 |     "autoload-dev": {
47 |         "psr-4": {
48 |             "Spekulatius\\PHPScraper\\Tests\\": "tests/"
49 |         }
50 |     },
51 |     "scripts": {
52 |         "refactor": "./vendor/bin/rector",
53 |         "lint": "./vendor/bin/pint",
54 |         "test:refactor": "./vendor/bin/rector --dry-run",
55 |         "test:lint": "./vendor/bin/pint --test",
56 |         "test:types": "./vendor/bin/phpstan analyse --ansi src/ tests/ --level=9",
57 |         "test:unit": "./vendor/phpunit/phpunit/phpunit --cache-result --cache-result-file=.tmp/phpunit --order-by=defects --colors=always --stop-on-failure",
58 |         "test": [
59 |             "@test:refactor",
60 |             "@test:lint",
61 |             "@test:types",
62 |             "@test:unit"
63 |         ]
64 |     },
65 |     "funding": [
66 |         {
67 |             "type": "github",
68 |             "url": "https://github.com/sponsors/spekulatius"
69 |         },
70 |         {
71 |             "type": "homepage",
72 |             "url": "https://phpscraper.de/misc/sponsors.html"
73 |         }
74 |     ],
75 |     "config": {
76 |         "sort-packages": true,
77 |         "allow-plugins": {
78 |             "symfony/thanks": true
79 |         }
80 |     }
81 | }
82 | 


--------------------------------------------------------------------------------
/phpstan.neon:
--------------------------------------------------------------------------------
 1 | includes:
 2 |   - phpstan-baseline.neon
 3 | 
 4 | parameters:
 5 |   level: 9
 6 |   paths:
 7 |     - src/
 8 |     - tests/
 9 |   tmpDir: ./.tmp/phpstan/
10 |   ignoreErrors:
11 |     # TODO: declare the value types of the iterable array return types, then remove this ignore.
12 |     - '#return type has no value type specified in iterable type array\.$#'
13 | 


--------------------------------------------------------------------------------
/phpunit.xml.dist:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <phpunit
 3 |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |   backupGlobals="false"
 5 |   backupStaticAttributes="false"
 6 |   colors="true"
 7 |   convertErrorsToExceptions="true"
 8 |   convertNoticesToExceptions="true"
 9 |   convertWarningsToExceptions="true"
10 |   processIsolation="false"
11 |   stopOnFailure="false"
12 |   bootstrap="vendor/autoload.php"
13 |   xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.3/phpunit.xsd"
14 | >
15 |   <coverage/>
16 |   <testsuites>
17 |     <testsuite name="Tests">
18 |       <directory suffix="Test.php">./tests</directory>
19 |     </testsuite>
20 |   </testsuites>
21 | </phpunit>
22 | 


--------------------------------------------------------------------------------
/pint.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "preset": "laravel",
 3 |     "rules": {
 4 |         "simplified_null_return": true,
 5 |         "braces": false,
 6 |         "new_with_braces": {
 7 |             "anonymous_class": false,
 8 |             "named_class": false
 9 |         },
10 |         "concat_space": false,
11 |         "ordered_traits": false
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/playground.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | require __DIR__ . '/vendor/autoload.php';
 4 | 
 5 | echo "\n"
 6 |     . "#########################\n"
 7 |     . "# PHPScraper Playground #\n"
 8 |     . "#########################\n"
 9 |     . "\n"
10 |     . "# Here you can try out your code or examples from phpscraper.de\n"
11 |     . "\n";
12 | // Scratch instance to experiment with; the banner above is written as one concatenated string.
13 | $web = new \Spekulatius\PHPScraper\PHPScraper;
14 | 


--------------------------------------------------------------------------------
/rector.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | declare(strict_types=1);
 4 | 
 5 | use Rector\CodeQuality\Rector\Class_\InlineConstructorDefaultToPropertyRector;
 6 | use Rector\Config\RectorConfig;
 7 | use Rector\Set\ValueObject\LevelSetList;
 8 | use Rector\Set\ValueObject\SetList;
 9 | 
10 | return static function (RectorConfig $rectorConfig): void {
11 |     $rectorConfig->paths([ // Only src/ is listed here; tests/ is not handed to Rector.
12 |         __DIR__.'/src',
13 |     ]);
14 | 
15 |     $rectorConfig->rules([ // A single rule enabled in addition to the sets below.
16 |         InlineConstructorDefaultToPropertyRector::class,
17 |     ]);
18 | 
19 |     $rectorConfig->sets([
20 |         // LevelSetList::UP_TO_PHP_82, (disabled; kept for reference together with its import)
21 |         // SetList::CODE_QUALITY, (disabled)
22 |         SetList::DEAD_CODE,
23 |         SetList::TYPE_DECLARATION,
24 |     ]);
25 | };
26 | 


--------------------------------------------------------------------------------
/src/Core.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper;
 4 | 
 5 | /**
 6 |  * This class only composes the traits below. For the actual functionality, see the individual traits.
 7 |  */
 8 | class Core
 9 | {
10 |     /**
11 |      * URL-related helpers: information about the current location and URL processing.
12 |      */
13 |     use UsesUrls;
14 | 
15 |     /**
16 |      * Manages the interaction with BrowserKit (formerly Goutte).
17 |      */
18 |     use UsesBrowserKit;
19 | 
20 |     /**
21 |      * The basic filter methods, which make accessing data easier.
22 |      */
23 |     use UsesXPathFilters;
24 | 
25 |     /**
26 |      * Various content-related selectors: meta tags, h1, and so on.
27 |      */
28 |     use UsesContent;
29 | 
30 |     /**
31 |      * Shared simple parsers for XML, JSON and CSV.
32 |      */
33 |     use UsesFileParsers;
34 | 
35 |     /**
36 |      * The feed-related selectors and parsers: RSS, sitemap, search index, etc.
37 |      */
38 |     use UsesFeeds;
39 | }
40 | 


--------------------------------------------------------------------------------
/src/DataTransferObjects/FeedEntry.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\DataTransferObjects;
 4 | 
 5 | /**
 6 |  * A simplified DTO to hold feed entries with incomplete data.
 7 |  *
 8 |  * It is not meant to keep every detail of an entry, only the key values.
 9 |  */
10 | class FeedEntry
11 | {
12 |     /**
13 |      * The three values are promoted to public properties of the same names.
14 |      * @todo composer.json requires PHP ^8.1, so these could now be made `readonly`.
15 |      */
16 |     public function __construct(
17 |         // Promoted constructor properties; no separate declarations are needed.
18 |         public string $title,
19 |         public string $description,
20 |         public string $link
21 |     ) {
22 |     }
23 | 
24 |     /**
25 |      * @param  array<string, string>  $data
26 |      **/
27 |     public static function fromArray(array $data): self
28 |     {
29 |         // `title` and `description` fall back to ''; `link` is read without a fallback.
30 |         return new self(
31 |             $data['title'] ?? '',
32 |             $data['description'] ?? '',
33 |             $data['link']
34 |         );
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/PHPScraper.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper;
  4 | 
  5 | /**
  6 |  * This class manages the Clients and connections.
  7 |  *
  8 |  * Most calls are passed through to the Core class.
  9 |  */
 10 | 
 11 | use Symfony\Component\BrowserKit\HttpBrowser;
 12 | use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient;
 13 | 
 14 | /**
 15 |  * @phpstan-type PHPScraperConfig array{'follow_redirects'?: bool, 'follow_meta_refresh'?: bool, 'max_redirects'?: int, 'agent'?: string, 'proxy'?: string|null, 'timeout'?: int, 'disable_ssl'?: bool}
 16 |  */
 17 | class PHPScraper
 18 | {
 19 |     /**
 20 |      * The effective config: the defaults overlaid with the values passed to setConfig().
 21 |      *
 22 |      * @var PHPScraperConfig
 23 |      */
 24 |     protected $config = [];
 25 | 
 26 |     /**
 27 |      * The Core instance. It handles the actual scraping.
 28 |      */
 29 |     protected Core $core;
 30 | 
 31 |     /**
 32 |      * @param  PHPScraperConfig  $config
 33 |      */
 34 |     public function __construct(array $config = [])
 35 |     {
 36 |         // The core has to exist first: setConfig() hands the clients over to it.
 37 |         $this->core = new Core;
 38 | 
 39 |         // Build the clients from the given config.
 40 |         $this->setConfig($config);
 41 |     }
 42 | 
 43 |     /**
 44 |      * Stores the config, builds both clients from it and passes them to the core.
 45 |      *
 46 |      * @param  PHPScraperConfig  $config
 47 |      */
 48 |     public function setConfig(array $config = []): self
 49 |     {
 50 |         // Values in $config win over the defaults listed first.
 51 |         $this->config = array_merge([
 52 |             // We assume that we want to follow any redirects, within reason.
 53 |             'follow_redirects' => true,
 54 |             'follow_meta_refresh' => true,
 55 |             'max_redirects' => 5,
 56 | 
 57 |             /**
 58 |              * Agent can be overwritten using:
 59 |              *
 60 |              * ```php
 61 |              * $web->setConfig(['agent' => 'My Agent']);
 62 |              * ```
 63 |              */
 64 |             'agent' => 'Mozilla/5.0 (compatible; PHP Scraper/1.x; +https://phpscraper.de)',
 65 | 
 66 |             /**
 67 |              * Setting the Proxy
 68 |              *
 69 |              * ```php
 70 |              * $web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
 71 |              * ```
 72 |              */
 73 |             'proxy' => null,
 74 | 
 75 |             /**
 76 |              * Timeout in seconds.
 77 |              *
 78 |              * ```php
 79 |              * $web->setConfig(['timeout' => 15]);
 80 |              * ```
 81 |              */
 82 |             'timeout' => 10,
 83 | 
 84 |             /**
 85 |              * Disable SSL (not recommended unless really needed).
 86 |              *
 87 |              * @var bool
 88 |              */
 89 |             'disable_ssl' => false,
 90 |         ], $config);
 91 | 
 92 |         // Host and peer verification are switched together by `disable_ssl`.
 93 |         $verifyTls = ! $this->config['disable_ssl'];
 94 | 
 95 |         // Symfony HttpClient
 96 |         $transport = SymfonyHttpClient::create([
 97 |             'proxy' => $this->config['proxy'],
 98 |             'timeout' => $this->config['timeout'],
 99 |             'verify_host' => $verifyTls,
100 |             'verify_peer' => $verifyTls,
101 |         ]);
102 | 
103 |         // BrowserKit client on top of that HttpClient, configured from the same config.
104 |         $browser = new HttpBrowser($transport);
105 |         $browser->followRedirects($this->config['follow_redirects']);
106 |         $browser->followMetaRefresh($this->config['follow_meta_refresh']);
107 |         $browser->setMaxRedirects($this->config['max_redirects']);
108 |         $browser->setServerParameter('HTTP_USER_AGENT', $this->config['agent']);
109 | 
110 |         // Hand both clients to the core.
111 |         $this->core->setClient($browser);
112 |         $this->core->setHttpClient($transport);
113 | 
114 |         return $this;
115 |     }
116 | 
117 |     /**
118 |      * Catches reads of inaccessible properties and forwards them as method calls.
119 |      *
120 |      * @return mixed
121 |      */
122 |     public function __get(string $name)
123 |     {
124 |         // A property read is treated as a call of the same-named method without arguments.
125 |         return $this->__call($name);
126 |     }
127 | 
128 |     /**
129 |      * Forwards method calls to the core.
130 |      *
131 |      * @param  array<mixed>  $arguments
132 |      * @return mixed
133 |      */
134 |     public function __call(string $name, array $arguments = [])
135 |     {
136 |         $result = $this->core->$name(...$arguments);
137 | 
138 |         // Anything that is not a Core instance goes straight back to the caller.
139 |         if (! $result instanceof Core) {
140 |             return $result;
141 |         }
142 | 
143 |         // Core returned a Core: keep it and return this object, so calls stay chainable here.
144 |         $this->core = $result;
145 | 
146 |         return $this;
147 |     }
148 | }
149 | 


--------------------------------------------------------------------------------
/src/UsesBrowserKit.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper;
  4 | 
  5 | use Symfony\Component\BrowserKit\HttpBrowser;
  6 | use Symfony\Component\DomCrawler\Crawler;
  7 | use Symfony\Contracts\HttpClient\HttpClientInterface;
  8 | 
  9 | trait UsesBrowserKit
 10 | {
 11 |     /**
 12 |      * Holds the BrowserKit client used by go() and clickLink()
 13 |      *
 14 |      * @var \Symfony\Component\BrowserKit\HttpBrowser
 15 |      */
 16 |     protected $client;
 17 | 
 18 |     /**
 19 |      * Holds the HttpClient used by fetchAsset()
 20 |      *
 21 |      * @var \Symfony\Contracts\HttpClient\HttpClientInterface
 22 |      */
 23 |     protected $httpClient;
 24 | 
 25 |     /**
 26 |      * Holds the current page (a Crawler object); assigned by go(), setContent() and clickLink()
 27 |      *
 28 |      * @var \Symfony\Component\DomCrawler\Crawler
 29 |      */
 30 |     protected $currentPage;
 31 | 
 32 |     /**
 33 |      * Overwrites the client
 34 |      */
 35 |     public function setClient(HttpBrowser $client): self
 36 |     {
 37 |         $this->client = $client;
 38 | 
 39 |         return $this;
 40 |     }
 41 | 
 42 |     /**
 43 |      * Overwrites the httpClient
 44 |      */
 45 |     public function setHttpClient(HttpClientInterface $httpClient): self
 46 |     {
 47 |         $this->httpClient = $httpClient;
 48 | 
 49 |         return $this;
 50 |     }
 51 | 
 52 |     /**
 53 |      * Retrieve the client
 54 |      *
 55 |      * @return \Symfony\Component\BrowserKit\HttpBrowser
 56 |      */
 57 |     public function client(): HttpBrowser
 58 |     {
 59 |         return $this->client;
 60 |     }
 61 | 
 62 |     /**
 63 |      * Any URL-related methods are in `UsesUrls.php`.
 64 |      **/
 65 | 
 66 |     /**
 67 |      * Navigates to a new page using a URL.
 68 |      */
 69 |     public function go(string $url): self
 70 |     {
 71 |         // Keep the resulting Crawler around for internal processing.
 72 |         $this->currentPage = $this->client->request('GET', $url);
 73 | 
 74 |         return $this;
 75 |     }
 76 | 
 77 |     /**
 78 |      * Sets HTML content to process, without sending a request.
 79 |      *
 80 |      * This is intended as a work-around for when you already have the DOM.
 81 |      */
 82 |     public function setContent(string $url, string $content): self
 83 |     {
 84 |         // Overwrite the current page with a fresh Crawler instance of the content.
 85 |         $this->currentPage = new Crawler($content, $url);
 86 | 
 87 |         return $this;
 88 |     }
 89 | 
 90 |     /**
 91 |      * Fetch an asset from a given absolute or relative URL; without a current page the URL is used as given
 92 |      */
 93 |     public function fetchAsset(string $url): string
 94 |     {
 95 |         return $this
 96 |             ->httpClient
 97 |             ->request(
 98 |                 'GET',
 99 |                 ($this->currentPage === null) ? $url : (string) $this->makeUrlAbsolute($url),
100 |             )
101 |             ->getContent();
102 |     }
103 | 
104 |     /**
105 |      * Click a link (either with title or url)
106 |      *
107 |      * @param  string  $titleOrUrl
108 |      */
109 |     public function clickLink($titleOrUrl): self
110 |     {
111 |         // Only an explicit http:// or https:// prefix marks a URL; a text such as "HTTP basics" is a title.
112 |         if (\preg_match('~^https?://~i', $titleOrUrl) === 1) {
113 |             // Go to a URL
114 |             $this->go($titleOrUrl);
115 |         } else {
116 |             // Find link based on the title
117 |             $link = $this->currentPage->selectLink($titleOrUrl)->link();
118 | 
119 |             // Click the link and store the DOMCrawler object
120 |             $this->currentPage = $this->client->click($link);
121 |         }
122 | 
123 |         return $this;
124 |     }
125 | }
126 | 


--------------------------------------------------------------------------------
/src/UsesContent.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper;
  4 | 
  5 | use DonatelloZa\RakePlus\RakePlus;
  6 | use League\Uri\Uri;
  7 | use Symfony\Component\DomCrawler\Image as DomCrawlerImage;
  8 | use Symfony\Component\DomCrawler\Link as DomCrawlerLink;
  9 | 
 10 | trait UsesContent
 11 | {
 12 |     /**
 13 |      * Access conveniences: methods that make the page data directly accessible.
 14 |      *
 15 |      * They exist to avoid long chains of calls for commonly needed values.
 16 |      * This first one passes the XPath `//title` to filterFirstText().
 17 |      *
 18 |      * Any suggestions for what is missing? Send a PR :)
 19 |      *
 20 |      * @see https://phpscraper.de/contributing
 21 |      */
 22 |     public function title(): ?string
 23 |     {
 24 |         return $this->filterFirstText('//title');
 25 |     }
 26 | 
 27 |     /**
 28 |      * Reads the `charset` attribute through filterFirstExtractAttribute() for `//meta[@charset]`.
 29 |      */
 30 |     public function charset(): ?string
 31 |     {
 32 |         $xpath = '//meta[@charset]';
 33 | 
 34 |         return $this->filterFirstExtractAttribute($xpath, ['charset']);
 35 |     }
 31 | 
 32 |     /**
 33 |      * Reads the `content` attribute of the `<meta http-equiv="Content-type">` match (exact spelling).
 34 |      */
 35 |     public function contentType(): ?string
 36 |     {
 37 |         $xpath = '//meta[@http-equiv="Content-type"]';
 38 | 
 39 |         return $this->filterFirstExtractAttribute($xpath, ['content']);
 40 |     }
 36 | 
 37 |     /**
 38 |      * Reads the `href` attribute of the `<link rel="canonical">` match.
 39 |      */
 40 |     public function canonical(): ?string
 41 |     {
 42 |         $canonical = $this->filterFirstExtractAttribute('//link[@rel="canonical"]', ['href']);
 43 | 
 44 |         return $canonical;
 45 |     }
 41 | 
 42 |     /**
 43 |      * Passes the XPath for `<meta name="viewport">` to filterFirstContent() and returns its result.
 44 |      */
 45 |     public function viewportString(): ?string
 46 |     {
 47 |         $xpath = '//meta[@name="viewport"]';
 48 | 
 49 |         return $this->filterFirstContent($xpath);
 50 |     }
 46 | 
 47 |     /**
 48 |      * Splits the viewport string on commas (and any whitespace following them) into its parts.
 49 |      *
 50 |      * @return array<mixed> Empty when viewportString() returns null.
 51 |      */
 52 |     public function viewport(): array
 53 |     {
 54 |         // Query the page once and reuse the value for both the null check and the split.
 55 |         $viewport = $this->viewportString();
 56 | 
 57 |         return $viewport === null ? [] : (array) \preg_split('/,\s*/', $viewport);
 58 |     }
 51 | 
 52 |     /**
 53 |      * Reads the `content` attribute of the `<meta name="csrf-token">` match.
 54 |      */
 55 |     public function csrfToken(): ?string
 56 |     {
 57 |         $token = $this->filterFirstExtractAttribute('//meta[@name="csrf-token"]', ['content']);
 58 | 
 59 |         return $token;
 60 |     }
 56 | 
 57 |     /**
 58 |      * Reads the `href` attribute of the `<base>` match.
 59 |      */
 60 |     public function baseHref(): ?string
 61 |     {
 62 |         $xpath = '//base';
 63 | 
 64 |         return $this->filterFirstExtractAttribute($xpath, ['href']);
 65 |     }
 61 | 
 62 |     /**
 63 |      * Collects the values of charset(), contentType(), viewport(), canonical() and csrfToken() in one array
 64 |      *
 65 |      * @return array{charset: ?string, contentType: ?string, viewport: array<mixed>, canonical: ?string, csrfToken: ?string}
 66 |      */
 67 |     public function headers(): array
 68 |     {
 69 |         return [
 70 |             'charset' => $this->charset(),
 71 |             'contentType' => $this->contentType(),
 72 |             'viewport' => $this->viewport(),
 73 |             'canonical' => $this->canonical(),
 74 |             'csrfToken' => $this->csrfToken(),
 75 |         ];
 76 |     }
 77 | 
 78 |     /**
 79 |      * Passes the XPath for `<meta name="author">` to filterFirstContent() and returns its result.
 80 |      */
 81 |     public function author(): ?string
 82 |     {
 83 |         $xpath = '//meta[@name="author"]';
 84 | 
 85 |         return $this->filterFirstContent($xpath);
 86 |     }
 82 | 
 83 |     /**
 84 |      * Takes the filterFirstContent() result for `<meta name="image">` and runs it through makeUrlAbsolute().
 85 |      */
 86 |     public function image(): ?string
 87 |     {
 88 |         $image = $this->filterFirstContent('//meta[@name="image"]');
 89 | 
 90 |         return $this->makeUrlAbsolute($image);
 91 |     }
 87 | 
 88 |     /**
 89 |      * Passes the XPath for `<meta name="keywords">` to filterFirstContent() and returns its result.
 90 |      */
 91 |     public function keywordString(): ?string
 92 |     {
 93 |         $keywords = $this->filterFirstContent('//meta[@name="keywords"]');
 94 | 
 95 |         return $keywords;
 96 |     }
 92 | 
 93 |     /**
 94 |      * Splits the keyword string on commas (and any whitespace following them) into single keywords.
 95 |      *
 96 |      * @return array<mixed> Empty when keywordString() returns null.
 97 |      */
 98 |     public function keywords(): array
 99 |     {
100 |         // Query the page once and reuse the value for both the null check and the split.
101 |         $keywords = $this->keywordString();
102 | 
103 |         return $keywords === null ? [] : (array) \preg_split('/,\s*/', $keywords);
104 |     }
 97 | 
 98 |     /**
 99 |      * Passes the XPath for `<meta name="description">` to filterFirstContent() and returns its result.
100 |      */
101 |     public function description(): ?string
102 |     {
103 |         $xpath = '//meta[@name="description"]';
104 | 
105 |         return $this->filterFirstContent($xpath);
106 |     }
102 | 
103 |     /**
104 |      * Collects the values of author(), image(), keywords() and description() in one array
105 |      *
106 |      * @return array{author: ?string, image: ?string, keywords: array<mixed>, description: ?string}
107 |      */
108 |     public function metaTags(): array
109 |     {
110 |         return [
111 |             'author' => $this->author(),
112 |             'image' => $this->image(),
113 |             'keywords' => $this->keywords(),
114 |             'description' => $this->description(),
115 |         ];
116 |     }
117 | 
118 |     /**
119 |      * Maps `name` to `content` for every `<meta>` element whose `name` contains `twitter:`
120 |      *
121 |      * @return array<string, string>
122 |      */
123 |     public function twitterCard(): array
124 |     {
125 |         $pairs = $this
126 |             ->filter('//meta[contains(@name, "twitter:")]')
127 |             ->extract(['name', 'content']);
128 | 
129 |         // A name that occurs more than once keeps the content of its last occurrence.
130 |         $card = [];
131 |         foreach ($pairs as [$name, $content]) {
132 |             $card[(string) $name] = (string) $content;
133 |         }
134 | 
135 |         return $card;
136 |     }
137 | 
138 |     /**
139 |      * Maps `property` to `content` for every `<meta>` element whose `property` contains `og:`
140 |      *
141 |      * @return array<string, string>
142 |      */
143 |     public function openGraph(): array
144 |     {
145 |         $pairs = $this
146 |             ->filter('//meta[contains(@property, "og:")]')
147 |             ->extract(['property', 'content']);
148 | 
149 |         // A property that occurs more than once keeps the content of its last occurrence.
150 |         $graph = [];
151 |         foreach ($pairs as [$property, $content]) {
152 |             $graph[(string) $property] = (string) $content;
153 |         }
154 | 
155 |         return $graph;
156 |     }
157 | 
158 |     /**
159 |      * Extracts `_text` for the nodes matching `//h1`.
160 |      */
161 |     public function h1(): array
162 |     {
163 |         $headings = $this->filterExtractAttributes('//h1', ['_text']);
164 | 
165 |         return $headings;
166 |     }
162 | 
163 |     /**
164 |      * Extracts `_text` for the nodes matching `//h2`.
165 |      */
166 |     public function h2(): array
167 |     {
168 |         $headings = $this->filterExtractAttributes('//h2', ['_text']);
169 | 
170 |         return $headings;
171 |     }
167 | 
168 |     /**
169 |      * Extracts `_text` for the nodes matching `//h3`.
170 |      */
171 |     public function h3(): array
172 |     {
173 |         $headings = $this->filterExtractAttributes('//h3', ['_text']);
174 | 
175 |         return $headings;
176 |     }
172 | 
173 |     /**
174 |      * Extracts `_text` for the nodes matching `//h4`.
175 |      */
176 |     public function h4(): array
177 |     {
178 |         $headings = $this->filterExtractAttributes('//h4', ['_text']);
179 | 
180 |         return $headings;
181 |     }
177 | 
178 |     /**
179 |      * Extracts `_text` for the nodes matching `//h5`.
180 |      */
181 |     public function h5(): array
182 |     {
183 |         $headings = $this->filterExtractAttributes('//h5', ['_text']);
184 | 
185 |         return $headings;
186 |     }
182 | 
183 |     /**
184 |      * Extracts `_text` for the nodes matching `//h6`.
185 |      */
186 |     public function h6(): array
187 |     {
188 |         $headings = $this->filterExtractAttributes('//h6', ['_text']);
189 | 
190 |         return $headings;
191 |     }
187 | 
188 |     /**
189 |      * Get the results of h1() through h6(), in that order: index 0 holds the h1 result, index 5 the h6 result
190 |      *
191 |      * @return array<array>
192 |      */
193 |     public function headings(): array
194 |     {
195 |         return [
196 |             $this->h1(),
197 |             $this->h2(),
198 |             $this->h3(),
199 |             $this->h4(),
200 |             $this->h5(),
201 |             $this->h6(),
202 |         ];
203 |     }
204 | 
205 |     /**
206 |      * Get every `<ol>` and `<ul>` of the current page with its tag name, child nodes and plain text lines
207 |      *
208 |      * `children_plain` is the list's text content split at line breaks, trimmed, without empty lines.
209 |      *
210 |      * @return array<array>
211 |      */
212 |     public function lists(): array
213 |     {
214 |         $lists = [];
215 | 
216 |         /** @var \DOMElement $list */
217 |         foreach ($this->currentPage->filter('ol, ul') as $list) {
218 |             $lines = array_map('trim', explode("\n", $list->textContent));
219 | 
220 |             $lists[] = [
221 |                 'type' => $list->tagName,
222 |                 'children' => $list->childNodes,
223 |                 // Only truly empty lines are dropped; a callback-less array_filter() would also drop "0".
224 |                 'children_plain' => array_values(array_filter($lines, static fn (string $line): bool => $line !== '')),
225 |             ];
226 |         }
227 | 
228 |         return $lists;
229 |     }
220 | 
221 |     /**
222 |      * Get the entries of lists() whose `type` is `ol`, re-indexed from 0
223 |      *
224 |      * @return array<array>
225 |      **/
226 |     public function orderedLists(): array
227 |     {
228 |         $ordered = [];
229 | 
230 |         foreach ($this->lists() as $list) {
231 |             if ($list['type'] === 'ol') {
232 |                 $ordered[] = $list;
233 |             }
234 |         }
235 | 
236 |         return $ordered;
237 |     }
228 | 
229 |     /**
230 |      * Get the entries of lists() whose `type` is `ul`, re-indexed from 0
231 |      *
232 |      * @return array<array>
233 |      **/
234 |     public function unorderedLists(): array
235 |     {
236 |         $unordered = [];
237 | 
238 |         foreach ($this->lists() as $list) {
239 |             if ($list['type'] === 'ul') {
240 |                 $unordered[] = $list;
241 |             }
242 |         }
243 | 
244 |         return $unordered;
245 |     }
236 | 
237 |     /**
238 |      * Get the trimmed `_text` of every node matching `//p`; empty paragraphs are kept
239 |      *
240 |      * @return array<string>
241 |      **/
242 |     public function paragraphs(): array
243 |     {
244 |         $texts = $this->filterExtractAttributes('//p', ['_text']);
245 | 
246 |         return array_map(\trim(...), $texts);
247 |     }
247 | 
248 |     /**
249 |      * Get the paragraphs() entries that are not the empty string, re-indexed from 0.
250 |      */
251 |     public function cleanParagraphs(): array
252 |     {
253 |         $nonEmpty = [];
254 | 
255 |         foreach ($this->paragraphs() as $paragraph) {
256 |             if ($paragraph !== '') {
257 |                 $nonEmpty[] = $paragraph;
258 |             }
259 |         }
260 | 
261 |         return $nonEmpty;
262 |     }
258 | 
259 |     /**
260 |      * Parses the heading outline of the web-page
261 |      *
262 |      * Every h1-h6 match becomes a `tag`/`content` pair built from its `_name` and `_text`.
263 |      *
264 |      * @return array<array>
265 |      */
266 |     public function outline(): array
267 |     {
268 |         $pairs = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6', ['_name', '_text']);
269 | 
270 |         return array_map(
271 |             static fn ($pair): array => array_combine(['tag', 'content'], (array) $pair),
272 |             $pairs
273 |         );
274 |     }
274 | 
275 |     /**
276 |      * Parses the content outline of the web-page, including paragraphs
277 |      *
278 |      * Every h1-h6 and p match becomes a `tag`/`content` pair; `content` is trimmed, empty entries are kept.
279 |      *
280 |      * @return array<array>
281 |      */
282 |     public function outlineWithParagraphs(): array
283 |     {
284 |         $pairs = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6|//p', ['_name', '_text']);
285 | 
286 |         return array_map(static function ($pair): array {
287 |             $entry = array_combine(['tag', 'content'], (array) $pair);
288 |             $entry['content'] = trim((string) $entry['content']);
289 | 
290 |             return $entry;
291 |         }, $pairs);
292 |     }
291 | 
292 |     /**
293 |      * Parses the content outline of the web-page, including paragraphs, without empty elements
294 |      *
295 |      * Every h1-h6 and p match with text becomes a `tag`/`content` pair with trimmed content.
296 |      * Elements whose text is empty or whitespace-only are left out; the result is indexed from 0.
297 |      *
298 |      * @return array<array>
299 |      */
300 |     public function cleanOutlineWithParagraphs(): array
301 |     {
302 |         $elementsNameAndText = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6|//p', ['_name', '_text']);
303 | 
304 |         $outline = [];
305 | 
306 |         /** @var array<string> $nameAndText */
307 |         foreach ($elementsNameAndText as $nameAndText) {
308 |             // Trim before testing, so whitespace-only elements count as empty too.
309 |             $content = trim((string) $nameAndText[1]);
310 | 
311 |             // Skip the element entirely instead of leaving its raw [name, text] pair in the result.
312 |             if ($content === '') {
313 |                 continue;
314 |             }
315 | 
316 |             $outline[] = [
317 |                 'tag' => $nameAndText[0],
318 |                 'content' => $content,
319 |             ];
320 |         }
321 | 
322 |         return $outline;
323 |     }
314 | 
315 |     /**
316 |      * Internal method to collect the content strings for keyword analysis;
317 |      *  the rake analysis itself is done in the calling methods
318 |      *
319 |      * Uses:
320 |      *
321 |      *  - Title
322 |      *  - Headings
323 |      *  - Paragraphs/Content
324 |      *  - Link anchors and Titles
325 |      *  - Alt Texts of Images
326 |      *  - Meta Author, Description and Keywords
327 |      *
328 |      * @see https://github.com/Donatello-za/rake-php-plus
329 |      * @see https://phpscraper.de/examples/extract-keywords.html
330 |      * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example
331 |      *
332 |      * @return array<string>
333 |      */
334 |     protected function prepContent(): array
335 |     {
336 |         // Collect content strings
337 |         $content = array_merge(
338 |             // Website title
339 |             [$this->title()],
340 | 
341 |             // Paragraphs
342 |             $this->paragraphs(),
343 | 
344 |             // Various meta tags
345 |             [
346 |                 $this->author(),
347 |                 $this->description(),
348 |                 implode(' ', $this->keywords()),
349 |             ]
350 |         );
351 | 
352 |         // Add headings. array_merge() appends them; `+=` would skip every index $content already holds.
353 |         foreach ($this->headings() as $headings) {
354 |             $content = array_merge($content, array_values($headings));
355 |         }
356 | 
357 |         // Add link texts and link titles, then the image alt texts
358 |         foreach ($this->linksWithDetails() as $link) {
359 |             $content[] = $link['text'];
360 |             $content[] = $link['title'];
361 |         }
362 |         foreach ($this->imagesWithDetails() as $image) {
363 |             $content[] = $image['alt'];
364 |         }
365 | 
366 |         return $content;
367 |     }
368 | 
369 |     /**
370 |      * Gets a set of keywords based on the rake approach.
371 |      *
372 |      * The analysed text is everything prepContent() collects, joined with single spaces:
373 |      *
374 |      *  - Title
375 |      *  - Headings
376 |      *  - Paragraphs/Content
377 |      *  - Link anchors and Titles
378 |      *  - Alt Texts of Images
379 |      *  - Meta Author, Description and Keywords
380 |      *
381 |      * @see https://github.com/Donatello-za/rake-php-plus
382 |      * @see https://phpscraper.de/examples/extract-keywords.html
383 |      * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example
384 |      *
385 |      * @param  string  $locale  (default: 'en_US')
386 |      */
387 |     public function contentKeywords($locale = 'en_US'): array
388 |     {
389 |         $text = implode(' ', $this->prepContent());
390 | 
391 |         // The phrases are sorted ascending before they are returned.
392 |         return RakePlus::create($text, $locale)->sort('asc')->get();
393 |     }
394 | 
395 |     /**
396 |      * Gets a set of keywords with scores based on the rake approach
397 |      *
398 |      * The analysed text is everything prepContent() collects, joined with single spaces:
399 |      *
400 |      *  - Title
401 |      *  - Headings
402 |      *  - Paragraphs/Content
403 |      *  - Link anchors and Titles
404 |      *  - Alt Texts of Images
405 |      *  - Meta Author, Description and Keywords
406 |      *
407 |      * @see https://github.com/Donatello-za/rake-php-plus
408 |      * @see https://phpscraper.de/examples/extract-keywords.html
409 |      * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example
410 |      *
411 |      * @param  string  $locale  (default: 'en_US')
412 |      */
413 |     public function contentKeywordsWithScores($locale = 'en_US'): array
414 |     {
415 |         $text = implode(' ', $this->prepContent());
416 | 
417 |         // The scores are sorted descending before they are returned.
418 |         return RakePlus::create($text, $locale)->sortByScore('desc')->scores();
419 |     }
420 | 
421 |     /**
422 |      * Get the URI of every `<a>` element on the page, as reported by DomCrawler's Link::getUri()
423 |      *
424 |      * @see https://github.com/spekulatius/link-scraping-test-beautifulsoup-vs-phpscraper
425 |      */
426 |     public function links(): array
427 |     {
428 |         return array_map(
429 |             static fn (DomCrawlerLink $link): string => $link->getUri(),
430 |             $this->filter('//a')->links()
431 |         );
432 |     }
438 | 
439 |     /**
440 |      * Get all links on the page whose host is identical to currentHost(), as absolute URLs
441 |      */
442 |     public function internalLinks(): array
443 |     {
444 |         $host = $this->currentHost();
445 | 
446 |         $internal = [];
447 | 
448 |         foreach ($this->links() as $link) {
449 |             // Strict comparison of the parsed host; no other part of the URL is considered.
450 |             if (Uri::new($link)->getHost() === $host) {
451 |                 $internal[] = $link;
452 |             }
453 |         }
454 | 
455 |         return $internal;
456 |     }
457 | 
458 |     /**
459 |      * Get all links on the page that internalLinks() does not report, as absolute URLs
460 |      */
461 |     public function externalLinks(): array
462 |     {
463 |         $all = $this->links();
464 |         $internal = $this->internalLinks();
465 | 
466 |         // array_diff() keeps the keys of $all, hence the re-indexing.
467 |         return array_values(array_diff($all, $internal));
468 |     }
469 | 
470 |     /**
471 |      * Get all links on the page with commonly interesting details
472 |      *
473 |      * Each entry holds: url, protocol, text, title, target, rel (lowercased), image (URIs of `<img>`
474 |      * elements that are direct children of the anchor) and one boolean flag per known `rel` keyword.
475 |      * The flags match whole keywords, case-insensitively.
476 |      *
477 |      * @return array<array>
478 |      */
479 |     public function linksWithDetails(): array
480 |     {
481 |         // Collects one entry per anchor
482 |         $result = [];
483 | 
484 |         /** @var \DOMElement $link */
485 |         foreach ($this->filter('//a') as $link) {
486 |             // Wrap every direct `<img>` child into DomCrawler\Image to get its Uri.
487 |             $image = [];
488 | 
489 |             /** @var \DOMElement $childNode */
490 |             foreach ($link->childNodes as $childNode) {
491 |                 if ($childNode->nodeName === 'img') {
492 |                     $image[] = (new DomCrawlerImage($childNode, $this->currentBaseHost()))->getUri();
493 |                 }
494 |             }
495 | 
496 |             // Split `rel` into lowercased keywords once; the flags below test for whole keywords,
497 |             // so "NoFollow" is recognised and "me" does not match inside a longer keyword.
498 |             $rel = $link->getAttribute('rel');
499 |             $relKeywords = \preg_split('/[\s,]+/', strtolower($rel), -1, PREG_SPLIT_NO_EMPTY) ?: [];
500 |             $hasRel = static fn (string $keyword): bool => in_array($keyword, $relKeywords, true);
501 | 
502 |             // Generate the proper uri using the Symfony's link class
503 |             $uri = (new DomCrawlerLink($link, $this->currentBaseHost()))->getUri();
504 | 
505 |             // Prepare the result set.
506 |             $result[] = [
507 |                 'url' => $uri,
508 |                 'protocol' => str_contains($uri, ':') ? explode(':', $uri)[0] : null,
509 |                 'text' => trim($link->nodeValue ?? ''),
510 |                 'title' => $link->getAttribute('title') === '' ? null : $link->getAttribute('title'),
511 |                 'target' => $link->getAttribute('target') === '' ? null : $link->getAttribute('target'),
512 |                 'rel' => ($rel === '') ? null : strtolower($rel),
513 |                 'image' => $image,
514 |                 'isNofollow' => $hasRel('nofollow'),
515 |                 'isUGC' => $hasRel('ugc'),
516 |                 'isSponsored' => $hasRel('sponsored'),
517 |                 'isMe' => $hasRel('me'),
518 |                 'isNoopener' => $hasRel('noopener'),
519 |                 'isNoreferrer' => $hasRel('noreferrer'),
520 |             ];
521 |         }
522 | 
523 |         return $result;
524 |     }
520 | 
521 |     /**
522 |      * Get all images on the page with absolute URLs
523 |      */
524 |     public function images(): array
525 |     {
526 |         /** @var array<\Symfony\Component\DomCrawler\Image> $imageElements */
527 |         $imageElements = $this->filter('//img')->images();
528 | 
529 |         // Reduce every image element to its URI, keeping the document order.
530 |         return array_values(array_map(
531 |             static fn ($imageElement) => $imageElement->getUri(),
532 |             $imageElements
533 |         ));
534 |     }
538 | 
539 |     /**
540 |      * Get all images on the page with commonly interesting details
541 |      */
542 |     public function imagesWithDetails(): array
543 |     {
544 |         /** @var array<\DOMElement> $imageNodes */
545 |         $imageNodes = $this->filter('//img');
546 |         $details = [];
547 | 
548 |         foreach ($imageNodes as $node) {
549 |             // Missing `width`/`height` attributes become null; `alt` is passed through as-is.
550 |             $width = $node->getAttribute('width');
551 |             $height = $node->getAttribute('height');
552 |             $details[] = [
553 |                 // Let Symfony's image class build the proper uri
554 |                 'url' => (new DomCrawlerImage($node, $this->currentBaseHost()))->getUri(),
555 |                 'alt' => $node->getAttribute('alt'),
556 |                 'width' => $width === '' ? null : $width,
557 |                 'height' => $height === '' ? null : $height,
558 |             ];
559 |         }
560 | 
561 |         return $details;
562 |     }
563 | }
564 | 


--------------------------------------------------------------------------------
/src/UsesFeeds.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper;
  4 | 
  5 | use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
  6 | 
  7 | trait UsesFeeds
  8 | {
  9 |     /**
 10 |      * Returns a guessed sitemap URL based on the current host. Usually it's `/sitemap.xml`.
 11 |      */
 12 |     public function sitemapUrl(): string
 13 |     {
 14 |         return $this->currentBaseHost() . '/sitemap.xml';
 15 |     }
 16 | 
 17 |     /**
 18 |      * Resolves the sitemap and returns an array with raw data.
 19 |      *
 20 |      * @param  string|null  $url  Location of the sitemap; falls back to `sitemapUrl()`.
 21 |      * @return array $sitemap
 22 |      */
 23 |     public function sitemapRaw(?string $url = null): array
 24 |     {
 25 |         return $this->parseXml($this->fetchAsset($url ?? $this->sitemapUrl()));
 26 |     }
 27 | 
 28 |     /**
 29 |      * Resolves the sitemap and returns an array of `FeedEntry`-DTOs.
 30 |      *
 31 |      * Only the `url` section is read. A sitemap without such a section results in an empty array.
 32 |      *
 33 |      * @todo Support for text-only sitemaps, split versions, image-sitemaps, etc.?
 34 |      *
 35 |      * @param  string|null  $url  Location of the sitemap; falls back to `sitemapUrl()`.
 36 |      * @return array<FeedEntry> $sitemap
 37 |      */
 38 |     public function sitemap(?string $url = null): array
 39 |     {
 40 |         return array_map(
 41 |             // Create the generic DTO for each
 42 |             fn ($entry): FeedEntry => FeedEntry::fromArray([
 43 |                 'title' => '',
 44 |                 'description' => '',
 45 |                 'link' => $entry['loc'],
 46 |             ]),
 47 | 
 48 |             // Fetch the sitemap URL, parse it and select the `url` section.
 49 |             $this->listOfItems($this->sitemapRaw($url)['url'] ?? [])
 50 |         );
 51 |     }
 52 | 
 53 |     /**
 54 |      * Returns the usual location (URL) for the static search index.
 55 |      */
 56 |     public function searchIndexUrl(): string
 57 |     {
 58 |         return $this->currentBaseHost() . '/index.json';
 59 |     }
 60 | 
 61 |     /**
 62 |      * Returns an array of the parsed search index JSON.
 63 |      *
 64 |      * @param  string|null  $url  Location of the index; falls back to `searchIndexUrl()`.
 65 |      * @return array $searchIndex
 66 |      */
 67 |     public function searchIndexRaw(?string $url = null): array
 68 |     {
 69 |         return $this->parseJson($this->fetchAsset($url ?? $this->searchIndexUrl()));
 70 |     }
 71 | 
 72 |     /**
 73 |      * Resolves the search index and returns an array of `\Spekulatius\PHPScraper\DataTransferObjects\FeedEntry`.
 74 |      *
 75 |      * Each index entry is read via its `title`, `snippet` and `link` keys.
 76 |      *
 77 |      * @param  string|null  $url  Location of the index; falls back to `searchIndexUrl()`.
 78 |      * @return array<FeedEntry> $searchIndex
 79 |      */
 80 |     public function searchIndex(?string $url = null): array
 81 |     {
 82 |         return array_map(
 83 |             // Create the generic DTO for each
 84 |             fn ($entry): FeedEntry => FeedEntry::fromArray([
 85 |                 'title' => $entry['title'],
 86 |                 'description' => $entry['snippet'],
 87 |                 'link' => $entry['link'],
 88 |             ]),
 89 | 
 90 |             // Fetch the search index URL and parse it.
 91 |             $this->searchIndexRaw($url)
 92 |         );
 93 |     }
 94 | 
 95 |     /**
 96 |      * Compiles a list of RSS urls based on the <link>-tags on the current page.
 97 |      *
 98 |      * @return array<string>
 99 |      */
100 |     public function rssUrls(): array
101 |     {
102 |         $urls = $this->filterExtractAttributes('//link[@type="application/rss+xml"]', ['href']);
103 | 
104 |         return array_map(fn ($url): string => (string) $this->makeUrlAbsolute($url), $urls);
105 |     }
106 | 
107 |     /**
108 |      * Fetches a given set of RSS feeds and returns one array with raw data.
109 |      *
110 |      * Without arguments, the feeds announced on the current page (see `rssUrls()`) are used.
111 |      *
112 |      * @return array $rss  One parsed feed per URL, in the order of the URLs.
113 |      */
114 |     public function rssRaw(?string ...$urls): array
115 |     {
116 |         return array_map(
117 |             fn ($url) => $this->parseXml($this->fetchAsset((string) $url)),
118 |             $urls === [] ? $this->rssUrls() : $urls
119 |         );
120 |     }
121 | 
122 |     /**
123 |      * Fetches a given set of RSS feeds and returns their entries as `FeedEntry`-DTOs.
124 |      *
125 |      * The entries of all feeds are returned in one flat list, feed by feed. Entries are read from the
126 |      * `entry` section, with the link taken from the `href` attribute of the entry's `link` element.
127 |      *
128 |      * @return array<FeedEntry> $rss
129 |      */
130 |     public function rss(?string ...$urls): array
131 |     {
132 |         $entries = [];
133 | 
134 |         // Walk every fetched feed, not only the first one. No feeds simply means no entries.
135 |         foreach ($this->rssRaw(...$urls) as $feed) {
136 |             foreach ($this->listOfItems($feed['entry'] ?? []) as $entry) {
137 |                 // Create the generic DTO for each
138 |                 $entries[] = FeedEntry::fromArray([
139 |                     'title' => $entry['title'],
140 |                     'link' => $entry['link']['@attributes']['href'],
141 |                 ]);
142 |             }
143 |         }
144 | 
145 |         return $entries;
146 |     }
147 | 
148 |     /**
149 |      * Brings a repeatable section of a parsed XML document into list form.
150 |      *
151 |      * NOTE(review): assumes the XML-to-array conversion behind `parseXml()` collapses a section with a
152 |      * single element into that element's associative array instead of a one-element list - confirm.
153 |      *
154 |      * @param  mixed  $items  The raw section, e.g. `$sitemap['url']`.
155 |      * @return array List of items; empty if the section is missing or not an array.
156 |      */
157 |     private function listOfItems(mixed $items): array
158 |     {
159 |         if (! is_array($items) || $items === []) {
160 |             return [];
161 |         }
162 | 
163 |         // Sequential integer keys mean we already have a list; anything else is a single item.
164 |         return array_keys($items) === range(0, count($items) - 1) ? $items : [$items];
165 |     }
166 | }
131 | 


--------------------------------------------------------------------------------
/src/UsesFileParsers.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper;
  4 | 
  5 | trait UsesFileParsers
  6 | {
  7 |     /**
  8 |      * Base Util to decode a CSV string.
  9 |      *
 10 |      * Rows are split on line breaks (`\r\n`, `\n` or `\r`) and empty lines are skipped. Quoted fields
 11 |      * spanning multiple lines are therefore not supported.
 12 |      *
 13 |      * @return array $data List of rows, each row being a list of cells.
 14 |      *
 15 |      * @throws \Exception If the input has no rows or only consists of a single cell.
 16 |      */
 17 |     public function csvDecodeRaw(
 18 |         string $csvString,
 19 |         ?string $separator = null,
 20 |         ?string $enclosure = null,
 21 |         ?string $escape = null
 22 |     ): array {
 23 |         try {
 24 |             // Drop empty lines (e.g. caused by a trailing line break): `str_getcsv('')` returns `[null]`,
 25 |             // which can neither be cast by `castType()` nor combined with a header row.
 26 |             $lines = array_values(array_filter(
 27 |                 preg_split('/\r\n|\n|\r/', $csvString) ?: [],
 28 |                 static fn (string $line): bool => $line !== ''
 29 |             ));
 30 | 
 31 |             $csv = array_map(
 32 |                 fn ($line) => str_getcsv($line, $separator ?? ',', $enclosure ?? '"', $escape ?? '\\'),
 33 |                 $lines
 34 |             );
 35 | 
 36 |             // While technically 'valid', a single string isn't overly useful and likely not actually a CSV but an URL.
 37 |             if ($csv === [] || (count($csv) === 1 && count($csv[0]) === 1)) {
 38 |                 throw new \Exception('Does not look CSV-like');
 39 |             }
 40 |         } catch (\Exception $e) {
 41 |             throw new \Exception('Failed to parse CSV: ' . $e->getMessage(), 0, $e);
 42 |         }
 43 | 
 44 |         return $csv;
 45 |     }
 46 | 
 47 |     /**
 48 |      * Decode CSV and cast types.
 49 |      *
 50 |      * @return array $data Rows as returned by `csvDecodeRaw()`, with every cell passed through `castType()`.
 51 |      *
 52 |      * @throws \Exception
 53 |      */
 54 |     public function csvDecode(
 55 |         string $csvString,
 56 |         ?string $separator = null,
 57 |         ?string $enclosure = null,
 58 |         ?string $escape = null
 59 |     ): array {
 60 |         try {
 61 |             $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape);
 62 | 
 63 |             // Cast native and custom types
 64 |             $csv = array_map(
 65 |                 fn ($line): array => array_map(
 66 |                     fn ($cell) => $this->castType($cell),
 67 |                     $line
 68 |                 ),
 69 |                 $csv
 70 |             );
 71 |         } catch (\Exception $e) {
 72 |             throw new \Exception('Failed to parse CSV: ' . $e->getMessage(), 0, $e);
 73 |         }
 74 | 
 75 |         return $csv;
 76 |     }
 77 | 
 78 |     /**
 79 |      * Util to decode a CSV string to asso. array.
 80 |      *
 81 |      * The first row is used as header; every following row is keyed by it.
 82 |      *
 83 |      * @return array $data
 84 |      *
 85 |      * @throws \Exception If the CSV can't be decoded or a row doesn't have as many cells as the header.
 86 |      */
 87 |     public function csvDecodeWithHeaderRaw(
 88 |         string $csvString,
 89 |         ?string $separator = null,
 90 |         ?string $enclosure = null,
 91 |         ?string $escape = null
 92 |     ): array {
 93 |         try {
 94 |             $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape);
 95 | 
 96 |             $header = array_shift($csv);
 97 | 
 98 |             // Combine the rows with the header entry.
 99 |             foreach ($csv as $idx => $row) {
100 |                 // `array_combine()` raises a `\ValueError` on a size mismatch, which callers catching
101 |                 // `\Exception` would miss. Report it as a regular parse failure instead.
102 |                 if (count($row) !== count($header)) {
103 |                     throw new \Exception('Row ' . ($idx + 1) . ' does not match the header');
104 |                 }
105 | 
106 |                 $csv[$idx] = array_combine($header, $row);
107 |             }
108 |         } catch (\Exception $e) {
109 |             throw new \Exception('Failed to parse CSV: ' . $e->getMessage(), 0, $e);
110 |         }
111 | 
112 |         return $csv;
113 |     }
114 | 
115 |     /**
116 |      * Decode a CSV string to asso. array and cast types.
117 |      *
118 |      * @return array $data
119 |      *
120 |      * @throws \Exception
121 |      */
122 |     public function csvDecodeWithHeader(
123 |         string $csvString,
124 |         ?string $separator = null,
125 |         ?string $enclosure = null,
126 |         ?string $escape = null
127 |     ): array {
128 |         try {
129 |             $csv = $this->csvDecodeWithHeaderRaw($csvString, $separator, $enclosure, $escape);
130 | 
131 |             // Cast native and custom types
132 |             foreach ($csv as $idx => $row) {
133 |                 foreach ($row as $key => $value) {
134 |                     $csv[$idx][$key] = $this->castType($value);
135 |                 }
136 |             }
137 |         } catch (\Exception $e) {
138 |             throw new \Exception('Failed to parse CSV: ' . $e->getMessage(), 0, $e);
139 |         }
140 | 
141 |         return $csv;
142 |     }
143 | 
144 |     /**
145 |      * Helper method to cast types
146 |      *
147 |      * Returns an int if the string loosely equals its int cast, a float if it loosely equals its
148 |      * float cast, and the unchanged string otherwise. The loose comparisons are intentional.
149 |      */
150 |     public function castType(string $entry): int|float|string
151 |     {
152 |         // Looks like an int?
153 |         if ($entry == (int) $entry) {
154 |             return (int) $entry;
155 |         }
156 | 
157 |         // Looks like a float?
158 |         if ($entry == (float) $entry) {
159 |             return (float) $entry;
160 |         }
161 | 
162 |         return $entry;
163 |     }
164 | 
165 |     /**
166 |      * Parses a given CSV string or fetches the URL and parses it.
167 |      *
168 |      * @return array $data
169 |      *
170 |      * @throws \Exception
171 |      */
172 |     public function parseCsv(
173 |         ?string $csvStringOrUrl = null,
174 |         ?string $separator = null,
175 |         ?string $enclosure = null,
176 |         ?string $escape = null
177 |     ): array {
178 |         return (array) $this->decodeStringOrFetch(
179 |             $csvStringOrUrl,
180 |             'parseCsv',
181 |             'CSV',
182 |             fn (string $csv): array => $this->csvDecode($csv, $separator, $enclosure, $escape)
183 |         );
184 |     }
185 | 
186 |     /**
187 |      * Parses a given CSV string into an asso. with headers or fetches the URL and parses it.
188 |      *
189 |      * @return array $data
190 |      *
191 |      * @throws \Exception
192 |      */
193 |     public function parseCsvWithHeader(
194 |         ?string $csvStringOrUrl = null,
195 |         ?string $separator = null,
196 |         ?string $enclosure = null,
197 |         ?string $escape = null
198 |     ): array {
199 |         return (array) $this->decodeStringOrFetch(
200 |             $csvStringOrUrl,
201 |             'parseCsvWithHeader',
202 |             'CSV',
203 |             fn (string $csv): array => $this->csvDecodeWithHeader($csv, $separator, $enclosure, $escape)
204 |         );
205 |     }
206 | 
207 |     /**
208 |      * Parses a given JSON string or fetches the URL and parses it.
209 |      *
210 |      * @return array $data
211 |      *
212 |      * @throws \Exception
213 |      */
214 |     public function parseJson(?string $jsonStringOrUrl = null): array
215 |     {
216 |         return (array) $this->decodeStringOrFetch(
217 |             $jsonStringOrUrl,
218 |             'parseJson',
219 |             'JSON',
220 |             static fn (string $json) => json_decode($json, true, 512, JSON_THROW_ON_ERROR)
221 |         );
222 |     }
223 | 
224 |     /**
225 |      * Parses a given XML string or fetches the URL and parses it.
226 |      *
227 |      * @return array $data
228 |      *
229 |      * @throws \Exception
230 |      */
231 |     public function parseXml(?string $xmlStringOrUrl = null): array
232 |     {
233 |         return $this->decodeStringOrFetch(
234 |             $xmlStringOrUrl,
235 |             'parseXml',
236 |             'XML',
237 |             fn (string $xml): array => $this->xmlDecode($xml)
238 |         );
239 |     }
240 | 
241 |     /**
242 |      * Decodes XML into an associative array.
243 |      *
244 |      * @throws \Exception If the string isn't well-formed XML.
245 |      */
246 |     protected function xmlDecode(string $xmlString): array
247 |     {
248 |         // Collect libxml errors instead of emitting warnings. Otherwise malformed input (such as an URL)
249 |         // would not raise an exception and `false` would be converted into the result `[false]`.
250 |         $previousSetting = libxml_use_internal_errors(true);
251 | 
252 |         try {
253 |             // XML parser
254 |             $xml = simplexml_load_string(trim($xmlString), 'SimpleXMLElement', LIBXML_NOCDATA);
255 | 
256 |             if ($xml === false) {
257 |                 $error = libxml_get_last_error();
258 | 
259 |                 throw new \Exception($error === false ? 'Invalid XML' : trim($error->message));
260 |             }
261 |         } finally {
262 |             libxml_clear_errors();
263 |             libxml_use_internal_errors($previousSetting);
264 |         }
265 | 
266 |         // Convert XML to JSON and then to an associative array
267 |         return (array) json_decode(json_encode($xml, JSON_THROW_ON_ERROR), true, 512, JSON_THROW_ON_ERROR);
268 |     }
269 | 
270 |     /**
271 |      * Shared flow of the `parse*()` methods. This allows for both:
272 |      *
273 |      * - `$web->parseXml('<xml>...')` / `$web->parseXml('https://...')`.
274 |      * - `$web->go('...')->parseXml()`.
275 |      *
276 |      * @param  string|null  $stringOrUrl  Raw content, an URL, or null to use the current page's URL.
277 |      * @param  string  $method  Name of the public method, used in the error message.
278 |      * @param  string  $format  Name of the format, used in the error message.
279 |      * @param  callable(string): mixed  $decode  Decoder; must throw an `\Exception` on invalid input.
280 |      * @return mixed The decoded content.
281 |      *
282 |      * @throws \Exception
283 |      */
284 |     private function decodeStringOrFetch(?string $stringOrUrl, string $method, string $format, callable $decode): mixed
285 |     {
286 |         // Check if we got either a current page or at least a URL string to process
287 |         if ($stringOrUrl === null && $this->currentPage === null) {
288 |             throw new \Exception('You can not call ' . $method . '() without parameter or initial navigation.');
289 |         }
290 | 
291 |         try {
292 |             $result = null;
293 | 
294 |             // If we have a string, let's try to decode it directly.
295 |             if ($stringOrUrl !== null) {
296 |                 try {
297 |                     $result = $decode($stringOrUrl);
298 |                 } catch (\Exception $e) {
299 |                     // We don't do anything if it fails - likely we have an URL. Let's continue below.
300 |                 }
301 |             }
302 | 
303 |             // Otherwise fetch the resource; fall back on the current URL if possible (`go` was used before).
304 |             $result ??= $decode($this->fetchAsset($stringOrUrl ?? $this->currentUrl()));
305 |         } catch (\Exception $e) {
306 |             throw new \Exception('Failed to parse ' . $format . ': ' . $e->getMessage(), 0, $e);
307 |         }
308 | 
309 |         return $result;
310 |     }
311 | }
341 | 


--------------------------------------------------------------------------------
/src/UsesUrls.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper;
 4 | 
 5 | use League\Uri\Http;
 6 | use League\Uri\Uri;
 7 | use League\Uri\UriResolver;
 8 | 
 9 | trait UsesUrls
10 | {
11 |     /**
12 |      * Returns the current url - this is either set by `go` indirectly or directly using `setContent`.
13 |      *
14 |      * @return string $url
15 |      *
16 |      * @throws \Exception
17 |      */
18 |     public function currentUrl(): string
19 |     {
20 |         // Ensure we aren't having a "call on null" without context.
21 |         if ($this->currentPage === null) {
22 |             throw new \Exception('You can not access the URL before your first navigation using `go`.');
23 |         }
24 | 
25 |         return (string) $this->currentPage->getUri();
26 |     }
27 | 
28 |     /**
29 |      * Returns the current host
30 |      *
31 |      * @return string|null $host
32 |      *
33 |      * @throws \Exception If no page has been loaded yet.
34 |      */
35 |     public function currentHost(): ?string
36 |     {
37 |         return Uri::new($this->currentUrl())->getHost();
38 |     }
39 | 
40 |     /**
41 |      * Returns the current host as defined in `<base href="...">` or the current host.
42 |      *
43 |      * The result consists of scheme, host and - if the URL carries one - the port. Any path is dropped.
44 |      *
45 |      * @return string $baseUrl
46 |      *
47 |      * @throws \Exception
48 |      */
49 |     public function currentBaseHost(): string
50 |     {
51 |         $uri = Uri::new($this->baseHref() ?? $this->currentUrl());
52 | 
53 |         // Keep an explicit port, otherwise e.g. `http://localhost:8080` would point to a different origin.
54 |         $port = $uri->getPort();
55 | 
56 |         return $uri->getScheme() . '://' . $uri->getHost() . ($port === null ? '' : ':' . $port);
57 |     }
58 | 
59 |     /**
60 |      * Converts a current URL to be absolute based on <base> or current page.
61 |      *
62 |      * The base is picked in this order: `$baseUrl`, the `<base href="...">` of the page, `currentBaseHost()`.
63 |      *
64 |      * @return ?string $absoluteUrl Null if `$url` is null or no page has been loaded yet.
65 |      */
66 |     public function makeUrlAbsolute(?string $url = null, ?string $baseUrl = null): ?string
67 |     {
68 |         // Allow to pass null through
69 |         if ($url === null || $this->currentPage === null) {
70 |             return null;
71 |         }
72 | 
73 |         // Resolve the Url using one of the provided/set base href.
74 |         return (string) UriResolver::resolve(
75 |             Http::new($url),
76 |             Http::new($baseUrl ?? $this->baseHref() ?? $this->currentBaseHost()),
77 |         );
78 |     }
79 | }
69 | 


--------------------------------------------------------------------------------
/src/UsesXPathFilters.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper;
 4 | 
 5 | use Symfony\Component\DomCrawler\Crawler;
 6 | 
 7 | trait UsesXPathFilters
 8 | {
 9 |     /**
10 |      * Filters the current page by a xPath-query
11 |      *
12 |      * @throws \Exception If no page has been loaded yet.
13 |      */
14 |     public function filter(string $query): Crawler
15 |     {
16 |         // Ensure we aren't having a "call on null" without context.
17 |         if ($this->currentPage === null) {
18 |             throw new \Exception('You can not filter the page before your first navigation using `go`.');
19 |         }
20 | 
21 |         return $this->currentPage->filterXPath($query);
22 |     }
23 | 
24 |     /**
25 |      * Filters the current page by a xPath-query and returns the first one, or null.
26 |      */
27 |     public function filterFirst(string $query): ?Crawler
28 |     {
29 |         $filteredNodes = $this->filter($query);
30 | 
31 |         return ($filteredNodes->count() === 0) ? null : $filteredNodes->first();
32 |     }
33 | 
34 |     /**
35 |      * Filters the current page by a xPath-query and returns the first ones content, or null.
36 |      */
37 |     public function filterFirstText(string $query): ?string
38 |     {
39 |         $firstNode = $this->filterFirst($query);
40 | 
41 |         return ($firstNode === null) ? null : $firstNode->text();
42 |     }
43 | 
44 |     /**
45 |      * Filters the current page by a xPath-query and returns the textual content as array.
46 |      *
47 |      * @return array<string>
48 |      */
49 |     public function filterTexts(string $query): array
50 |     {
51 |         return $this->filterExtractAttributes($query, ['_text']);
52 |     }
53 | 
54 |     /**
55 |      * Filters the current page by a xPath-query and returns the selected attributes as array.
56 |      *
57 |      * @param  array<string>  $attributes
58 |      * @return array<string>
59 |      */
60 |     public function filterExtractAttributes(string $query, array $attributes): array
61 |     {
62 |         $filteredNodes = $this->filter($query);
63 | 
64 |         return ($filteredNodes->count() === 0) ? [] : $filteredNodes->extract($attributes);
65 |     }
66 | 
67 |     /**
68 |      * Filters the current page by a xPath-query and returns the selected attributes of the first match.
69 |      *
70 |      * NOTE(review): the `?string` return type presumably only holds when exactly one attribute is
71 |      * requested - confirm against the callers.
72 |      *
73 |      * @param  array<string>  $attributes
74 |      */
75 |     public function filterFirstExtractAttribute(string $query, array $attributes): ?string
76 |     {
77 |         $filteredNodes = $this->filter($query);
78 | 
79 |         return ($filteredNodes->count() === 0) ? null : $filteredNodes->first()->extract($attributes)[0];
80 |     }
81 | 
82 |     /**
83 |      * Returns the content attribute for the first result of the query, or null.
84 |      */
85 |     public function filterFirstContent(string $query): ?string
86 |     {
87 |         return $this->filterFirstExtractAttribute($query, ['content']);
88 |     }
89 | }
80 | 


--------------------------------------------------------------------------------
/tests/BaseHrefTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | /**
 6 |  * This tests only the `<base href="...">`-extraction.
 7 |  *
 8 |  * If you are looking for any URL-related tests check `UrlTest.php`.
 9 |  */
10 | class BaseHrefTest extends \PHPUnit\Framework\TestCase
11 | {
12 |     /**
13 |      * A page without a `<base>` tag must report no base href.
14 |      *
15 |      * @test
16 |      */
17 |     public function testMissingBaseHref()
18 |     {
19 |         $pageWithoutBase = 'https://test-pages.phpscraper.de/meta/missing.html';
20 | 
21 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
22 |         $scraper->go($pageWithoutBase);
23 | 
24 |         $this->assertNull($scraper->baseHref);
25 |     }
26 | 
27 |     /**
28 |      * A page with a `<base>` tag must report its href unchanged.
29 |      *
30 |      * @test
31 |      */
32 |     public function testBaseHref()
33 |     {
34 |         // The page contains: <base href="https://test-pages-with-base-href.phpscraper.de/">
35 |         $pageWithBase = 'https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html';
36 |         $expectedBaseHref = 'https://test-pages-with-base-href.phpscraper.de/';
37 | 
38 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
39 |         $scraper->go($pageWithBase);
40 | 
41 |         $this->assertSame($expectedBaseHref, $scraper->baseHref);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/tests/CanonicalTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class CanonicalTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * Without a canonical link the property resolves to null.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingCanonical()
13 |     {
14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         $this->assertNull($scraper->canonical);
18 |     }
19 | 
20 |     /**
21 |      * A declared canonical link is returned as-is.
22 |      *
23 |      * @test
24 |      */
25 |     public function testWithCanonical()
26 |     {
27 |         // The page contains: <link rel="canonical" href="https://test-pages.phpscraper.de/navigation/2.html" />
28 |         $expectedCanonical = 'https://test-pages.phpscraper.de/navigation/2.html';
29 | 
30 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
31 |         $scraper->go('https://test-pages.phpscraper.de/navigation/1.html');
32 | 
33 |         $this->assertSame($expectedCanonical, $scraper->canonical);
34 |     }
35 | }
39 | 


--------------------------------------------------------------------------------
/tests/CoreTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class CoreTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * Accessing a value as property or via its method must give the same result.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMethodAndPropertyCallsAreEqual()
13 |     {
14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $scraper->go('https://phpscraper.de');
16 | 
17 |         $this->assertSame($scraper->title, $scraper->title());
18 |     }
19 | 
20 |     /**
21 |      * Navigating a second time must replace the state of the first page.
22 |      *
23 |      * @test
24 |      */
25 |     public function testChangeOfCurrentPage()
26 |     {
27 |         $firstUrl = 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html';
28 |         $firstTitle = 'Lorem Ipsum';
29 |         $secondUrl = 'https://phpscraper.de';
30 | 
31 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
32 | 
33 |         // 1. Load the first page: URL and title must belong to it.
34 |         $scraper->go($firstUrl);
35 | 
36 |         $this->assertSame($firstUrl, $scraper->currentUrl);
37 |         $this->assertSame($firstTitle, $scraper->title);
38 | 
39 |         // 2. Move on to the second page.
40 |         $scraper->go($secondUrl);
41 | 
42 |         // The URL reflects the navigation ...
43 |         $this->assertSame($secondUrl, $scraper->currentUrl);
44 | 
45 |         // ... and nothing of the first page is left behind.
46 |         $this->assertNotSame($firstUrl, $scraper->currentUrl);
47 |         $this->assertNotSame($firstTitle, $scraper->title);
48 |     }
49 | 
50 |     /**
51 |      * The result of `go` can be used directly, without an intermediate variable.
52 |      *
53 |      * @test
54 |      */
55 |     public function testBasicChainability()
56 |     {
57 |         // The first h1 of this test page reads: "We are testing here & elsewhere!"
58 |         $pageUrl = 'https://test-pages.phpscraper.de/meta/html-entities.html';
59 |         $expectedHeading = 'We are testing here & elsewhere!';
60 | 
61 |         // Variant 1: create the scraper, navigate, then read the heading.
62 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
63 |         $scraper->go($pageUrl);
64 | 
65 |         $this->assertSame($expectedHeading, $scraper->h1[0]);
66 | 
67 |         // Variant 2: everything in one chained expression.
68 |         $this->assertSame(
69 |             $expectedHeading,
70 |             (new \Spekulatius\PHPScraper\PHPScraper)->go($pageUrl)->h1[0]
71 |         );
72 |     }
73 | }
94 | 


--------------------------------------------------------------------------------
/tests/CustomSelectorTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class CustomSelectorTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * A malformed XPath expression must be reported instead of being ignored.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function testFailedSelectionBasedOnId()
 13 |     {
 14 |         // Navigate to test page
 15 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 16 |         $web->go('https://test-pages.phpscraper.de/content/selectors.html');
 17 | 
 18 |         // Ensure we got the test page.
 19 |         $this->assertSame('Selector Tests', $web->title);
 20 | 
 21 |         // The expression is invalid on purpose: `//` is not followed by a node test.
 22 |         try {
 23 |             $web->filterFirstText("//[@id='by-id']");
 24 |         } catch (\Exception $e) {
 25 |             $this->assertSame('DOMXPath::query(): Invalid expression', $e->getMessage());
 26 | 
 27 |             return;
 28 |         }
 29 | 
 30 |         $this->fail('Expected the malformed XPath expression to raise an exception.');
 31 |     }
 32 | 
 33 |     /**
 34 |      * Selecting a single element by its `id` attribute yields its text.
 35 |      *
 36 |      * @test
 37 |      */
 38 |     public function testSelectionBasedOnId()
 39 |     {
 40 |         // Navigate to test page
 41 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 42 |         $web->go('https://test-pages.phpscraper.de/content/selectors.html');
 43 | 
 44 |         // Ensure we got the test page.
 45 |         $this->assertSame('Selector Tests', $web->title);
 46 | 
 47 |         // `filterFirstText()` is expected to return the matched text as a string,
 48 |         // so no chained `->text()` call is needed here.
 49 |         $this->assertSame(
 50 |             'Content by ID',
 51 |             $web->filterFirstText("//*[@id='by-id']")
 52 |         );
 53 |     }
 54 | 
 55 |     /**
 56 |      * Selecting by tag name works for a single node as well as for a list of texts.
 57 |      *
 58 |      * @test
 59 |      */
 60 |     public function testSelectionBasedOnTag()
 61 |     {
 62 |         // Navigate to test page
 63 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 64 |         $web->go('https://test-pages.phpscraper.de/content/selectors.html');
 65 | 
 66 |         // Ensure we got the test page.
 67 |         $this->assertSame('Selector Tests', $web->title);
 68 | 
 69 |         // Select single string using first and chain `->text()`
 70 |         $this->assertSame(
 71 |             'Selector Tests (h1)',
 72 |             $web->filterFirst('//h1')->text()
 73 |         );
 74 | 
 75 |         // Select as array using `filterTexts`; a single match is still
 76 |         // expected to come back as a one-element list.
 77 |         $this->assertSame(
 78 |             ['Selector Tests (h1)'],
 79 |             $web->filterTexts('//h1')
 80 |         );
 81 |     }
 82 | 
 83 |     /**
 84 |      * Selecting by class yields the text of every matching element.
 85 |      *
 86 |      * @test
 87 |      */
 88 |     public function testSelectionBasedOnClass()
 89 |     {
 90 |         // Navigate to test page
 91 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 92 |         $web->go('https://test-pages.phpscraper.de/content/selectors.html');
 93 | 
 94 |         // Ensure we got the test page.
 95 |         $this->assertSame('Selector Tests', $web->title);
 96 | 
 97 |         // Select without `->text()` and using the filterTexts-method instead;
 98 |         // both elements carrying the class are expected, in this order.
 99 |         $this->assertSame(
100 |             ['Content by Class 1', 'Content by Class 2'],
101 |             $web->filterTexts("//*[@class='by-class']")
102 |         );
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/tests/DownloadTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class DownloadTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testMissingDownload()
11 |     {
12 |         // Register the expectations before making the request that is supposed to fail.
13 |         $this->expectException(\Symfony\Component\HttpClient\Exception\ClientException::class);
14 |         $this->expectExceptionMessage('HTTP/2 404  returned for "https://phpscraper.de/broken-url"');
15 | 
16 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
17 |         $scraper->fetchAsset('https://phpscraper.de/broken-url');
18 |     }
19 | 
20 |     /**
21 |      * @test
22 |      */
23 |     public function testDownload()
24 |     {
25 |         // Fetch the live sitemap; finding the homepage in it shows the download returned usable content.
26 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
27 |         $sitemapXml = $scraper->fetchAsset('https://phpscraper.de/sitemap.xml');
28 | 
29 |         // Round-trip through JSON to turn the SimpleXML tree into plain arrays.
30 |         // Credit: https://stackoverflow.com/a/20431742
31 |         $document = simplexml_load_string($sitemapXml, 'SimpleXMLElement', LIBXML_NOCDATA);
32 |         $sitemap = json_decode((string) json_encode($document), true);
33 | 
34 |         $locations = array_map(
35 |             static fn ($entry) => $entry['loc'],
36 |             $sitemap['url']
37 |         );
38 | 
39 |         $this->assertContains(
40 |             'https://phpscraper.de/',
41 |             $locations
42 |         );
43 |     }
44 | 
45 |     /**
46 |      * Assets can be requested with an absolute as well as a relative URL.
47 |      *
48 |      * The sitemap of the test pages serves as the reference download.
49 |      *
50 |      * @test
51 |      */
52 |     public function testDifferentUrlTypes()
53 |     {
54 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
55 | 
56 |         // Open a test page first. The asset URLs are fixed, so only its base URL matters here.
57 |         $scraper->go('https://test-pages.phpscraper.de/meta/feeds.html');
58 | 
59 |         // Case 1: fully qualified URL built from the current base host.
60 |         $this->assertSame(
61 |             $scraper->fetchAsset($scraper->sitemapUrl),
62 |             $scraper->fetchAsset($scraper->currentBaseHost . '/custom_sitemap.xml'),
63 |         );
64 | 
65 |         // Case 2: root-relative path.
66 |         $this->assertSame(
67 |             $scraper->fetchAsset($scraper->sitemapUrl),
68 |             $scraper->fetchAsset('/custom_sitemap.xml'),
69 |         );
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/tests/FeedRssTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
  6 | 
  7 | class FeedRssTest extends \PHPUnit\Framework\TestCase
  8 | {
  9 |     /**
 10 |      * @test
 11 |      */
 12 |     public function testMissingRssUrls()
 13 |     {
 14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 15 | 
 16 |         // Navigate to the test page.
 17 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
 18 | 
 19 |         // This page shouldn't contain any RSS feeds.
 20 |         $this->assertEmpty($web->rssUrls);
 21 |     }
 22 | 
 23 |     /**
 24 |      * @test
 25 |      */
 26 |     public function testRssUrls()
 27 |     {
 28 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 29 | 
 30 |         // Navigate to the test page.
 31 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 32 | 
 33 |         // Did we get the expected result? Any URLs should be made absolute.
 34 |         $this->assertSame([
 35 |             'https://test-pages.phpscraper.de/absolute.xml',
 36 |             'https://test-pages.phpscraper.de/relative.xml',
 37 |         ], $web->rssUrls);
 38 |     }
 39 | 
 40 |     /**
 41 |      * Tests if we can use a custom URL instead of an identified one.
 42 |      *
 43 |      * @test
 44 |      */
 45 |     public function testCustomRssUrl()
 46 |     {
 47 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 48 | 
 49 |         // Navigate to the test page.
 50 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 51 | 
 52 |         // We should always allow the use of a custom URL.
 53 |         // Both files are the same.
 54 |         // One URL isn't linked from the feeds.html and therefore is custom.
 55 |         $this->assertSame(
 56 |             $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml'),
 57 |             $web->rssRaw('https://test-pages.phpscraper.de/relative.xml')
 58 |         );
 59 |     }
 60 | 
 61 |     /**
 62 |      * We should support both absolute and relative URLs.
 63 |      *
 64 |      * @test
 65 |      */
 66 |     public function testDifferentRssUrlTypes()
 67 |     {
 68 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 69 | 
 70 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 71 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 72 | 
 73 |         // Test 1: Absolute URL
 74 |         $this->assertSame(
 75 |             $web->rssRaw($web->rssUrls[0]),
 76 |             $web->rssRaw($web->currentBaseHost . '/custom_rss.xml'),
 77 |         );
 78 | 
 79 |         // Test 2: Relative URL
 80 |         $this->assertSame(
 81 |             $web->rssRaw($web->rssUrls[0]),
 82 |             $web->rssRaw('/custom_rss.xml'),
 83 |         );
 84 |     }
 85 | 
 86 |     /**
 87 |      * Tests the raw parsing.
 88 |      *
 89 |      * @test
 90 |      */
 91 |     public function testRssRawContent()
 92 |     {
 93 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 94 | 
 95 |         // Navigate to the test page.
 96 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 97 | 
 98 |         // The raw RSS is rather unhandy to work with. Let's put it in a var before testing stuff.
 99 |         $rssRaw = $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml')[0]['entry'];
100 | 
101 |         // Ensure the structure is a nested array
102 |         $this->assertIsArray($rssRaw);
103 |         $this->assertIsArray($rssRaw[4]);
104 | 
105 |         // Check some entries to ensure the parsing works (expected value first, actual second).
106 |         $this->assertSame(
107 |             'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/',
108 |             $rssRaw[4]['link']['@attributes']['href']
109 |         );
110 |         $this->assertSame(
111 |             'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/',
112 |             $rssRaw[2]['link']['@attributes']['href']
113 |         );
114 |         $this->assertSame(
115 |             'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/',
116 |             $rssRaw[0]['link']['@attributes']['href']
117 |         );
118 |     }
119 | 
120 |     /**
121 |      * Tests the DTO creation.
122 |      *
123 |      * @test
124 |      */
125 |     public function testRss()
126 |     {
127 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
128 | 
129 |         // Navigate to the test page.
130 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
131 | 
132 |         // The raw RSS is rather unhandy to work with (hence we actually use the DTOs).
133 |         $rss = $web->rss('https://test-pages.phpscraper.de/custom_rss.xml');
134 | 
135 |         // Check the count
136 |         $this->assertCount(37, $rss);
137 | 
138 |         // Check some entries to ensure the parsing works (expected value first, actual second).
139 |         // Set 1
140 |         $this->assertInstanceOf(FeedEntry::class, $rss[4]);
141 |         $this->assertSame(
142 |             'How I Built My First Browser Extension',
143 |             $rss[4]->title
144 |         );
145 |         $this->assertSame(
146 |             'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/',
147 |             $rss[4]->link
148 |         );
149 | 
150 |         // Set 2
151 |         $this->assertInstanceOf(FeedEntry::class, $rss[2]);
152 |         $this->assertSame(
153 |             'How to Use Pug on Netlify?',
154 |             $rss[2]->title
155 |         );
156 |         $this->assertSame(
157 |             'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/',
158 |             $rss[2]->link
159 |         );
160 | 
161 |         // Set 3
162 |         $this->assertInstanceOf(FeedEntry::class, $rss[0]);
163 |         $this->assertSame(
164 |             'Startup Name Check: Experiences of the First week',
165 |             $rss[0]->title
166 |         );
167 |         $this->assertSame(
168 |             'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/',
169 |             $rss[0]->link
170 |         );
171 |     }
172 | }
173 | 


--------------------------------------------------------------------------------
/tests/FeedSearchIndexTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
  6 | 
  7 | class FeedSearchIndexTest extends \PHPUnit\Framework\TestCase
  8 | {
  9 |     /**
 10 |      * @test
 11 |      */
 12 |     public function testSearchIndexUrl()
 13 |     {
 14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 15 | 
 16 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 17 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 18 | 
 19 |         // Did we get the expected `/index.json`?
 20 |         $this->assertSame(
 21 |             'https://test-pages.phpscraper.de/index.json',
 22 |             $web->searchIndexUrl
 23 |         );
 24 |     }
 25 | 
 26 |     /**
 27 |      * Tests if the default search index path is applied.
 28 |      *
 29 |      * @test
 30 |      */
 31 |     public function testDefaultSearchIndexUrl()
 32 |     {
 33 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 34 | 
 35 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 36 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 37 | 
 38 |         // Calling without an argument should give the same result as passing `searchIndexUrl`.
 39 |         $this->assertSame(
 40 |             $web->searchIndexRaw(),
 41 |             $web->searchIndexRaw($web->searchIndexUrl),
 42 |         );
 43 |     }
 44 | 
 45 |     /**
 46 |      * The `custom_index.json` and `index.json` are the same.
 47 |      *
 48 |      * So we compare the two results to ensure the custom URL feature works.
 49 |      *
 50 |      * @test
 51 |      */
 52 |     public function testCustomSearchIndexUrl()
 53 |     {
 54 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 55 | 
 56 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 57 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 58 | 
 59 |         // We should always allow for custom URLs.
 60 |         $this->assertSame(
 61 |             $web->searchIndexRaw($web->searchIndexUrl),
 62 |             $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'),
 63 |         );
 64 |     }
 65 | 
 66 |     /**
 67 |      * We should support both absolute and relative URLs.
 68 |      *
 69 |      * @test
 70 |      */
 71 |     public function testDifferentSearchIndexUrlTypes()
 72 |     {
 73 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 74 | 
 75 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 76 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 77 | 
 78 |         // Test 1: Absolute URL
 79 |         $this->assertSame(
 80 |             $web->searchIndexRaw($web->searchIndexUrl),
 81 |             $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'),
 82 |         );
 83 | 
 84 |         // Test 2: Relative URL
 85 |         $this->assertSame(
 86 |             $web->searchIndexRaw($web->searchIndexUrl),
 87 |             $web->searchIndexRaw('/custom_index.json'),
 88 |         );
 89 |     }
 90 | 
 91 |     /**
 92 |      * Tests the raw parsing.
 93 |      *
 94 |      * @test
 95 |      */
 96 |     public function testSearchIndexRaw()
 97 |     {
 98 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 99 | 
100 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
101 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
102 | 
103 |         // Get the raw searchIndex and store it.
104 |         $searchIndexRaw = $web->searchIndexRaw;
105 | 
106 |         // Ensure the structure is a nested array
107 |         $this->assertIsArray($searchIndexRaw);
108 |         $this->assertIsArray($searchIndexRaw[42]);
109 | 
110 |         // Did we get the expected `/index.json`? It should contain 60 entries.
111 |         $this->assertCount(60, $searchIndexRaw);
112 | 
113 |         // Check some data to ensure the parsing actually worked.
114 |         $this->assertSame(
115 |             'https://pastablelists.com/en/counties-of-croatia',
116 |             $searchIndexRaw[4]['link']
117 |         );
118 |         $this->assertSame(
119 |             'https://pastablelists.com/en/municipalities-of-macedonia',
120 |             $searchIndexRaw[2]['link']
121 |         );
122 |         $this->assertSame(
123 |             'https://pastablelists.com/en/counties-and-municipalities-of-lithuania',
124 |             $searchIndexRaw[0]['link']
125 |         );
126 |     }
127 | 
128 |     /**
129 |      * Tests the DTO creation.
130 |      *
131 |      * @test
132 |      */
133 |     public function testSearchIndex()
134 |     {
135 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
136 | 
137 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
138 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
139 | 
140 |         // Get the searchIndex and store it.
141 |         $searchIndex = $web->searchIndex;
142 | 
143 |         // Did we get the expected `/index.json`? It should contain 60 entries.
144 |         $this->assertCount(60, $searchIndex);
145 | 
146 |         // Check some data to ensure the parsing actually worked:
147 |         // Set 1
148 |         $this->assertInstanceOf(FeedEntry::class, $searchIndex[4]);
149 |         $this->assertSame(
150 |             'List of the Counties of Croatia',
151 |             $searchIndex[4]->title,
152 |         );
153 |         $this->assertSame(
154 |             'List of the Counties of Croatia ready for copy and paste or export.',
155 |             $searchIndex[4]->description,
156 |         );
157 |         $this->assertSame(
158 |             'https://pastablelists.com/en/counties-of-croatia',
159 |             $searchIndex[4]->link,
160 |         );
161 | 
162 |         // Set 2
163 |         $this->assertInstanceOf(FeedEntry::class, $searchIndex[2]);
164 |         $this->assertSame(
165 |             'List of the Municipalities of Macedonia',
166 |             $searchIndex[2]->title,
167 |         );
168 |         $this->assertSame(
169 |             'List of the Municipalities of Macedonia ready for copy and paste or export.',
170 |             $searchIndex[2]->description,
171 |         );
172 |         $this->assertSame(
173 |             'https://pastablelists.com/en/municipalities-of-macedonia',
174 |             $searchIndex[2]->link,
175 |         );
176 | 
177 |         // Set 3
178 |         $this->assertInstanceOf(FeedEntry::class, $searchIndex[0]);
179 |         $this->assertSame(
180 |             'List of the Counties and Municipalities of Lithuania',
181 |             $searchIndex[0]->title,
182 |         );
183 |         $this->assertSame(
184 |             'List of the Counties and Municipalities of Lithuania, ready for copy and paste or export.',
185 |             $searchIndex[0]->description,
186 |         );
187 |         $this->assertSame(
188 |             'https://pastablelists.com/en/counties-and-municipalities-of-lithuania',
189 |             $searchIndex[0]->link,
190 |         );
191 |     }
192 | }
193 | 


--------------------------------------------------------------------------------
/tests/FeedSitemapTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
  6 | 
  7 | class FeedSitemapTest extends \PHPUnit\Framework\TestCase
  8 | {
  9 |     /**
 10 |      * @test
 11 |      */
 12 |     public function testSitemapUrl()
 13 |     {
 14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 15 | 
 16 |         // Navigate to the test page. As the URL is guessed, it's only about the base URL.
 17 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 18 | 
 19 |         // Did we get the expected `/sitemap.xml`?
 20 |         $this->assertSame(
 21 |             'https://test-pages.phpscraper.de/sitemap.xml',
 22 |             $web->sitemapUrl
 23 |         );
 24 |     }
 25 | 
 26 |     /**
 27 |      * Tests if the default sitemap path is applied.
 28 |      *
 29 |      * @test
 30 |      */
 31 |     public function testDefaultSitemapUrl()
 32 |     {
 33 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 34 | 
 35 |         // Navigate to the test page. As the URL is guessed, it's only about the base URL.
 36 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 37 | 
 38 |         // Calling without an argument should give the same result as passing `sitemapUrl`.
 39 |         $this->assertSame(
 40 |             $web->sitemapRaw(),
 41 |             $web->sitemapRaw($web->sitemapUrl),
 42 |         );
 43 |     }
 44 | 
 45 |     /**
 46 |      * The files `sitemap.xml` and `custom_sitemap.xml` are the same and used to ensure the custom URL feature works.
 47 |      *
 48 |      * @test
 49 |      */
 50 |     public function testCustomSitemapUrl()
 51 |     {
 52 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 53 | 
 54 |         // Navigate to the test page. As the URL is guessed, it's only about the base URL.
 55 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 56 | 
 57 |         // We should always allow for custom paths.
 58 |         $this->assertSame(
 59 |             $web->sitemapRaw($web->sitemapUrl),
 60 |             $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'),
 61 |         );
 62 |     }
 63 | 
 64 |     /**
 65 |      * We should support both absolute and relative URLs.
 66 |      *
 67 |      * @test
 68 |      */
 69 |     public function testDifferentSitemapUrlTypes()
 70 |     {
 71 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 72 | 
 73 |         // Navigate to the test page. As the URL is predefined, it's only about the base URL.
 74 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
 75 | 
 76 |         // Test 1: Absolute URL
 77 |         $this->assertSame(
 78 |             $web->sitemapRaw($web->sitemapUrl),
 79 |             $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'),
 80 |         );
 81 | 
 82 |         // Test 2: Relative URL
 83 |         $this->assertSame(
 84 |             $web->sitemapRaw($web->sitemapUrl),
 85 |             $web->sitemapRaw('/custom_sitemap.xml'),
 86 |         );
 87 |     }
 88 | 
 89 |     /**
 90 |      * Ensure we can parse the sitemap itself (XML).
 91 |      *
 92 |      * @test
 93 |      */
 94 |     public function testSitemapRaw()
 95 |     {
 96 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 97 | 
 98 |         // Navigate to the test page. As the URL is guessed, it's only about the base URL.
 99 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
100 | 
101 |         // Get the sitemap and store it.
102 |         $sitemapRaw = $web->sitemapRaw;
103 | 
104 |         // Check the count of the entries below the `url` key.
105 |         $this->assertCount(129, $sitemapRaw['url']);
106 | 
107 |         // Check some entries to ensure the parsing works as expected.
108 |         $this->assertSame(
109 |             'https://phpscraper.de/apis/linkedin.html',
110 |             $sitemapRaw['url'][4]['loc'],
111 |         );
112 |         $this->assertSame(
113 |             'https://phpscraper.de/de/apis/zalando.html',
114 |             $sitemapRaw['url'][20]['loc'],
115 |         );
116 |     }
117 | 
118 |     /**
119 |      * Tests the DTO creation.
120 |      *
121 |      * @test
122 |      */
123 |     public function testSitemap()
124 |     {
125 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
126 | 
127 |         // Navigate to the test page. As the URL is guessed, it's only about the base URL.
128 |         $web->go('https://test-pages.phpscraper.de/meta/feeds.html');
129 | 
130 |         // Get the sitemap and store it.
131 |         $sitemap = $web->sitemap;
132 | 
133 |         // Check the count
134 |         $this->assertCount(129, $sitemap);
135 | 
136 |         // Check some samples: the entry type first, then two known links.
137 |         $this->assertInstanceOf(FeedEntry::class, $sitemap[42]);
138 |         $this->assertSame(
139 |             'https://phpscraper.de/apis/linkedin.html',
140 |             $sitemap[4]->link,
141 |         );
142 |         $this->assertSame(
143 |             'https://phpscraper.de/de/apis/zalando.html',
144 |             $sitemap[20]->link
145 |         );
146 |     }
147 | }
148 | 


--------------------------------------------------------------------------------
/tests/HeadingTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class HeadingTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testMissingHeadings()
 11 |     {
 12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 13 | 
 14 |         // Navigate to the test page.
 15 |         $web->go('https://test-pages.phpscraper.de/meta/no-meta.html');
 16 | 
 17 |         // Check the missing headings (h1 actually exists on the page, so it is left out).
 18 |         $this->assertSame([], $web->h2);
 19 |         $this->assertSame([], $web->h3);
 20 |         $this->assertSame([], $web->h4);
 21 |         $this->assertSame([], $web->h5);
 22 |         $this->assertSame([], $web->h6);
 23 |     }
 24 | 
 25 |     /**
 26 |      * @test
 27 |      */
 28 |     public function testWithHTMLEntity()
 29 |     {
 30 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 31 | 
 32 |         // Navigate to the test page.
 33 |         $web->go('https://test-pages.phpscraper.de/meta/html-entities.html');
 34 | 
 35 |         // Check the h1
 36 |         $this->assertSame(
 37 |             'We are testing here & elsewhere!',
 38 |             $web->h1[0]
 39 |         );
 40 | 
 41 |         // h2s
 42 |         $this->assertCount(2, $web->h2);
 43 |         $this->assertSame([
 44 |             'Cat & Mouse',
 45 |             'Mouse & Cat',
 46 |         ], $web->h2);
 47 | 
 48 |         // Collection of headings: one list per level, h1 through h6.
 49 |         $this->assertSame(
 50 |             [
 51 |                 ['We are testing here & elsewhere!'],
 52 |                 ['Cat & Mouse', 'Mouse & Cat'],
 53 |                 ['1', '2', '3'],
 54 |                 ['Not so important heading'],
 55 |                 [],
 56 |                 [],
 57 |             ],
 58 |             $web->headings
 59 |         );
 60 |     }
 61 | 
 62 |     /**
 63 |      * @test
 64 |      */
 65 |     public function testLoremIpsum()
 66 |     {
 67 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 68 | 
 69 |         // Navigate to the test page.
 70 |         $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
 71 | 
 72 |         // Check the h1
 73 |         $this->assertSame(
 74 |             'We are testing here!',
 75 |             $web->h1[0]
 76 |         );
 77 | 
 78 |         // h2s (both carry the same text on this page)
 79 |         $this->assertCount(2, $web->h2);
 80 |         $this->assertSame([
 81 |             'h2s are headings too.',
 82 |             'h2s are headings too.',
 83 |         ], $web->h2);
 84 |     }
 85 | 
 86 |     /**
 87 |      * @test
 88 |      */
 89 |     public function testGermanUmlaute()
 90 |     {
 91 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 92 | 
 93 |         // Navigate to the test page.
 94 |         $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
 95 | 
 96 |         // Check the h1
 97 |         $this->assertSame(
 98 |             'We are testing here ä ü ö!',
 99 |             $web->h1[0]
100 |         );
101 | 
102 |         // h2s
103 |         $this->assertCount(2, $web->h2);
104 |         $this->assertSame([
105 |             'Täst, ehm, test!',
106 |             'Weiter testen, Müller!',
107 |         ], $web->h2);
108 |     }
109 | 
110 |     /**
111 |      * @test
112 |      */
113 |     public function testChineseCharacters()
114 |     {
115 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
116 | 
117 |         // Navigate to the test page.
118 |         $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
119 | 
120 |         // Check the h1
121 |         $this->assertSame(
122 |             'We are testing here! 加油!',
123 |             $web->h1[0]
124 |         );
125 | 
126 |         // h2s
127 |         $this->assertCount(2, $web->h2);
128 |         $this->assertSame(['加油!', '加油 #1!'], $web->h2);
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/tests/ImageTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class ImageTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testNoImages()
 11 |     {
 12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 13 | 
 14 |         // Navigate to the test page.
 15 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
 16 | 
 17 |         // No images -> an empty array is expected.
 18 |         $this->assertSame([], $web->images);
 19 |         $this->assertSame([], $web->imagesWithDetails);
 20 |     }
 21 | 
 22 |     /**
 23 |      * @test
 24 |      */
 25 |     public function testLoremIpsum()
 26 |     {
 27 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 28 | 
 29 |         // Navigate to the test page.
 30 |         $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
 31 | 
 32 |         // Check the number of images: two entries are expected, both resolving to cat.jpg.
 33 |         $this->assertSame(2, count($web->images));
 34 | 
 35 |         // Check the simple list
 36 |         $this->assertSame([
 37 |             'https://test-pages.phpscraper.de/assets/cat.jpg',
 38 |             'https://test-pages.phpscraper.de/assets/cat.jpg',
 39 |         ], $web->images);
 40 | 
 41 |         // Check the expected data: alt texts are set, width and height are expected to be null.
 42 |         $this->assertSame([
 43 |             [
 44 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
 45 |                 'alt' => 'absolute path',
 46 |                 'width' => null,
 47 |                 'height' => null,
 48 |             ],
 49 |             [
 50 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
 51 |                 'alt' => 'relative path',
 52 |                 'width' => null,
 53 |                 'height' => null,
 54 |             ],
 55 |         ], $web->imagesWithDetails);
 56 |     }
 57 | 
 58 |     /**
 59 |      * @test
 60 |      */
 61 |     public function testGermanUmlaute()
 62 |     {
 63 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 64 | 
 65 |         // Navigate to the test page.
 66 |         $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
 67 | 
 68 |         // Check the h1 first: the umlauts are expected unchanged in the heading text.
 69 |         $this->assertSame(
 70 |             'We are testing here ä ü ö!',
 71 |             $web->h1[0]
 72 |         );
 73 | 
 74 |         // Check the number of images
 75 |         $this->assertSame(2, count($web->images));
 76 | 
 77 |         // Check the simple list: the umlauts in the file name are expected as-is (not percent-encoded).
 78 |         $this->assertSame([
 79 |             'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg',
 80 |             'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg',
 81 |         ], $web->images);
 82 | 
 83 |         // Check the expected data: alt texts are set, width and height are expected to be null.
 84 |         $this->assertSame([
 85 |             [
 86 |                 'url' => 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg',
 87 |                 'alt' => 'absolute path',
 88 |                 'width' => null,
 89 |                 'height' => null,
 90 |             ],
 91 |             [
 92 |                 'url' => 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg',
 93 |                 'alt' => 'relative path',
 94 |                 'width' => null,
 95 |                 'height' => null,
 96 |             ],
 97 |         ], $web->imagesWithDetails);
 98 |     }
 99 | 
100 |     /**
101 |      * @test
102 |      */
103 |     public function testChineseCharacters()
104 |     {
105 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
106 | 
107 |         // Navigate to the test page.
108 |         $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
109 | 
110 |         // Check the number of images
111 |         $this->assertSame(2, count($web->images));
112 | 
113 |         // Check the simple list: the non-ASCII file name is expected as-is (not percent-encoded).
114 |         $this->assertSame([
115 |             'https://test-pages.phpscraper.de/assets/貓.jpg',
116 |             'https://test-pages.phpscraper.de/assets/貓.jpg',
117 |         ], $web->images);
118 | 
119 |         // Check the expected data: alt texts are set, width and height are expected to be null.
120 |         $this->assertSame([
121 |             [
122 |                 'url' => 'https://test-pages.phpscraper.de/assets/貓.jpg',
123 |                 'alt' => 'absolute path',
124 |                 'width' => null,
125 |                 'height' => null,
126 |             ],
127 |             [
128 |                 'url' => 'https://test-pages.phpscraper.de/assets/貓.jpg',
129 |                 'alt' => 'relative path',
130 |                 'width' => null,
131 |                 'height' => null,
132 |             ],
133 |         ], $web->imagesWithDetails);
134 |     }
135 | 
136 |     /**
137 |      * @test
138 |      */
139 |     public function testBaseHref()
140 |     {
141 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
142 | 
143 |         // Navigate to the test page.
144 |         $web->go('https://test-pages.phpscraper.de/images/base-href.html');
145 | 
146 |         // Check the number of images
147 |         $this->assertSame(2, count($web->images));
148 | 
149 |         // Simple list: the second (relative) image is expected on the "with-base-href" host.
150 |         $this->assertSame([
151 |             'https://test-pages.phpscraper.de/assets/cat.jpg',
152 |             'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
153 |         ], $web->images);
154 | 
155 |         // Detailed list: same URLs plus alt texts; width and height are expected to be null.
156 |         $this->assertSame([
157 |             [
158 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
159 |                 'alt' => 'absolute path with base href',
160 |                 'width' => null,
161 |                 'height' => null,
162 |             ],
163 |             [
164 |                 'url' => 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
165 |                 'alt' => 'relative path with base href',
166 |                 'width' => null,
167 |                 'height' => null,
168 |             ],
169 |         ], $web->imagesWithDetails);
170 |     }
171 | 
172 |     /**
173 |      * @test
174 |      */
175 |     public function testWidth()
176 |     {
177 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
178 | 
179 |         // Navigate to the test page.
180 |         $web->go('https://test-pages.phpscraper.de/images/width.html');
181 | 
182 |         // Check the number of images
183 |         $this->assertSame(3, count($web->images));
184 | 
185 |         // Check the expected data: width is expected as a string including its unit, height stays null.
186 |         $this->assertSame([
187 |             [
188 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
189 |                 'alt' => 'no width',
190 |                 'width' => null,
191 |                 'height' => null,
192 |             ],
193 |             [
194 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
195 |                 'alt' => 'width at 1200px',
196 |                 'width' => '1200px',
197 |                 'height' => null,
198 |             ],
199 |             [
200 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
201 |                 'alt' => 'width at 100rem',
202 |                 'width' => '100rem',
203 |                 'height' => null,
204 |             ],
205 |         ], $web->imagesWithDetails);
206 |     }
207 | 
208 |     /**
209 |      * @test
210 |      */
211 |     public function testHeight()
212 |     {
213 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
214 | 
215 |         // Navigate to the test page.
216 |         $web->go('https://test-pages.phpscraper.de/images/height.html');
217 | 
218 |         // Check the number of images
219 |         $this->assertSame(3, count($web->images));
220 | 
221 |         // Check the expected data: height is expected as a string including its unit, width stays null.
222 |         $this->assertSame([
223 |             [
224 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
225 |                 'alt' => 'no height',
226 |                 'width' => null,
227 |                 'height' => null,
228 |             ],
229 |             [
230 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
231 |                 'alt' => 'height at 1200px',
232 |                 'width' => null,
233 |                 'height' => '1200px',
234 |             ],
235 |             [
236 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
237 |                 'alt' => 'height at 100rem',
238 |                 'width' => null,
239 |                 'height' => '100rem',
240 |             ],
241 |         ], $web->imagesWithDetails);
242 |     }
243 | }
244 | 


--------------------------------------------------------------------------------
/tests/KeywordTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class KeywordTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testKeywordExtraction()
11 |     {
12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
13 | 
14 |         // Navigate to the test page.
15 |         // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
16 |         $web->go('https://test-pages.phpscraper.de/content/keywords.html');
17 | 
18 |         // Fetch the extracted keywords once; every expectation below is checked against this list.
19 |         $keywords = $web->contentKeywords;
20 | 
21 |         // A selected list of keywords to expect
22 |         $shouldKeywords = [
23 |             '1960s',
24 |             'added',
25 |             'adopted lorem ipsum',
26 |             'advertisements',
27 |             'aldus employed',
28 |             'corrupted version',
29 |             'graphic',
30 |             'improper latin',
31 |             'introduced',
32 |             'keyword extraction tests',
33 |             'test',
34 |             'microsoft word',
35 |             'english wikipedia',
36 |             'lorem ipsum',
37 |             'lorem ipsum text',
38 |         ];
39 | 
40 |         // Check if all are part of the output (strict match, so loose type juggling cannot hide a miss)
41 |         foreach ($shouldKeywords as $keyword) {
42 |             $this->assertTrue(
43 |                 in_array($keyword, $keywords, true),
44 |                 sprintf('"%s" is missing', $keyword)
45 |             );
46 |         }
47 |     }
48 | 
49 |     /**
50 |      * @test
51 |      */
52 |     public function testKeywordExtractionWithScores()
53 |     {
54 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
55 | 
56 |         // Navigate to the test page.
57 |         // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
58 |         $web->go('https://test-pages.phpscraper.de/content/keywords.html');
59 | 
60 |         // Fetch the keyword => score map once; every expectation below is checked against it.
61 |         $keywords = $web->contentKeywordsWithScores;
62 | 
63 |         // A selected list of keywords to expect
64 |         $shouldKeywords = [
65 |             'added' => 1.0,
66 |             'adopted lorem ipsum' => 11.0,
67 |             'advertisements' => 1.0,
68 |             'aldus employed' => 4.0,
69 |             'corrupted version' => 4.0,
70 |             'graphic' => 1.0,
71 |             'improper latin' => 4.0,
72 |             'introduced' => 1.0,
73 |             'keyword extraction tests' => 9.0,
74 |             'test' => 1.0,
75 |             'microsoft word' => 5.3333333333333,
76 |             'english wikipedia' => 4.0,
77 |             'lorem ipsum' => 8.0,
78 |             'lorem ipsum text' => 11.0,
79 |         ];
80 | 
81 |         // Each keyword must be present and carry the expected score (compared at 8 decimals).
82 |         foreach ($shouldKeywords as $keyword => $score) {
83 |             $this->assertArrayHasKey($keyword, $keywords, sprintf('"%s" is missing', $keyword));
84 |             $this->assertSame(
85 |                 round($score, 8),
86 |                 round($keywords[$keyword], 8),
87 |                 sprintf('Score for "%s" is incorrect', $keyword)
88 |             );
89 |         }
90 |     }
91 | }
92 | 


--------------------------------------------------------------------------------
/tests/LinkTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class LinkTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testNoLinks()
 11 |     {
 12 |         // Load the fixture page with a fresh scraper instance.
 13 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
 14 |         $scraper->go('https://test-pages.phpscraper.de/links/no-links.html');
 15 | 
 16 |         // Neither accessor may report any link: both must be empty arrays.
 17 |         $expected = [];
 18 |         self::assertSame($expected, $scraper->links);
 19 |         self::assertSame($expected, $scraper->linksWithDetails);
 20 |     }
 21 | 
 22 |     /**
 23 |      * @test
 24 |      */
 25 |     public function testTarget()
 26 |     {
 27 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 28 | 
 29 |         // Navigate to the test page.
 30 |         $web->go('https://test-pages.phpscraper.de/links/target.html');
 31 | 
 32 |         // Check the number of links
 33 |         $this->assertSame(6, count($web->links));
 34 | 
 35 |         // Check the simple links list: the same three URLs are expected twice, in page order.
 36 |         $this->assertSame([
 37 |             'https://placekitten.com/408/287',
 38 |             'https://placekitten.com/444/333',
 39 |             'https://placekitten.com/444/321',
 40 |             'https://placekitten.com/408/287',
 41 |             'https://placekitten.com/444/333',
 42 |             'https://placekitten.com/444/321',
 43 |         ], $web->links);
 44 | 
 45 |         // Check the detailed links list: the first three use target "_blank", the last three "kitten".
 46 |         $this->assertSame([
 47 |             [
 48 |                 'url' => 'https://placekitten.com/408/287',
 49 |                 'protocol' => 'https',
 50 |                 'text' => 'external kitten',
 51 |                 'title' => null,
 52 |                 'target' => '_blank',
 53 |                 'rel' => null,
 54 |                 'image' => [],
 55 |                 'isNofollow' => false,
 56 |                 'isUGC' => false,
 57 |                 'isSponsored' => false,
 58 |                 'isMe' => false,
 59 |                 'isNoopener' => false,
 60 |                 'isNoreferrer' => false,
 61 |             ], [
 62 |                 'url' => 'https://placekitten.com/444/333',
 63 |                 'protocol' => 'https',
 64 |                 'text' => 'external kitten',
 65 |                 'title' => null,
 66 |                 'target' => '_blank',
 67 |                 'rel' => null,
 68 |                 'image' => [],
 69 |                 'isNofollow' => false,
 70 |                 'isUGC' => false,
 71 |                 'isSponsored' => false,
 72 |                 'isMe' => false,
 73 |                 'isNoopener' => false,
 74 |                 'isNoreferrer' => false,
 75 |             ], [
 76 |                 'url' => 'https://placekitten.com/444/321',
 77 |                 'protocol' => 'https',
 78 |                 'text' => 'external kitten',
 79 |                 'title' => null,
 80 |                 'target' => '_blank',
 81 |                 'rel' => null,
 82 |                 'image' => [],
 83 |                 'isNofollow' => false,
 84 |                 'isUGC' => false,
 85 |                 'isSponsored' => false,
 86 |                 'isMe' => false,
 87 |                 'isNoopener' => false,
 88 |                 'isNoreferrer' => false,
 89 |             ], [
 90 |                 'url' => 'https://placekitten.com/408/287',
 91 |                 'protocol' => 'https',
 92 |                 'text' => 'external kitten',
 93 |                 'title' => null,
 94 |                 'target' => 'kitten',
 95 |                 'rel' => null,
 96 |                 'image' => [],
 97 |                 'isNofollow' => false,
 98 |                 'isUGC' => false,
 99 |                 'isSponsored' => false,
100 |                 'isMe' => false,
101 |                 'isNoopener' => false,
102 |                 'isNoreferrer' => false,
103 |             ], [
104 |                 'url' => 'https://placekitten.com/444/333',
105 |                 'protocol' => 'https',
106 |                 'text' => 'external kitten',
107 |                 'title' => null,
108 |                 'target' => 'kitten',
109 |                 'rel' => null,
110 |                 'image' => [],
111 |                 'isNofollow' => false,
112 |                 'isUGC' => false,
113 |                 'isSponsored' => false,
114 |                 'isMe' => false,
115 |                 'isNoopener' => false,
116 |                 'isNoreferrer' => false,
117 |             ], [
118 |                 'url' => 'https://placekitten.com/444/321',
119 |                 'protocol' => 'https',
120 |                 'text' => 'external kitten',
121 |                 'title' => null,
122 |                 'target' => 'kitten',
123 |                 'rel' => null,
124 |                 'image' => [],
125 |                 'isNofollow' => false,
126 |                 'isUGC' => false,
127 |                 'isSponsored' => false,
128 |                 'isMe' => false,
129 |                 'isNoopener' => false,
130 |                 'isNoreferrer' => false,
131 |             ],
132 |         ], $web->linksWithDetails);
133 |     }
134 | 
135 |     /**
136 |      * @test
137 |      */
138 |     public function testRel()
139 |     {
140 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
141 | 
142 |         // Navigate to the test page.
143 |         // This page contains several links with different rel attributes.
144 |         $web->go('https://test-pages.phpscraper.de/links/rel.html');
145 | 
146 |         // Check the number of links
147 |         $this->assertSame(5, count($web->links));
148 | 
149 |         // Check the simple links list (duplicates are expected to be kept, in page order)
150 |         $this->assertSame([
151 |             'https://placekitten.com/432/287',
152 |             'https://placekitten.com/456/287',
153 |             'https://placekitten.com/345/287',
154 |             'https://placekitten.com/345/287',
155 |             'https://placekitten.com/345/222',
156 |         ], $web->links);
157 | 
158 |         // Check the detailed links list: each rel value is expected to set only its matching is* flag(s).
159 |         $this->assertSame([
160 |             [
161 |                 'url' => 'https://placekitten.com/432/287',
162 |                 'protocol' => 'https',
163 |                 'text' => 'external kitten',
164 |                 'title' => null,
165 |                 'target' => null,
166 |                 'rel' => 'nofollow',
167 |                 'image' => [],
168 |                 'isNofollow' => true,
169 |                 'isUGC' => false,
170 |                 'isSponsored' => false,
171 |                 'isMe' => false,
172 |                 'isNoopener' => false,
173 |                 'isNoreferrer' => false,
174 |             ], [
175 |                 'url' => 'https://placekitten.com/456/287',
176 |                 'protocol' => 'https',
177 |                 'text' => 'external kitten',
178 |                 'title' => null,
179 |                 'target' => null,
180 |                 'rel' => 'ugc',
181 |                 'image' => [],
182 |                 'isNofollow' => false,
183 |                 'isUGC' => true,
184 |                 'isSponsored' => false,
185 |                 'isMe' => false,
186 |                 'isNoopener' => false,
187 |                 'isNoreferrer' => false,
188 |             ], [
189 |                 'url' => 'https://placekitten.com/345/287',
190 |                 'protocol' => 'https',
191 |                 'text' => 'external kitten',
192 |                 'title' => null,
193 |                 'target' => null,
194 |                 'rel' => 'nofollow ugc',
195 |                 'image' => [],
196 |                 'isNofollow' => true,
197 |                 'isUGC' => true,
198 |                 'isSponsored' => false,
199 |                 'isMe' => false,
200 |                 'isNoopener' => false,
201 |                 'isNoreferrer' => false,
202 |             ], [
203 |                 'url' => 'https://placekitten.com/345/287',
204 |                 'protocol' => 'https',
205 |                 'text' => 'external kitten',
206 |                 'title' => null,
207 |                 'target' => null,
208 |                 'rel' => 'noopener',
209 |                 'image' => [],
210 |                 'isNofollow' => false,
211 |                 'isUGC' => false,
212 |                 'isSponsored' => false,
213 |                 'isMe' => false,
214 |                 'isNoopener' => true,
215 |                 'isNoreferrer' => false,
216 |             ], [
217 |                 'url' => 'https://placekitten.com/345/222',
218 |                 'protocol' => 'https',
219 |                 'text' => 'external kitten',
220 |                 'title' => null,
221 |                 'target' => null,
222 |                 'rel' => 'noreferrer',
223 |                 'image' => [],
224 |                 'isNofollow' => false,
225 |                 'isUGC' => false,
226 |                 'isSponsored' => false,
227 |                 'isMe' => false,
228 |                 'isNoopener' => false,
229 |                 'isNoreferrer' => true,
230 |             ],
231 |         ], $web->linksWithDetails);
232 |     }
233 | 
234 |     /**
235 |      * @test
236 |      */
237 |     public function testBaseHref()
238 |     {
239 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
240 | 
241 |         // Navigate to the test page.
242 |         $web->go('https://test-pages.phpscraper.de/links/base-href.html');
243 | 
244 |         // Check the number of links
245 |         $this->assertSame(3, count($web->links));
246 | 
247 |         // Check the simple links list: the relative link is expected on the "with-base-href" host.
248 |         $this->assertSame([
249 |             'https://placekitten.com/408/287',
250 |             'https://test-pages.phpscraper.de/assets/cat.jpg',
251 |             'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
252 |         ], $web->links);
253 | 
254 |         // Check the detailed links list: same URLs plus text and title; no rel or target is expected.
255 |         $this->assertSame([
256 |             [
257 |                 'url' => 'https://placekitten.com/408/287',
258 |                 'protocol' => 'https',
259 |                 'text' => 'external kitten',
260 |                 'title' => 'external path with base href',
261 |                 'target' => null,
262 |                 'rel' => null,
263 |                 'image' => [],
264 |                 'isNofollow' => false,
265 |                 'isUGC' => false,
266 |                 'isSponsored' => false,
267 |                 'isMe' => false,
268 |                 'isNoopener' => false,
269 |                 'isNoreferrer' => false,
270 |             ], [
271 |                 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
272 |                 'protocol' => 'https',
273 |                 'text' => 'absolute path to cat',
274 |                 'title' => 'absolute internal path with base href',
275 |                 'target' => null,
276 |                 'rel' => null,
277 |                 'image' => [],
278 |                 'isNofollow' => false,
279 |                 'isUGC' => false,
280 |                 'isSponsored' => false,
281 |                 'isMe' => false,
282 |                 'isNoopener' => false,
283 |                 'isNoreferrer' => false,
284 |             ], [
285 |                 'url' => 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
286 |                 'protocol' => 'https',
287 |                 'text' => 'relative cat',
288 |                 'title' => 'relative path with base href',
289 |                 'target' => null,
290 |                 'rel' => null,
291 |                 'image' => [],
292 |                 'isNofollow' => false,
293 |                 'isUGC' => false,
294 |                 'isSponsored' => false,
295 |                 'isMe' => false,
296 |                 'isNoopener' => false,
297 |                 'isNoreferrer' => false,
298 |             ],
299 |         ], $web->linksWithDetails);
300 |     }
301 | 
302 |     /**
303 |      * @test
304 |      */
305 |     public function testImageUrl()
306 |     {
307 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
308 | 
309 |         // Navigate to the test page.
310 |         $web->go('https://test-pages.phpscraper.de/links/image-url.html');
311 | 
312 |         // Check the number of links
313 |         $this->assertSame(3, count($web->links));
314 | 
315 |         // Detailed list: 'image' holds 1..n image URLs (presumably images nested in the link - TODO confirm).
316 |         $this->assertSame([
317 |             [
318 |                 'url' => 'https://placekitten.com/432/500',
319 |                 'protocol' => 'https',
320 |                 'text' => '',
321 |                 'title' => null,
322 |                 'target' => null,
323 |                 'rel' => 'nofollow',
324 |                 'image' => [
325 |                     'https://placekitten.com/432/287',
326 |                 ],
327 |                 'isNofollow' => true,
328 |                 'isUGC' => false,
329 |                 'isSponsored' => false,
330 |                 'isMe' => false,
331 |                 'isNoopener' => false,
332 |                 'isNoreferrer' => false,
333 |             ], [
334 |                 'url' => 'https://placekitten.com/456/500',
335 |                 'protocol' => 'https',
336 |                 'text' => '',
337 |                 'title' => null,
338 |                 'target' => null,
339 |                 'rel' => 'ugc',
340 |                 'image' => [
341 |                     'https://placekitten.com/456/400',
342 |                     'https://placekitten.com/456/300',
343 |                 ],
344 |                 'isNofollow' => false,
345 |                 'isUGC' => true,
346 |                 'isSponsored' => false,
347 |                 'isMe' => false,
348 |                 'isNoopener' => false,
349 |                 'isNoreferrer' => false,
350 |             ], [
351 |                 'url' => 'https://placekitten.com/345/500',
352 |                 'protocol' => 'https',
353 |                 'text' => 'This is image',
354 |                 'title' => null,
355 |                 'target' => null,
356 |                 'rel' => 'nofollow ugc',
357 |                 'image' => [
358 |                     'https://placekitten.com/345/287',
359 |                 ],
360 |                 'isNofollow' => true,
361 |                 'isUGC' => true,
362 |                 'isSponsored' => false,
363 |                 'isMe' => false,
364 |                 'isNoopener' => false,
365 |                 'isNoreferrer' => false,
366 |             ],
367 |         ], $web->linksWithDetails);
368 |     }
369 | 
370 |     /**
371 |      * @test
372 |      */
373 |     public function testInternalLinks()
374 |     {
375 |         // Load the fixture page with a fresh scraper instance.
376 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
377 |         $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');
378 | 
379 |         // Only the link on the same host as the fixture page is expected here.
380 |         $expectedLinks = [
381 |             'https://test-pages.phpscraper.de/assets/cat.jpg',
382 |         ];
383 | 
384 |         self::assertSame($expectedLinks, $scraper->internalLinks);
385 |     }
386 | 
387 |     /**
388 |      * @test
389 |      */
390 |     public function testExternalLinks()
391 |     {
392 |         // Load the fixture page with a fresh scraper instance.
393 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
394 |         $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');
395 | 
396 |         // Both links pointing at a host other than the fixture page's host are
397 |         // expected, including the one on the "with-base-href" host.
398 |         $expectedLinks = [
399 |             'https://placekitten.com/408/287',
400 |             'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
401 |         ];
402 |         $actualLinks = $scraper->externalLinks;
403 | 
404 |         self::assertSame($expectedLinks, $actualLinks);
405 |     }
406 | }
407 | 


--------------------------------------------------------------------------------
/tests/ListsTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class ListsTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function checkCountTest()
11 |     {
12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
13 | 
14 |         /**
15 |          * Navigate to the test page. Markup sketch (item texts as asserted below):
16 |          *
17 |          * <h2>Example 1: Unordered List</h2>
18 |          * <ul>
19 |          *     <li>Unordered list item 1</li>
20 |          *     <li>Unordered list item 2</li>
21 |          *     <li>Unordered list item with <b>HTML</b></li>
22 |          * </ul>
23 |          *
24 |          * <h2>Example 2: Ordered List</h2>
25 |          * <ol>
26 |          *     <li>Ordered list item 1</li>
27 |          *     <li>Ordered list item 2</li>
28 |          *     <li>Ordered list item with <i>HTML</i></li>
29 |          * </ol>
30 |          */
31 |         $web->go('https://test-pages.phpscraper.de/content/lists.html');
32 | 
33 |         // Check all lists are recognized (expected value first, so failure messages read correctly)
34 |         $this->assertSame(2, count($web->lists));
35 |         $this->assertSame(1, count($web->unorderedLists));
36 |         $this->assertSame(1, count($web->orderedLists));
37 | 
38 |         // Check the contents: 'children_plain' is expected to hold the item texts without HTML tags
39 |         $this->assertSame([
40 |             'Ordered list item 1',
41 |             'Ordered list item 2',
42 |             'Ordered list item with HTML',
43 |         ], $web->orderedLists[0]['children_plain']);
44 | 
45 |         $this->assertSame([
46 |             'Unordered list item 1',
47 |             'Unordered list item 2',
48 |             'Unordered list item with HTML',
49 |         ], $web->unorderedLists[0]['children_plain']);
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/tests/MetaAuthorTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaAuthorTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testMissingAuthor()
11 |     {
12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
13 | 
14 |         // Navigate to the test page (the "missing meta" fixture; path has a single /meta/ segment).
15 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // Check the author as not given (null)
18 |         $this->assertNull($web->author);
19 |     }
20 | 
21 |     /**
22 |      * @test
23 |      */
24 |     public function testWithHTMLEntity()
25 |     {
26 |         // Load the fixture page with a fresh scraper instance.
27 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
28 |         $scraper->go('https://test-pages.phpscraper.de/meta/html-entities.html');
29 | 
30 |         // The author is expected with a literal ampersand, not an HTML entity.
31 |         $expectedAuthor = 'Cat & Mouse';
32 |         $actualAuthor = $scraper->author;
33 | 
34 |         // Strict comparison: value and type must both match.
35 |         self::assertSame($expectedAuthor, $actualAuthor);
36 |     }
37 | 
38 |     /**
39 |      * @test
40 |      */
41 |     public function testLoremIpsum()
42 |     {
43 |         // Load the fixture page with a fresh scraper instance.
44 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
45 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
46 | 
47 |         // The author is expected to be exactly 'Lorem ipsum'.
48 |         $expectedAuthor = 'Lorem ipsum';
49 |         $actualAuthor = $scraper->author;
50 | 
51 |         // Strict comparison: value and type must both match.
52 |         self::assertSame($expectedAuthor, $actualAuthor);
53 |     }
54 | 
55 |     /**
56 |      * @test
57 |      */
58 |     public function testGermanUmlaute()
59 |     {
60 |         // Load the fixture page with a fresh scraper instance.
61 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
62 |         $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
63 | 
64 |         // The author is expected with its umlaut unchanged.
65 |         $expectedAuthor = 'Müller';
66 |         $actualAuthor = $scraper->author;
67 | 
68 |         // Strict comparison: value and type must both match.
69 |         self::assertSame($expectedAuthor, $actualAuthor);
70 |     }
71 | 
72 |     /**
73 |      * @test
74 |      */
75 |     public function testChineseCharacters()
76 |     {
77 |         // Load the fixture page with a fresh scraper instance.
78 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
79 |         $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
80 | 
81 |         // The author is expected as the single non-ASCII character, unchanged.
82 |         $expectedAuthor = '貓';
83 |         $actualAuthor = $scraper->author;
84 | 
85 |         // Strict comparison: value and type must both match.
86 |         self::assertSame($expectedAuthor, $actualAuthor);
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/tests/MetaCharsetTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaCharsetTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testMissingCharset()
11 |     {
12 |         // Load the fixture page with a fresh scraper instance.
13 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
14 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
15 | 
16 |         // No charset is expected for this page, so the property must be null.
17 |         $charset = $scraper->charset;
18 |         self::assertNull($charset);
19 |     }
20 | 
21 |     /**
22 |      * @test
23 |      */
24 |     public function testWithCharset()
25 |     {
26 |         // Load the fixture page with a fresh scraper instance.
27 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper();
28 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
29 | 
30 |         // The charset is expected to be exactly 'utf-8'.
31 |         $expectedCharset = 'utf-8';
32 |         $actualCharset = $scraper->charset;
33 | 
34 |         // Strict comparison: value and type must both match.
35 |         self::assertSame($expectedCharset, $actualCharset);
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/tests/MetaContentTypeTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaContentTypeTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * The content type is null when the fixture page provides none.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingContentType()
13 |     {
14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // Nothing to report, so the property must be null.
18 |         $this->assertNull($scraper->contentType);
19 |     }
20 | 
21 |     /**
22 |      * The content type is returned verbatim when the fixture page provides one.
23 |      *
24 |      * @test
25 |      */
26 |     public function testWithContentType()
27 |     {
28 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
29 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
30 | 
31 |         // The full value, including the charset suffix, is expected.
32 |         $expected = 'text/html; charset=utf-8';
33 |         $actual = $scraper->contentType;
34 | 
35 |         $this->assertSame($expected, $actual);
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/tests/MetaCsrfTokenTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaCsrfTokenTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * The CSRF token is null when the fixture page provides none.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingCsrfToken()
13 |     {
14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // Nothing to report, so the property must be null.
18 |         $this->assertNull($scraper->csrfToken);
19 |     }
20 | 
21 |     /**
22 |      * The CSRF token is read from the page when it is present.
23 |      *
24 |      * @test
25 |      */
26 |     public function testWithCsrfToken()
27 |     {
28 |         // The fixture page contains: <meta name="csrf-token" content="token" />
29 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
30 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
31 | 
32 |         // The content attribute of that tag is the expected token.
33 |         $expected = 'token';
34 |         $actual = $scraper->csrfToken;
35 | 
36 |         $this->assertSame($expected, $actual);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/tests/MetaDescriptionTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaDescriptionTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * The description is null when the fixture page provides none.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingDescription()
13 |     {
14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // Nothing to report, so the property must be null.
18 |         $this->assertNull($scraper->description);
19 |     }
20 | 
21 |     /**
22 |      * A description on the HTML-entities fixture page is returned with a plain ampersand.
23 |      *
24 |      * @test
25 |      */
26 |     public function testWithHTMLEntity()
27 |     {
28 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
29 |         $scraper->go('https://test-pages.phpscraper.de/meta/html-entities.html');
30 | 
31 |         // The result must contain the literal "&" character.
32 |         $expected = 'Cat & Mouse';
33 |         $actual = $scraper->description;
34 | 
35 |         $this->assertSame($expected, $actual);
36 |     }
37 | 
38 |     /**
39 |      * A plain ASCII description is returned unchanged.
40 |      *
41 |      * @test
42 |      */
43 |     public function testLoremIpsum()
44 |     {
45 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
46 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
47 | 
48 |         // Strict comparison against the full description text.
49 |         $expected = 'Lorem ipsum dolor etc.';
50 |         $actual = $scraper->description;
51 | 
52 |         $this->assertSame($expected, $actual);
53 |     }
54 | 
55 |     /**
56 |      * A description containing German umlauts is returned unchanged.
57 |      *
58 |      * @test
59 |      */
60 |     public function testGermanUmlaute()
61 |     {
62 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
63 |         $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
64 | 
65 |         // The umlauts must survive scraping unchanged.
66 |         $expected = 'Eine deutsche Beschreibung mit Umlauten: ä ü ö';
67 |         $actual = $scraper->description;
68 | 
69 |         $this->assertSame($expected, $actual);
70 |     }
71 | 
72 |     /**
73 |      * A description containing Chinese characters is returned unchanged.
74 |      *
75 |      * @test
76 |      */
77 |     public function testChineseCharacters()
78 |     {
79 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
80 |         $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
81 | 
82 |         // The multi-byte characters must survive scraping unchanged.
83 |         $expected = 'A description with Chinese Characters: 加油';
84 |         $actual = $scraper->description;
85 | 
86 |         $this->assertSame($expected, $actual);
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/tests/MetaImageTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class MetaImageTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * Calling image() and reading the image property give the same result.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function testCallMethodsAreEqual()
 13 |     {
 14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 15 |         $scraper->go('https://peterthaleikis.com');
 16 | 
 17 |         // Method-style and property-style access must be interchangeable.
 18 |         $this->assertSame($scraper->image(), $scraper->image);
 19 |     }
 20 | 
 21 |     /**
 22 |      * The image is null when the fixture page provides none.
 23 |      *
 24 |      * @test
 25 |      */
 26 |     public function testMissingImage()
 27 |     {
 28 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 29 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
 30 | 
 31 |         // Nothing to report, so the property must be null.
 32 |         $this->assertNull($scraper->image);
 33 |     }
 34 | 
 35 |     /**
 36 |      * An image on the absolute-path fixture page is returned as an absolute URL.
 37 |      *
 38 |      * @test
 39 |      */
 40 |     public function testAbsolutePath()
 41 |     {
 42 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 43 |         $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path.html');
 44 | 
 45 |         // The absolute image URL is expected.
 46 |         $this->assertSame('https://test-pages.phpscraper.de/assets/cat.jpg', $scraper->image);
 47 |     }
 48 | 
 49 |     /**
 50 |      * An image on the relative-path fixture page is returned as an absolute URL.
 51 |      *
 52 |      * @test
 53 |      */
 54 |     public function testRelativePath()
 55 |     {
 56 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 57 |         $scraper->go('https://test-pages.phpscraper.de/meta/image/relative-path.html');
 58 | 
 59 |         // The relative image path should be converted into an absolute URL.
 60 |         $expected = 'https://test-pages.phpscraper.de/assets/cat.jpg';
 61 |         $actual = $scraper->image;
 62 | 
 63 |         $this->assertSame($expected, $actual);
 64 |     }
 65 | 
 66 |     /**
 67 |      * An absolute image URL is returned on the absolute-path-with-base-href fixture page.
 68 |      *
 69 |      * @test
 70 |      */
 71 |     public function testAbsolutePathWithBaseHref()
 72 |     {
 73 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 74 |         $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');
 75 | 
 76 |         // The result still points at the test-pages host.
 77 |         $expected = 'https://test-pages.phpscraper.de/assets/cat.jpg';
 78 |         $actual = $scraper->image;
 79 | 
 80 |         $this->assertSame($expected, $actual);
 81 |     }
 82 | 
 83 |     /**
 84 |      * On the relative-path-with-base-href fixture page the image URL uses the base-href host.
 85 |      *
 86 |      * @test
 87 |      */
 88 |     public function testRelativePathBaseHref()
 89 |     {
 90 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 91 |         $scraper->go('https://test-pages.phpscraper.de/meta/image/relative-path-with-base-href.html');
 92 | 
 93 |         // The expected host differs from the host the page was fetched from.
 94 |         $expected = 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg';
 95 |         $actual = $scraper->image;
 96 | 
 97 |         $this->assertSame($expected, $actual);
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/tests/MetaKeywordsTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class MetaKeywordsTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * A page without keywords yields a null keyword string and an empty keyword list.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function testMissingKeywords()
 13 |     {
 14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 15 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
 16 | 
 17 |         // null if there aren't any keywords set.
 18 |         $this->assertNull($web->keywordString);
 19 |         // The list must be an empty iterable. The dedicated assertions inspect the value itself;
 20 |         // empty($obj->prop) uses isset semantics and can pass without ever reading the value.
 21 |         $this->assertIsIterable($web->keywords);
 22 |         $this->assertEmpty($web->keywords);
 23 |     }
 24 | 
 25 |     /**
 26 |      * Comma-separated keywords without spaces are split into a list.
 27 |      *
 28 |      * @test
 29 |      */
 30 |     public function testNoSpaces()
 31 |     {
 32 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 33 |         $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-no-spaces.html');
 34 | 
 35 |         // The raw string is kept as-is; the list holds the individual keywords.
 36 |         $this->assertSame('one,two,three', $web->keywordString);
 37 |         $this->assertSame(['one', 'two', 'three'], $web->keywords);
 38 |     }
 39 | 
 40 |     /**
 41 |      * Keywords separated by a comma and a single space are split into a list.
 42 |      *
 43 |      * @test
 44 |      */
 45 |     public function testSpaces()
 46 |     {
 47 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 48 |         $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-spaces.html');
 49 | 
 50 |         // The raw string keeps its spaces; the list entries carry none.
 51 |         $this->assertSame('one, two, three', $web->keywordString);
 52 |         $this->assertSame(['one', 'two', 'three'], $web->keywords);
 53 |     }
 54 | 
 55 |     /**
 56 |      * Keywords separated with irregular spacing are split into a list.
 57 |      *
 58 |      * @test
 59 |      */
 60 |     public function testIrregularSpaces()
 61 |     {
 62 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 63 |         $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-irregular-spaces.html');
 64 | 
 65 |         // The raw string keeps its irregular spacing; the list entries carry none.
 66 |         $this->assertSame('one, two,   three', $web->keywordString);
 67 |         $this->assertSame(['one', 'two', 'three'], $web->keywords);
 68 |     }
 69 | 
 70 |     /**
 71 |      * Keywords on the HTML-entities fixture page are returned with a plain ampersand.
 72 |      *
 73 |      * @test
 74 |      */
 75 |     public function testWithHTMLEntity()
 76 |     {
 77 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 78 |         $web->go('https://test-pages.phpscraper.de/meta/html-entities.html');
 79 | 
 80 |         // Check the keywords
 81 |         $this->assertSame(['Cat & Mouse', 'Mouse & Cat'], $web->keywords);
 82 |     }
 83 | 
 84 |     /**
 85 |      * Plain ASCII keywords are returned as a list.
 86 |      *
 87 |      * @test
 88 |      */
 89 |     public function testLoremIpsum()
 90 |     {
 91 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 92 |         $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
 93 | 
 94 |         // Check the keywords
 95 |         $this->assertSame(['Lorem', 'ipsum', 'dolor'], $web->keywords);
 96 |     }
 97 | 
 98 |     /**
 99 |      * Keywords containing German umlauts are returned unchanged.
100 |      *
101 |      * @test
102 |      */
103 |     public function testGermanUmlaute()
104 |     {
105 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
106 |         $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
107 | 
108 |         // Check the keywords
109 |         $this->assertSame(['keywords', 'schlüsselwörter'], $web->keywords);
110 |     }
111 | 
112 |     /**
113 |      * Keywords containing Chinese characters are returned unchanged.
114 |      *
115 |      * @test
116 |      */
117 |     public function testChineseCharacters()
118 |     {
119 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
120 |         $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
121 | 
122 |         // Check the keywords
123 |         $this->assertSame(['加油', '貓'], $web->keywords);
124 |     }
125 | }
126 | 


--------------------------------------------------------------------------------
/tests/MetaViewportTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class MetaViewportTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * A page without a viewport yields a null viewport string and an empty viewport list.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingViewport()
13 |     {
14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // null if there isn't a viewport set.
18 |         $this->assertNull($web->viewportString);
19 |         // The list must be an empty iterable. The dedicated assertions inspect the value itself;
20 |         // empty($obj->prop) uses isset semantics and can pass without ever reading the value.
21 |         $this->assertIsIterable($web->viewport);
22 |         $this->assertEmpty($web->viewport);
23 |     }
24 | 
25 |     /**
26 |      * A viewport definition is available both as the raw string and as a list of its parts.
27 |      *
28 |      * @test
29 |      */
30 |     public function testWithViewport()
31 |     {
32 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
33 |         $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
34 | 
35 |         // The raw string is kept as-is; the list holds one entry per comma-separated part.
36 |         $this->assertSame(
37 |             'width=device-width, initial-scale=1, shrink-to-fit=no, maximum-scale=1, user-scalable=no',
38 |             $web->viewportString
39 |         );
40 |         $this->assertSame(
41 |             ['width=device-width', 'initial-scale=1', 'shrink-to-fit=no', 'maximum-scale=1', 'user-scalable=no'],
42 |             $web->viewport
43 |         );
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/tests/NavigationTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class NavigationTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * Follow the absolute link from test page #1 to test page #2.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function testSurfWithAbsoluteLink()
 13 |     {
 14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 15 | 
 16 |         // Navigate to test page #1 and check the heading to confirm we are on the right page.
 17 |         $web->go('https://test-pages.phpscraper.de/navigation/1.html');
 18 |         $this->assertSame('Page #1', $web->h1[0]);
 19 | 
 20 |         // Navigate to test page #2 using the absolute link.
 21 |         $web->clickLink('2 absolute');
 22 | 
 23 |         // Check heading and URL to see if we actually moved (expected value first, actual second).
 24 |         $this->assertSame('Page #2', $web->h1[0]);
 25 |         $this->assertSame('https://test-pages.phpscraper.de/navigation/2.html', $web->currentUrl);
 26 |     }
 27 | 
 28 |     /**
 29 |      * Follow the relative link from test page #1 to test page #2.
 30 |      *
 31 |      * @test
 32 |      */
 33 |     public function testSurfWithRelativeLink()
 34 |     {
 35 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 36 | 
 37 |         // Navigate to test page #1 and check the heading to confirm we are on the right page.
 38 |         $web->go('https://test-pages.phpscraper.de/navigation/1.html');
 39 |         $this->assertSame('Page #1', $web->h1[0]);
 40 | 
 41 |         // Navigate to test page #2 using the relative link.
 42 |         $web->clickLink('2 relative');
 43 | 
 44 |         // Check heading and URL to see if we actually moved (expected value first, actual second).
 45 |         $this->assertSame('Page #2', $web->h1[0]);
 46 |         $this->assertSame('https://test-pages.phpscraper.de/navigation/2.html', $web->currentUrl);
 47 |     }
 48 | 
 49 |     /**
 50 |      * Test navigation using an anchor text.
 51 |      *
 52 |      * @test
 53 |      */
 54 |     public function testLeavePageByText()
 55 |     {
 56 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 57 | 
 58 |         // Navigate to test page #2.
 59 |         $web->go('https://test-pages.phpscraper.de/navigation/2.html');
 60 | 
 61 |         // Check the heading to confirm we are on the right page.
 62 |         $this->assertSame('Page #2', $web->h1[0]);
 63 | 
 64 |         // Click the link identified by its anchor text.
 65 |         $web->clickLink('external link');
 66 | 
 67 |         // Check the URL we ended up on.
 68 |         $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
 69 |     }
 70 | 
 71 |     /**
 72 |      * Test if we can navigate out using a redirect.
 73 |      *
 74 |      * @test
 75 |      */
 76 |     public function testLeavePageWithRedirect()
 77 |     {
 78 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 79 | 
 80 |         // Navigate to test page #2.
 81 |         $web->go('https://test-pages.phpscraper.de/navigation/2.html');
 82 | 
 83 |         // Check the heading to confirm we are on the right page.
 84 |         $this->assertSame('Page #2', $web->h1[0]);
 85 | 
 86 |         // Click the link identified by its anchor text.
 87 |         $web->clickLink('external link with redirect');
 88 | 
 89 |         // Check the URL we ended up on.
 90 |         $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Test if we can navigate out by passing a URL instead of an anchor text.
 95 |      *
 96 |      * @test
 97 |      */
 98 |     public function testLeavePageByURL()
 99 |     {
100 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
101 | 
102 |         // Navigate to test page #2.
103 |         $web->go('https://test-pages.phpscraper.de/navigation/2.html');
104 | 
105 |         // Check the heading to confirm we are on the right page.
106 |         $this->assertSame('Page #2', $web->h1[0]);
107 | 
108 |         // Click the link identified by its URL.
109 |         $web->clickLink('https://peterthaleikis.com/');
110 | 
111 |         // Check the URL we ended up on.
112 |         $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
113 |     }
114 | 
115 |     /**
116 |      * Test chainability of `clickLink`.
117 |      *
118 |      * @test
119 |      */
120 |     public function testClickLinkChainability()
121 |     {
122 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
123 | 
124 |         // Navigate to a page and click a link by URL in one chained expression.
125 |         $web
126 |             ->go('https://test-pages.phpscraper.de/navigation/2.html')
127 |             ->clickLink('https://peterthaleikis.com/');
128 | 
129 |         // Check that we ended up on the expected `currentUrl`.
130 |         $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
131 |     }
132 | }
133 | 


--------------------------------------------------------------------------------
/tests/NotFoundTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | use PHPUnit\Framework\TestCase;
 6 | 
 7 | class NotFoundTest extends TestCase
 8 | {
 9 |     /**
10 |      * Requesting a page that does not exist still yields a parseable title.
11 |      *
12 |      * @test
13 |      */
14 |     public function testPageMissing()
15 |     {
16 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
17 |         $scraper->go('https://test-pages.phpscraper.de/page-does-not-exist.html');
18 | 
19 |         // The built-in server answers with this title.
20 |         $this->assertSame('Page Not Found', $scraper->title);
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/tests/OpenGraphTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class OpenGraphTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * A page without Open Graph properties yields an empty iterable.
 9 |      *
10 |      * @test
11 |      */
12 |     public function testMissingOpenGraph()
13 |     {
14 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
15 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
16 | 
17 |         // Assert on the value itself; empty($obj->prop) uses isset semantics and can pass vacuously.
18 |         $this->assertIsIterable($web->openGraph);
19 |         $this->assertEmpty($web->openGraph);
20 |     }
21 | 
22 |     /**
23 |      * Open Graph properties are returned as a map keyed by property name.
24 |      *
25 |      * @test
26 |      */
27 |     public function testOpenGraph()
28 |     {
29 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
30 |         $web->go('https://test-pages.phpscraper.de/og/example.html');
31 | 
32 |         // Check individual elements
33 |         $this->assertSame('Lorem Ipsum', $web->openGraph['og:title']);
34 |         $this->assertSame('Lorem ipsum dolor etc.', $web->openGraph['og:description']);
35 | 
36 |         // The whole set; assertSame also pins the key order.
37 |         $this->assertSame(
38 |             [
39 |                 'og:site_name' => 'Lorem ipsum',
40 |                 'og:type' => 'website',
41 |                 'og:title' => 'Lorem Ipsum',
42 |                 'og:description' => 'Lorem ipsum dolor etc.',
43 |                 'og:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html',
44 |                 'og:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
45 |             ],
46 |             $web->openGraph
47 |         );
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/tests/OutlineTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class OutlineTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * The outline lists the headings of the page as tag/content pairs.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function outlineTest()
 13 |     {
 14 |         // The fixture page contains:
 15 |         //   <h1>We are testing here!</h1>
 16 |         //   <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
 17 |         //   <h2>Examples</h2>
 18 |         //   <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
 19 |         //   <h3>Example 1</h3>
 20 |         //   <p>Here would be an example.</p>
 21 |         //   <h3>Example 2</h3>
 22 |         //   <p>Here would be the second example.</p>
 23 |         //   <h3>Example 3</h3>
 24 |         //   <p>Here would be another example.</p>
 25 |         //   <!-- an empty paragraph to check if it gets filtered out correctly -->
 26 |         //   <p></p>
 27 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 28 |         $scraper->go('https://test-pages.phpscraper.de/content/outline.html');
 29 | 
 30 |         // Only the headings are expected, in document order; the paragraphs
 31 |         // (including the empty one) are not part of this outline.
 32 |         $expectedOutline = [
 33 |             [
 34 |                 'tag' => 'h1',
 35 |                 'content' => 'We are testing here!',
 36 |             ],
 37 |             [
 38 |                 'tag' => 'h2',
 39 |                 'content' => 'Examples',
 40 |             ],
 41 |             [
 42 |                 'tag' => 'h3',
 43 |                 'content' => 'Example 1',
 44 |             ],
 45 |             [
 46 |                 'tag' => 'h3',
 47 |                 'content' => 'Example 2',
 48 |             ],
 49 |             [
 50 |                 'tag' => 'h3',
 51 |                 'content' => 'Example 3',
 52 |             ],
 53 |         ];
 54 | 
 55 |         $this->assertSame($expectedOutline, $scraper->outline);
 56 |     }
 57 | 
 58 |     /**
 59 |      * The outline with paragraphs interleaves headings and paragraphs, keeping the empty paragraph.
 60 |      *
 61 |      * @test
 62 |      */
 63 |     public function outlineWithParagraphsTest()
 64 |     {
 65 |         // The fixture page contains:
 66 |         //   <h1>We are testing here!</h1>
 67 |         //   <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
 68 |         //   <h2>Examples</h2>
 69 |         //   <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
 70 |         //   <h3>Example 1</h3>
 71 |         //   <p>Here would be an example.</p>
 72 |         //   <h3>Example 2</h3>
 73 |         //   <p>Here would be the second example.</p>
 74 |         //   <h3>Example 3</h3>
 75 |         //   <p>Here would be another example.</p>
 76 |         //   <!-- an empty paragraph to check if it gets filtered out correctly -->
 77 |         //   <p></p>
 78 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 79 |         $scraper->go('https://test-pages.phpscraper.de/content/outline.html');
 80 | 
 81 |         $expectedOutline = [
 82 |             [
 83 |                 'tag' => 'h1',
 84 |                 'content' => 'We are testing here!',
 85 |             ],
 86 |             [
 87 |                 'tag' => 'p',
 88 |                 'content' => 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
 89 |             ],
 90 |             [
 91 |                 'tag' => 'h2',
 92 |                 'content' => 'Examples',
 93 |             ],
 94 |             [
 95 |                 'tag' => 'p',
 96 |                 'content' => 'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
 97 |             ],
 98 |             [
 99 |                 'tag' => 'h3',
100 |                 'content' => 'Example 1',
101 |             ],
102 |             [
103 |                 'tag' => 'p',
104 |                 'content' => 'Here would be an example.',
105 |             ],
106 |             [
107 |                 'tag' => 'h3',
108 |                 'content' => 'Example 2',
109 |             ],
110 |             [
111 |                 'tag' => 'p',
112 |                 'content' => 'Here would be the second example.',
113 |             ],
114 |             [
115 |                 'tag' => 'h3',
116 |                 'content' => 'Example 3',
117 |             ],
118 |             [
119 |                 'tag' => 'p',
120 |                 'content' => 'Here would be another example.',
121 |             ],
122 |             [
123 |                 'tag' => 'p',
124 |                 'content' => '',
125 |             ],
126 |         ];
127 |         $this->assertSame($expectedOutline, $scraper->outlineWithParagraphs);
128 |     }
129 | }
130 | 


--------------------------------------------------------------------------------
/tests/ParagraphsTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class ParagraphsTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * All paragraphs are returned in document order, including the empty one.
 9 |      *
10 |      * @test
11 |      */
12 |     public function paragraphTest()
13 |     {
14 |         // The fixture page contains:
15 |         //
16 |         //   <h1>We are testing here!</h1>
17 |         //   <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
18 |         //
19 |         //   <h2>Examples</h2>
20 |         //   <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
21 |         //
22 |         //   <h3>Example 1</h3>
23 |         //   <p>Here would be an example.</p>
24 |         //
25 |         //   <h3>Example 2</h3>
26 |         //   <p>Here would be the second example.</p>
27 |         //   <h3>Example 3</h3>
28 |         //   <p>Here would be another example.</p>
29 |         //
30 |         //   <!-- an empty paragraph to check if it gets filtered out correctly -->
31 |         //   <p></p>
32 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
33 |         $scraper->go('https://test-pages.phpscraper.de/content/outline.html');
34 | 
35 |         // The unfiltered list keeps the empty paragraph as an empty string.
36 |         $expectedParagraphs = [
37 |             'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
38 |             'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
39 |             'Here would be an example.',
40 |             'Here would be the second example.',
41 |             'Here would be another example.',
42 |             '',
43 |         ];
44 | 
45 |         $this->assertSame($expectedParagraphs, $scraper->paragraphs);
46 |     }
47 | 
48 |     /**
49 |      * The cleaned paragraphs omit the empty paragraph.
50 |      *
51 |      * @test
52 |      */
53 |     public function cleanParagraphTest()
54 |     {
55 |         // The fixture page contains:
56 |         //
57 |         //   <h1>We are testing here!</h1>
58 |         //   <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
59 |         //
60 |         //   <h2>Examples</h2>
61 |         //   <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
62 |         //
63 |         //   <h3>Example 1</h3>
64 |         //   <p>Here would be an example.</p>
65 |         //
66 |         //   <h3>Example 2</h3>
67 |         //   <p>Here would be the second example.</p>
68 |         //   <h3>Example 3</h3>
69 |         //   <p>Here would be another example.</p>
70 |         //
71 |         //   <!-- an empty paragraph to check if it gets filtered out correctly -->
72 |         //   <p></p>
73 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
74 |         $scraper->go('https://test-pages.phpscraper.de/content/outline.html');
75 | 
76 |         // The cleaned list has five entries: the empty paragraph is gone.
77 |         $expectedParagraphs = [
78 |             'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
79 |             'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
80 |             'Here would be an example.',
81 |             'Here would be the second example.',
82 |             'Here would be another example.',
83 |         ];
84 | 
85 |         $this->assertSame($expectedParagraphs, $scraper->cleanParagraphs);
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/tests/ParserCsvTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class ParserCsvTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testCsvParsingContext()
 11 |     {
 12 |         // Both parsers need context: either a prior navigation (URL context) or
 13 |         // something passed in to (fetch and) parse. Without either they must throw.
 14 |         //
 15 |         // The message is captured and asserted after the try/catch, so the test
 16 |         // fails when no exception is thrown instead of passing without any assertion.
 17 | 
 18 |         $message = null;
 19 |         try {
 20 |             (new \Spekulatius\PHPScraper\PHPScraper)->parseCsv();
 21 |         } catch (\Exception $e) {
 22 |             $message = $e->getMessage();
 23 |         }
 24 |         $this->assertSame(
 25 |             'You can not call parseCsv() without parameter or initial navigation.',
 26 |             $message
 27 |         );
 28 | 
 29 |         $message = null;
 30 |         try {
 31 |             (new \Spekulatius\PHPScraper\PHPScraper)->parseCsvWithHeader();
 32 |         } catch (\Exception $e) {
 33 |             $message = $e->getMessage();
 34 |         }
 35 |         $this->assertSame(
 36 |             'You can not call parseCsvWithHeader() without parameter or initial navigation.',
 37 |             $message
 38 |         );
 39 |     }
 40 | 
 41 |     /**
 42 |      * @test
 43 |      */
 44 |     public function testCsvDecodeRaw()
 45 |     {
 46 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 47 | 
 48 |         // The raw decoder is expected to leave every cell as a string, i.e. no
 49 |         // type casting happens. The inline string and the hosted file carry
 50 |         // the same rows.
 51 |         $expectedRows = [
 52 |             ['date', 'value'],
 53 |             ['1945-02-06', '4.20'],
 54 |             ['1952-03-11', '42'],
 55 |         ];
 56 | 
 57 |         // Decode an inline CSV string.
 58 |         $inlineRows = $scraper->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42");
 59 | 
 60 |         $this->assertSame($expectedRows, $inlineRows);
 61 | 
 62 |         // Fetch the hosted CSV first, then decode its content.
 63 |         $hostedCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
 64 |         $hostedRows = $scraper->csvDecodeRaw($hostedCsv);
 65 | 
 66 |         $this->assertSame($expectedRows, $hostedRows);
 67 |     }
 68 | 
 69 |     /**
 70 |      * @test
 71 |      */
 72 |     public function testCsvDecode()
 73 |     {
 74 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 75 | 
 76 |         // Unlike the raw variant, numeric cells are expected as native
 77 |         // numbers (float/int). The inline string and the hosted file
 78 |         // carry the same rows.
 79 |         $expectedRows = [
 80 |             ['date', 'value'],
 81 |             ['1945-02-06', 4.20],
 82 |             ['1952-03-11', 42],
 83 |         ];
 84 | 
 85 |         // Decode an inline CSV string.
 86 |         $inlineRows = $scraper->csvDecode("date,value\n1945-02-06,4.20\n1952-03-11,42");
 87 | 
 88 |         $this->assertSame($expectedRows, $inlineRows);
 89 | 
 90 |         // Fetch the hosted CSV first, then decode its content.
 91 |         $hostedCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
 92 |         $hostedRows = $scraper->csvDecode($hostedCsv);
 93 | 
 94 |         $this->assertSame($expectedRows, $hostedRows);
 95 |     }
 96 | 
 97 |     /**
 98 |      * Test with pipe as separator, enclosure and escape.
 99 |      *
100 |      * @test
101 |      */
102 |     public function testCsvDecodeAndCustomEncoding()
103 |     {
104 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
105 | 
106 |         // Pipe-separated, double-quote enclosed input ending in a lone backslash row.
107 |         $csv = "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\";
108 | 
109 |         // Custom separator, enclosure and escape are passed after the CSV string.
110 |         $rows = $scraper->csvDecode($csv, '|', '"', '\\');
111 | 
112 |         $expectedRows = [
113 |             ['date', 'value'],
114 |             ['1945-02-06', 4.20],
115 |             ['1952-03-11', 42],
116 |             ['\\'],
117 |         ];
118 | 
119 |         $this->assertSame($expectedRows, $rows);
120 |     }
121 | 
122 |     /**
123 |      * @test
124 |      */
125 |     public function testCsvDecodeWithHeaderRaw()
126 |     {
127 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
128 | 
129 |         // The first CSV line provides the keys of each row. With the raw
130 |         // variant the values are expected to stay strings.
131 |         $expectedRows = [
132 |             ['date' => '1945-02-06', 'value' => '4.20'],
133 |             ['date' => '1952-03-11', 'value' => '42'],
134 |         ];
135 | 
136 |         // Decode an inline CSV string.
137 |         $inlineRows = $scraper->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42");
138 | 
139 |         $this->assertSame($expectedRows, $inlineRows);
140 | 
141 |         // Fetch the hosted CSV first, then decode its content.
142 |         $hostedCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
143 |         $hostedRows = $scraper->csvDecodeWithHeaderRaw($hostedCsv);
144 | 
145 |         $this->assertSame($expectedRows, $hostedRows);
146 |     }
147 | 
148 |     /**
149 |      * @test
150 |      */
151 |     public function testCsvDecodeWithHeaderAndCasting()
152 |     {
153 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
154 | 
155 |         // Header keys plus numeric values as native float/int.
156 |         $expectedRows = [
157 |             ['date' => '1945-02-06', 'value' => 4.20],
158 |             ['date' => '1952-03-11', 'value' => 42],
159 |         ];
160 | 
161 |         $this->assertSame($expectedRows, $scraper->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42"));
162 |     }
163 | 
164 |     /**
165 |      * Test with header, pipe as separator, and enclosure.
166 |      *
167 |      * @test
168 |      */
169 |     public function testCsvDecodeWithHeaderAndCustomEncoding()
170 |     {
171 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
172 | 
173 |         // Pipe-separated input with every cell enclosed in double quotes.
174 |         $csv = "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"";
175 | 
176 |         // Custom separator, enclosure and escape are passed after the CSV string.
177 |         $rows = $scraper->csvDecodeWithHeader($csv, '|', '"', '\\');
178 | 
179 |         // The header line provides the keys; numeric values come back as float/int.
180 |         $expectedRows = [
181 |             ['date' => '1945-02-06', 'value' => 4.20],
182 |             ['date' => '1952-03-11', 'value' => 42],
183 |         ];
184 | 
185 |         $this->assertSame($expectedRows, $rows);
186 |     }
187 | 
188 |     /**
189 |      * Check the plumbing: Test the various ways to call `parseCsv()`.
190 |      *
191 |      * @test
192 |      */
193 |     public function testDifferentCsvCalls()
194 |     {
195 |         // Each case builds its own scraper instance, so no navigation state leaks between the cases.
196 |         //
197 |         // Reference data: a simple CSV string and the rows expected from parsing it. The hosted
198 |         // `test.csv` is expected to match it; `test-custom.csv` is parsed with `|` and `"` below.
199 |         $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
200 |         $csvData = [['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42]];
201 | 
202 |         // Case 1: Passing a CSV string in.
203 |         $this->assertSame(
204 |             // The reference data is the expected result.
205 |             $csvData,
206 | 
207 |             // Parse the $csvString directly.
208 |             (new \Spekulatius\PHPScraper\PHPScraper)
209 |                 ->parseCsv($csvString)
210 |         );
211 | 
212 |         // Case 2: `go` + `parseCsv()`
213 |         $this->assertSame(
214 |             // The reference data is the expected result.
215 |             $csvData,
216 | 
217 |             // Chained call using a CSV file as URL.
218 |             (new \Spekulatius\PHPScraper\PHPScraper)
219 |                 ->go('https://test-pages.phpscraper.de/test.csv')
220 |                 ->parseCsv()
221 |         );
222 | 
223 |         // Case 3: `parseCsv()` with absolute URL.
224 |         $this->assertSame(
225 |             // The reference data is the expected result.
226 |             $csvData,
227 | 
228 |             // Pass the absolute URL to `parseCsv()`.
229 |             (new \Spekulatius\PHPScraper\PHPScraper)
230 |                 ->parseCsv('https://test-pages.phpscraper.de/test.csv')
231 |         );
232 | 
233 |         // Case 4: `go` + `parseCsv()` with relative URL.
234 |         $this->assertSame(
235 |             // The reference data is the expected result.
236 |             $csvData,
237 | 
238 |             // The `go` sets the base URL for the following relative path.
239 |             (new \Spekulatius\PHPScraper\PHPScraper)
240 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
241 |                 ->parseCsv('/test.csv')
242 |         );
243 | 
244 |         // Case 5: `go` with base URL + `go` with relative URL + `parseCsv()`.
245 |         // 5.1. Ensure the final URL is correct.
246 |         $this->assertSame(
247 |             'https://test-pages.phpscraper.de/test.csv',
248 | 
249 |             // The first `go` sets the base URL for the following `go` with relative URL.
250 |             (new \Spekulatius\PHPScraper\PHPScraper)
251 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
252 |                 ->go('/test.csv')
253 |                 ->currentUrl()
254 |         );
255 | 
256 |         // 5.2. Ensure the parsed CSV is correct.
257 |         $this->assertSame(
258 |             // The reference data is the expected result.
259 |             $csvData,
260 | 
261 |             // The first `go` sets the base URL for the following `go` with relative URL.
262 |             (new \Spekulatius\PHPScraper\PHPScraper)
263 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
264 |                 ->go('/test.csv')
265 |                 ->parseCsv()
266 |         );
267 | 
268 |         // Case 6: With encoding params
269 |         $this->assertSame(
270 |             // The reference data is the expected result.
271 |             $csvData,
272 | 
273 |             // Navigate to the custom-encoded CSV via a relative URL, then parse with `|` and `"`.
274 |             (new \Spekulatius\PHPScraper\PHPScraper)
275 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
276 |                 ->go('/test-custom.csv')
277 |                 ->parseCsv(null, '|', '"')
278 |         );
279 | 
280 |         // Case 7: With encoding params and (relative) URL
281 |         $this->assertSame(
282 |             // The reference data is the expected result.
283 |             $csvData,
284 | 
285 |             // The `go` sets the base URL; relative path and encoding params go to `parseCsv()`.
286 |             (new \Spekulatius\PHPScraper\PHPScraper)
287 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
288 |                 ->parseCsv('/test-custom.csv', '|', '"')
289 |         );
290 |     }
291 | 
292 |     /**
293 |      * Check the plumbing: Test the various ways to call `parseCsvWithHeader()`.
294 |      *
295 |      * @test
296 |      */
297 |     public function testDifferentCsvWithHeaderCalls()
298 |     {
299 |         // Each case builds its own scraper instance, so no navigation state leaks between the cases.
300 |         //
301 |         // Reference data: a simple CSV string and the rows expected from parsing it. The hosted
302 |         // `test.csv` is expected to match it; `test-custom.csv` is parsed with `|` and `"` below.
303 |         $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
304 |         $csvData = [
305 |             ['date' => '1945-02-06', 'value' => 4.20],
306 |             ['date' => '1952-03-11', 'value' => 42],
307 |         ];
308 | 
309 |         // Case 1: Passing a CSV string in.
310 |         $this->assertSame(
311 |             // The reference data is the expected result.
312 |             $csvData,
313 | 
314 |             // Parse the $csvString directly.
315 |             (new \Spekulatius\PHPScraper\PHPScraper)
316 |                 ->parseCsvWithHeader($csvString)
317 |         );
318 | 
319 |         // Case 2: `go` + `parseCsvWithHeader()`
320 |         $this->assertSame(
321 |             // The reference data is the expected result.
322 |             $csvData,
323 | 
324 |             // Chained call using a CSV file as URL.
325 |             (new \Spekulatius\PHPScraper\PHPScraper)
326 |                 ->go('https://test-pages.phpscraper.de/test.csv')
327 |                 ->parseCsvWithHeader()
328 |         );
329 | 
330 |         // Case 3: `parseCsvWithHeader()` with absolute URL.
331 |         $this->assertSame(
332 |             // The reference data is the expected result.
333 |             $csvData,
334 | 
335 |             // Pass the absolute URL to `parseCsvWithHeader()`.
336 |             (new \Spekulatius\PHPScraper\PHPScraper)
337 |                 ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv')
338 |         );
339 | 
340 |         // Case 4: `go` + `parseCsvWithHeader()` with relative URL.
341 |         $this->assertSame(
342 |             // The reference data is the expected result.
343 |             $csvData,
344 | 
345 |             // The `go` sets the base URL for the following relative path.
346 |             (new \Spekulatius\PHPScraper\PHPScraper)
347 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
348 |                 ->parseCsvWithHeader('/test.csv')
349 |         );
350 | 
351 |         // Case 5: `go` with base URL + `go` with relative URL + `parseCsvWithHeader()`.
352 |         // 5.1. Ensure the final URL is correct.
353 |         $this->assertSame(
354 |             'https://test-pages.phpscraper.de/test.csv',
355 | 
356 |             // The first `go` sets the base URL for the following `go` with relative URL.
357 |             (new \Spekulatius\PHPScraper\PHPScraper)
358 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
359 |                 ->go('/test.csv')
360 |                 ->currentUrl()
361 |         );
362 | 
363 |         // 5.2. Ensure the parsed CSV is correct.
364 |         $this->assertSame(
365 |             // The reference data is the expected result.
366 |             $csvData,
367 | 
368 |             // The first `go` sets the base URL for the following `go` with relative URL.
369 |             (new \Spekulatius\PHPScraper\PHPScraper)
370 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
371 |                 ->go('/test.csv')
372 |                 ->parseCsvWithHeader()
373 |         );
374 | 
375 |         // Case 6: With encoding params
376 |         $this->assertSame(
377 |             // The reference data is the expected result.
378 |             $csvData,
379 | 
380 |             // Navigate to the custom-encoded CSV via a relative URL, then parse with `|` and `"`.
381 |             (new \Spekulatius\PHPScraper\PHPScraper)
382 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
383 |                 ->go('/test-custom.csv')
384 |                 ->parseCsvWithHeader(null, '|', '"')
385 |         );
386 | 
387 |         // Case 7: With encoding params and (relative) URL
388 |         $this->assertSame(
389 |             // The reference data is the expected result.
390 |             $csvData,
391 | 
392 |             // The `go` sets the base URL; relative path and encoding params go to `parseCsvWithHeader()`.
393 |             (new \Spekulatius\PHPScraper\PHPScraper)
394 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
395 |                 ->parseCsvWithHeader('/test-custom.csv', '|', '"')
396 |         );
397 |     }
408 | }
409 | 


--------------------------------------------------------------------------------
/tests/ParserJsonTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class ParserJsonTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testJsonParsingContext()
 11 |     {
 12 |         // Parsing needs context: either a previous navigation (URL context) or something to
 13 |         // (fetch +) parse. The exception is captured and checked after the `try` block, so
 14 |         // a call that unexpectedly does not throw fails the test instead of passing silently.
 15 |         $caught = null;
 16 |         try {
 17 |             (new \Spekulatius\PHPScraper\PHPScraper)->parseJson();
 18 |         } catch (\Exception $e) {
 19 |             $caught = $e;
 20 |         }
 21 |         $this->assertNotNull($caught, 'parseJson() did not throw without context.');
 22 |         $this->assertSame(
 23 |             'You can not call parseJson() without parameter or initial navigation.',
 24 |             $caught->getMessage()
 25 |         );
 26 |     }
 27 | 
 28 |     /**
 29 |      * Test the various ways to call `parseJson()`.
 30 |      *
 31 |      * @test
 32 |      */
 33 |     public function testDifferentJsonCalls()
 34 |     {
 35 |         // Reference data: fetch the hosted JSON once and decode it with PHP's own decoder.
 36 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 37 | 
 38 |         // JSON_THROW_ON_ERROR: an invalid reference must fail loudly instead of decoding to null.
 39 |         $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json');
 40 |         $jsonData = json_decode($jsonString, true, 512, JSON_THROW_ON_ERROR);
 41 | 
 42 |         // Case 1: Passing a JSON string in.
 43 |         $this->assertSame(
 44 |             // The reference data is the expected result.
 45 |             $jsonData,
 46 | 
 47 |             // Parse the $jsonString directly.
 48 |             (new \Spekulatius\PHPScraper\PHPScraper)
 49 |                 ->parseJson($jsonString)
 50 |         );
 51 | 
 52 |         // Case 2: `go` + `parseJson()`
 53 |         $this->assertSame(
 54 |             // The reference data is the expected result.
 55 |             $jsonData,
 56 | 
 57 |             // Chained call using a JSON file as URL.
 58 |             (new \Spekulatius\PHPScraper\PHPScraper)
 59 |                 ->go('https://test-pages.phpscraper.de/index.json')
 60 |                 ->parseJson()
 61 |         );
 62 | 
 63 |         // Case 3: `parseJson()` with absolute URL.
 64 |         $this->assertSame(
 65 |             // The reference data is the expected result.
 66 |             $jsonData,
 67 | 
 68 |             // Pass the absolute URL to `parseJson()`.
 69 |             (new \Spekulatius\PHPScraper\PHPScraper)
 70 |                 ->parseJson('https://test-pages.phpscraper.de/index.json')
 71 |         );
 72 | 
 73 |         // Case 4: `go` + `parseJson()` with relative URL.
 74 |         $this->assertSame(
 75 |             // The reference data is the expected result.
 76 |             $jsonData,
 77 | 
 78 |             // The `go` sets the base URL for the following relative path.
 79 |             (new \Spekulatius\PHPScraper\PHPScraper)
 80 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
 81 |                 ->parseJson('/index.json')
 82 |         );
 83 | 
 84 |         // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`.
 85 |         // 5.1. Ensure the final URL is correct.
 86 |         $this->assertSame(
 87 |             'https://test-pages.phpscraper.de/index.json',
 88 | 
 89 |             // The first `go` sets the base URL for the following `go` with relative URL.
 90 |             (new \Spekulatius\PHPScraper\PHPScraper)
 91 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
 92 |                 ->go('/index.json')
 93 |                 ->currentUrl()
 94 |         );
 95 | 
 96 |         // 5.2. Ensure the parsed JSON is correct.
 97 |         $this->assertSame(
 98 |             // The reference data is the expected result.
 99 |             $jsonData,
100 | 
101 |             // The first `go` sets the base URL for the following `go` with relative URL.
102 |             (new \Spekulatius\PHPScraper\PHPScraper)
103 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
104 |                 ->go('/index.json')
105 |                 ->parseJson()
106 |         );
107 |     }
108 | }
109 | 


--------------------------------------------------------------------------------
/tests/ParserXmlTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class ParserXmlTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * @test
  9 |      */
 10 |     public function testXmlParsingContext()
 11 |     {
 12 |         // Parsing needs context: either a previous navigation (URL context) or something to
 13 |         // (fetch +) parse. The exception is captured and checked after the `try` block, so
 14 |         // a call that unexpectedly does not throw fails the test instead of passing silently.
 15 |         $caught = null;
 16 |         try {
 17 |             (new \Spekulatius\PHPScraper\PHPScraper)->parseXml();
 18 |         } catch (\Exception $e) {
 19 |             $caught = $e;
 20 |         }
 21 |         $this->assertNotNull($caught, 'parseXml() did not throw without context.');
 22 |         $this->assertSame(
 23 |             'You can not call parseXml() without parameter or initial navigation.',
 24 |             $caught->getMessage()
 25 |         );
 26 |     }
 27 | 
 28 |     /**
 29 |      * @test
 30 |      */
 31 |     public function testDifferentXmlCalls()
 32 |     {
 33 |         // Reference data: fetch the hosted sitemap once, convert it to an array (SimpleXML + JSON round-trip).
 34 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 35 |         $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml');
 36 |         $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA);
 37 |         // `simplexml_load_string()` returns false on invalid XML: a broken reference must fail loudly.
 38 |         $this->assertNotFalse($xml, 'The reference sitemap could not be parsed as XML.');
 39 |         $xmlData = json_decode((string) json_encode($xml), true);
 40 | 
 41 |         // Case 1: Passing an XML string in.
 42 |         $this->assertSame(
 43 |             // The reference data is the expected result.
 44 |             $xmlData,
 45 | 
 46 |             // Parse the XML string directly.
 47 |             (new \Spekulatius\PHPScraper\PHPScraper)
 48 |                 ->parseXml($xmlString)
 49 |         );
 50 | 
 51 |         // Case 2: `go` + `parseXml()`
 52 |         $this->assertSame(
 53 |             // The reference data is the expected result.
 54 |             $xmlData,
 55 | 
 56 |             // Chained call with XML as URL
 57 |             (new \Spekulatius\PHPScraper\PHPScraper)
 58 |                 ->go('https://test-pages.phpscraper.de/sitemap.xml')
 59 |                 ->parseXml()
 60 |         );
 61 | 
 62 |         // Case 3: `parseXml()` with absolute URL.
 63 |         $this->assertSame(
 64 |             // The reference data is the expected result.
 65 |             $xmlData,
 66 | 
 67 |             // Pass the absolute URL to `parseXml()`.
 68 |             (new \Spekulatius\PHPScraper\PHPScraper)
 69 |                 ->parseXml('https://test-pages.phpscraper.de/sitemap.xml')
 70 |         );
 71 | 
 72 |         // Case 4: `go` + `parseXml()` with relative URL.
 73 |         $this->assertSame(
 74 |             // The reference data is the expected result.
 75 |             $xmlData,
 76 | 
 77 |             // The `go` sets the base URL for the following relative path.
 78 |             (new \Spekulatius\PHPScraper\PHPScraper)
 79 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
 80 |                 ->parseXml('/sitemap.xml')
 81 |         );
 82 | 
 83 |         // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`.
 84 |         // 5.1. Ensure the final URL is correct.
 85 |         $this->assertSame(
 86 |             'https://test-pages.phpscraper.de/sitemap.xml',
 87 | 
 88 |             // The first `go` sets the base URL for the following `go` with relative URL.
 89 |             (new \Spekulatius\PHPScraper\PHPScraper)
 90 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
 91 |                 ->go('/sitemap.xml')
 92 |                 ->currentUrl()
 93 |         );
 94 | 
 95 |         // 5.2. Ensure the parsed XML is correct.
 96 |         $this->assertSame(
 97 |             // The reference data is the expected result.
 98 |             $xmlData,
 99 | 
100 |             // The first `go` sets the base URL for the following `go` with relative URL.
101 |             (new \Spekulatius\PHPScraper\PHPScraper)
102 |                 ->go('https://test-pages.phpscraper.de/meta/feeds.html')
103 |                 ->go('/sitemap.xml')
104 |                 ->parseXml()
105 |         );
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/tests/RedirectTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class RedirectTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testRedirect()
11 |     {
12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
13 |         // Navigate to the test page: This redirects to phpscraper.de
14 |         $web->go('https://test-pages.phpscraper.de');
15 | 
16 |         // Expected value first, actual value second: keeps PHPUnit's failure output the right way round.
17 |         $this->assertNotSame(
18 |             'https://test-pages.phpscraper.de/',
19 |             $web->currentUrl
20 |         );
21 |         $this->assertSame(
22 |             'https://phpscraper.de/',
23 |             $web->currentUrl
24 |         );
25 |     }
26 | 
27 |     /**
28 |      * @test
29 |      */
30 |     public function testDisabledRedirect()
31 |     {
32 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
33 | 
34 |         $web->setConfig([
35 |             'follow_redirects' => false,
36 |             'follow_meta_refresh' => false,
37 |             'max_redirects' => -1,
38 |         ]);
39 | 
40 |         // Navigate to the test page: it would redirect to phpscraper.de, but redirects are disabled above.
41 |         $web->go('https://test-pages.phpscraper.de');
42 | 
43 |         $this->assertSame(
44 |             'https://test-pages.phpscraper.de',
45 |             $web->currentUrl,
46 |         );
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/tests/TitleTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | class TitleTest extends \PHPUnit\Framework\TestCase
  6 | {
  7 |     /**
  8 |      * A page where the title is not given: `title` is expected to be null.
  9 |      *
 10 |      * @test
 11 |      */
 12 |     public function testMissingTitle()
 13 |     {
 14 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 15 |         $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');
 16 | 
 17 |         // The title is not given on this page.
 18 |         $this->assertNull($scraper->title);
 19 |     }
 20 | 
 21 |     /**
 22 |      * Title of the HTML entities test page.
 23 |      *
 24 |      * @test
 25 |      */
 26 |     public function testWithHTMLEntity()
 27 |     {
 28 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 29 |         $scraper->go('https://test-pages.phpscraper.de/meta/html-entities.html');
 30 | 
 31 |         // Expected vs. actual title.
 32 |         $expectedTitle = 'Cat & Mouse';
 33 |         $actualTitle = $scraper->title;
 34 | 
 35 |         $this->assertSame($expectedTitle, $actualTitle);
 36 |     }
 37 | 
 38 |     /**
 39 |      * Title of the lorem ipsum test page.
 40 |      *
 41 |      * @test
 42 |      */
 43 |     public function testLoremIpsum()
 44 |     {
 45 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 46 |         $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
 47 | 
 48 |         // Expected vs. actual title.
 49 |         $expectedTitle = 'Lorem Ipsum';
 50 |         $actualTitle = $scraper->title;
 51 | 
 52 |         $this->assertSame($expectedTitle, $actualTitle);
 53 |     }
 54 | 
 55 |     /**
 56 |      * Title containing German umlaute.
 57 |      *
 58 |      * @test
 59 |      */
 60 |     public function testGermanUmlaute()
 61 |     {
 62 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 63 |         $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
 64 | 
 65 |         // Expected vs. actual title.
 66 |         $expectedTitle = 'A page with plenty of German umlaute everywhere (ä ü ö)';
 67 |         $actualTitle = $scraper->title;
 68 | 
 69 |         $this->assertSame($expectedTitle, $actualTitle);
 70 |     }
 71 | 
 72 |     /**
 73 |      * Title containing Chinese characters.
 74 |      *
 75 |      * @test
 76 |      */
 77 |     public function testChineseCharacters()
 78 |     {
 79 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 80 |         $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
 81 | 
 82 |         // Expected vs. actual title.
 83 |         $expectedTitle = 'Page with Chinese Characters all over the place (加油)';
 84 |         $actualTitle = $scraper->title;
 85 | 
 86 |         $this->assertSame($expectedTitle, $actualTitle);
 87 |     }
 88 | 
 89 |     /**
 90 |      * A very long title is expected to come back in full.
 91 |      *
 92 |      * @test
 93 |      */
 94 |     public function testLongTitle()
 95 |     {
 96 |         $scraper = new \Spekulatius\PHPScraper\PHPScraper;
 97 |         $scraper->go('https://test-pages.phpscraper.de/title/long-title.html');
 98 | 
 99 |         // Expected vs. actual title.
100 |         $expectedTitle = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed mollis purus id ex consectetur facilisis. In gravida sodales nisl a consequat. Aenean ipsum sem, congue et rhoncus a, feugiat eget enim. Duis ut malesuada neque. Nam justo est, interdum eu massa in, volutpat vestibulum libero. Mauris a varius mauris, in vulputate ligula. Nulla rhoncus eget purus a sodales. Nulla facilisi. Proin purus purus, sodales non dolor in, lobortis elementum augue. Nulla sagittis, ex eu placerat varius, nulla mi rutrum odio, sit amet lacinia ipsum urna nec massa. Quisque posuere mauris id condimentum viverra.';
101 |         $actualTitle = $scraper->title;
102 | 
103 |         $this->assertSame($expectedTitle, $actualTitle);
104 |     }
105 | }
106 | 


--------------------------------------------------------------------------------
/tests/TwitterCardTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Spekulatius\PHPScraper\Tests;
 4 | 
 5 | class TwitterCardTest extends \PHPUnit\Framework\TestCase
 6 | {
 7 |     /**
 8 |      * @test
 9 |      */
10 |     public function testMissingTwitterCard()
11 |     {
12 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
13 |         $web->go('https://test-pages.phpscraper.de/meta/missing.html');
14 | 
15 |         // No twitter card props are set here, so an empty iterable is expected. The value is read once up
16 |         // front: `empty()` directly on a property resolved via `__get()` would consult `__isset()` instead.
17 |         $twitterCard = $web->twitterCard;
18 |         $this->assertIsIterable($twitterCard);
19 |         $this->assertEmpty($twitterCard);
20 |     }
21 | 
22 |     /**
23 |      * @test
24 |      */
25 |     public function testTwitterCard()
26 |     {
27 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
28 | 
29 |         // Navigate to the test page.
30 |         $web->go('https://test-pages.phpscraper.de/twittercard/example.html');
31 | 
32 |         // Check individual elements first.
33 |         $this->assertSame('summary_large_image', $web->twitterCard['twitter:card']);
34 |         $this->assertSame('Lorem Ipsum', $web->twitterCard['twitter:title']);
35 | 
36 |         // Then the whole set, including the order of the keys.
37 |         $this->assertSame(
38 |             [
39 |                 'twitter:card' => 'summary_large_image',
40 |                 'twitter:title' => 'Lorem Ipsum',
41 |                 'twitter:description' => 'Lorem ipsum dolor etc.',
42 |                 'twitter:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html',
43 |                 'twitter:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
44 |             ],
45 |             $web->twitterCard
46 |         );
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/tests/UrlTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Spekulatius\PHPScraper\Tests;
  4 | 
  5 | /**
  6 |  * Ensure our URL lib, https://github.com/thephpleague/uri, is integrated correctly and works as expected.
  7 |  *
  8 |  * Covers the `currentUrl`, `currentHost` and `currentBaseHost` accessors as well as `makeUrlAbsolute()`.
  9 |  */
 10 | class UrlTest extends \PHPUnit\Framework\TestCase
 11 | {
 12 |     /**
 13 |      * If null is passed to `makeUrlAbsolute`, it should always return null.
 14 |      *
 15 |      * No page is loaded beforehand, so this is checked without any current URL.
 16 |      *
 17 |      * @test
 18 |      */
 19 |     public function testNullPassingThrough(): void
 20 |     {
 21 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 22 | 
 23 |         $this->assertNull($web->makeUrlAbsolute(null));
 24 |     }
 25 | 
 26 |     /**
 27 |      * After navigating, the current URL, its host and its host including the protocol are exposed.
 28 |      *
 29 |      * The method name lacks the `test` prefix, so the `@test` annotation is what registers it with PHPUnit.
 30 |      *
 31 |      * @test
 32 |      */
 33 |     public function validateUriTest(): void
 34 |     {
 35 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 36 | 
 37 |         // We use any URL for this.
 38 |         $web->go('https://test-pages.phpscraper.de/content/lists.html');
 39 | 
 40 |         // Ensure the URL is set correctly.
 41 |         $this->assertSame('https://test-pages.phpscraper.de/content/lists.html', $web->currentUrl);
 42 | 
 43 |         // Ensure the host is parsed correctly.
 44 |         $this->assertSame('test-pages.phpscraper.de', $web->currentHost);
 45 | 
 46 |         // Ensure the host with protocol is parsed correctly.
 47 |         $this->assertSame('https://test-pages.phpscraper.de', $web->currentBaseHost);
 48 |     }
 49 | 
 50 |     /**
 51 |      * A `<base href>` on the page takes precedence over the page's own host for `currentBaseHost`.
 52 |      *
 53 |      * @test
 54 |      */
 55 |     public function testCurrentBaseHostWithBase(): void
 56 |     {
 57 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 58 | 
 59 |         // Navigate to the test page.
 60 |         // Contains: <base href="https://test-pages-with-base-href.phpscraper.de/">
 61 |         $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');
 62 | 
 63 |         // Check the base href being passed through the current base host.
 64 |         $this->assertSame('https://test-pages-with-base-href.phpscraper.de', $web->currentBaseHost);
 65 |     }
 66 | 
 67 |     /**
 68 |      * Basic processing of the URLs: relative paths are resolved against the current URL.
 69 |      *
 70 |      * @test
 71 |      */
 72 |     public function testMakeUrlAbsolute(): void
 73 |     {
 74 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
 75 | 
 76 |         // Navigate to test page: This sets the base URL.
 77 |         $web->go('https://phpscraper.de');
 78 | 
 79 |         // Test variations of paths to be processed: description => [expected, url].
 80 |         $this->assertAbsoluteUrls($web, [
 81 |             'With leading slash' => [
 82 |                 'https://phpscraper.de/index.html',
 83 |                 '/index.html',
 84 |             ],
 85 |             'Without leading slash' => [
 86 |                 'https://phpscraper.de/index.html',
 87 |                 'index.html',
 88 |             ],
 89 |             'Paths are considered' => [
 90 |                 'https://phpscraper.de/test/index.html',
 91 |                 'test/index.html',
 92 |             ],
 93 |             'Absolute URLs are untouched' => [
 94 |                 'https://example.com/index.html',
 95 |                 'https://example.com/index.html',
 96 |             ],
 97 |             'Protocol is considered' => [
 98 |                 'http://example.com/index.html',
 99 |                 'http://example.com/index.html',
100 |             ],
101 |         ]);
102 |     }
103 | 
104 |     /**
105 |      * Relative URLs are resolved against the page's `<base href>` instead of the host it was loaded from.
106 |      *
107 |      * @test
108 |      */
109 |     public function testMakeUrlAbsoluteConsiderBaseHref(): void
110 |     {
111 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
112 | 
113 |         /**
114 |          * Navigate to test page: This sets the base URL.
115 |          *
116 |          * It contains:
117 |          *
118 |          * ```html
119 |          * <base href="https://test-pages-with-base-href.phpscraper.de/">
120 |          * ```
121 |          *
122 |          * While it's located on `test-pages.phpscraper.de`.
123 |          *
124 |          * This page isn't actually used. It's purely to set the context.
125 |          */
126 |         $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');
127 | 
128 |         // Test variations of paths to be processed: description => [expected, url].
129 |         $this->assertAbsoluteUrls($web, [
130 |             'With leading slash' => [
131 |                 'https://test-pages-with-base-href.phpscraper.de/index.html',
132 |                 '/index.html',
133 |             ],
134 |             'Without leading slash' => [
135 |                 'https://test-pages-with-base-href.phpscraper.de/index.html',
136 |                 'index.html',
137 |             ],
138 |             'Paths are considered' => [
139 |                 'https://test-pages-with-base-href.phpscraper.de/test/index.html',
140 |                 'test/index.html',
141 |             ],
142 |             'Absolute URLs are untouched' => [
143 |                 'https://example.com/index.html',
144 |                 'https://example.com/index.html',
145 |             ],
146 |             'Protocol is considered' => [
147 |                 'http://example.com/index.html',
148 |                 'http://example.com/index.html',
149 |             ],
150 |         ]);
151 |     }
152 | 
153 |     /**
154 |      * Test if passed in hosts are considered. It trumps any base-href and current url.
155 |      *
156 |      * @test
157 |      */
158 |     public function testMakeUrlAbsoluteWithBaseHost(): void
159 |     {
160 |         $web = new \Spekulatius\PHPScraper\PHPScraper;
161 | 
162 |         // Navigate to test page: This sets the base URL.
163 |         $web->go('https://phpscraper.de');
164 | 
165 |         // Test variations of paths to be processed: description => [expected, url, base host].
166 |         $this->assertAbsoluteUrls($web, [
167 |             'With leading slash' => [
168 |                 'https://example.com/index.html',
169 |                 '/index.html',
170 |                 'https://example.com',
171 |             ],
172 |             'Without leading slash' => [
173 |                 'https://example.com/index.html',
174 |                 'index.html',
175 |                 'https://example.com',
176 |             ],
177 |             'Paths are considered' => [
178 |                 'https://example.com/test/index.html',
179 |                 'test/index.html',
180 |                 'https://example.com',
181 |             ],
182 |             'Absolute URLs are untouched' => [
183 |                 'https://example.com/index.html',
184 |                 'https://example.com/index.html',
185 |                 'https://example-2.com/test/with/path',
186 |             ],
187 |             'Protocol is considered' => [
188 |                 'http://example.com/index.html',
189 |                 'http://example.com/index.html',
190 |                 'https://example-2.com/test/with/path',
191 |             ],
192 |         ]);
193 |     }
194 | 
195 |     /**
196 |      * Runs `makeUrlAbsolute()` for every case and compares the result with the expectation.
197 |      *
198 |      * The first entry of a case is the expected URL; the remaining entries are passed to
199 |      * `makeUrlAbsolute()` as its arguments, in order. The case's key is used as the failure message.
200 |      *
201 |      * @param  array<string, array<int, string>>  $cases
202 |      */
203 |     private function assertAbsoluteUrls(\Spekulatius\PHPScraper\PHPScraper $web, array $cases): void
204 |     {
205 |         foreach ($cases as $description => $arguments) {
206 |             $expected = array_shift($arguments);
207 | 
208 |             $this->assertSame($expected, $web->makeUrlAbsolute(...$arguments), $description);
209 |         }
210 |     }
211 | }
212 | 


--------------------------------------------------------------------------------