├── .editorconfig ├── .github ├── FUNDING.yml └── workflows │ └── test.yaml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── UPGRADING.md ├── composer.json ├── phpstan-baseline.neon ├── phpstan.neon ├── phpunit.xml.dist ├── pint.json ├── playground.php ├── rector.php ├── src ├── Core.php ├── DataTransferObjects │ └── FeedEntry.php ├── PHPScraper.php ├── UsesBrowserKit.php ├── UsesContent.php ├── UsesFeeds.php ├── UsesFileParsers.php ├── UsesUrls.php └── UsesXPathFilters.php └── tests ├── BaseHrefTest.php ├── CanonicalTest.php ├── CoreTest.php ├── CustomSelectorTest.php ├── DownloadTest.php ├── FeedRssTest.php ├── FeedSearchIndexTest.php ├── FeedSitemapTest.php ├── HeadingTest.php ├── ImageTest.php ├── KeywordTest.php ├── LinkTest.php ├── ListsTest.php ├── MetaAuthorTest.php ├── MetaCharsetTest.php ├── MetaContentTypeTest.php ├── MetaCsrfTokenTest.php ├── MetaDescriptionTest.php ├── MetaImageTest.php ├── MetaKeywordsTest.php ├── MetaViewportTest.php ├── NavigationTest.php ├── NotFoundTest.php ├── OpenGraphTest.php ├── OutlineTest.php ├── ParagraphsTest.php ├── ParserCsvTest.php ├── ParserJsonTest.php ├── ParserXmlTest.php ├── RedirectTest.php ├── TitleTest.php ├── TwitterCardTest.php └── UrlTest.php /.editorconfig: -------------------------------------------------------------------------------- 1 | ; This file is for unifying the coding style for different editors and IDEs. 
2 | ; More information at http://editorconfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | charset = utf-8 8 | indent_size = 4 9 | indent_style = space 10 | end_of_line = lf 11 | insert_final_newline = true 12 | trim_trailing_whitespace = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: spekulatius 2 | custom: https://phpscraper.de/misc/sponsors.html -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | on: [pull_request] 2 | 3 | jobs: 4 | phpunit: 5 | name: PHPUnit 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | php-version: ['8.1', '8.2', '8.3'] 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: shivammathur/setup-php@v2 13 | with: 14 | php-version: ${{ matrix.php-version }} 15 | coverage: none 16 | extensions: intl curl 17 | - run: composer update --no-interaction --no-progress --prefer-dist --ansi 18 | - run: composer test:unit 19 | 20 | phpstan: 21 | name: PHPStan 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Install PHP 28 | uses: shivammathur/setup-php@v2 29 | with: 30 | php-version: '8.1' 31 | coverage: none 32 | env: 33 | COMPOSER_TOKEN: ${{ github.token }} 34 | update: true 35 | 36 | - name: Install dependencies 37 | run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader 38 | 39 | - name: PHPStan tests 40 | run: composer test:types 41 | 42 | rector: 43 | name: Rector 44 | runs-on: ubuntu-latest 45 | steps: 46 | - name: Checkout 47 | uses: actions/checkout@v4 48 | 49 | - name: Install PHP 50 | uses: shivammathur/setup-php@v2 51 | with: 52 | php-version: '8.1' 53 | coverage: none 54 | env: 55 | 
COMPOSER_TOKEN: ${{ github.token }} 56 | update: true 57 | 58 | - name: Install dependencies 59 | run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader 60 | 61 | - name: PHPStan tests 62 | run: composer test:refactor 63 | 64 | pint: 65 | name: Pint 66 | runs-on: ubuntu-latest 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Install PHP 72 | uses: shivammathur/setup-php@v2 73 | with: 74 | php-version: '8.1' 75 | coverage: none 76 | tools: cs2pr 77 | env: 78 | COMPOSER_TOKEN: ${{ github.token }} 79 | update: true 80 | 81 | - name: Install dependencies 82 | run: composer update --prefer-dist --no-interaction --no-progress --optimize-autoloader 83 | 84 | - name: Run Pint 85 | run: composer exec -- pint --test --format=checkstyle | cs2pr 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | .idea 3 | .php_cs.cache 4 | .phpunit.result.cache 5 | composer.lock 6 | yarn-error.log 7 | websites/.yarn/ 8 | websites/.yarnrc.yml 9 | .vscode 10 | .history 11 | .notes 12 | .tmp/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # PHPScraper CHANGELOG 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | Parts regarding the [documentation website](https://phpscraper.de), the [test pages](https://github.com/spekulatius/phpscraper-test-pages) and individual documentation changes are omitted for better readability. 6 | 7 | This project adheres to [Semantic Versioning](http://semver.org/). 
8 | 9 | ## 3.0.0 (2024-04-09) 10 | 11 | - [#204](https://github.com/spekulatius/PHPScraper/pull/204): Upgrading Symfony dependencies to allow ^7.0 12 | - [#201](https://github.com/spekulatius/PHPScraper/pull/201): Pint 13 | - [#200](https://github.com/spekulatius/PHPScraper/pull/200): Upgrade from league/uri 6.x to league/uri 7.x, replacing deprecated function use with new recommended ones 14 | - [#199](https://github.com/spekulatius/PHPScraper/pull/199): Add CI job names 15 | - [#196](https://github.com/spekulatius/PHPScraper/pull/196): Upgrading repo tools 16 | - [#195](https://github.com/spekulatius/PHPScraper/pull/195): Add Pint 17 | - [#194](https://github.com/spekulatius/PHPScraper/pull/194): Fix HTTPClient config 18 | - [#192](https://github.com/spekulatius/PHPScraper/pull/192): Fix a few problems reported by PHPStan 19 | - [#190](https://github.com/spekulatius/PHPScraper/pull/190): Fix typos and a critical error 20 | - [#188](https://github.com/spekulatius/PHPScraper/pull/188): Move phpstan to local temp path to ensure Windows users can run it 21 | 22 | ## 2.0.0 (2023-06-01) 23 | 24 | - [#187](https://github.com/spekulatius/PHPScraper/issues/187): Prepare v2: Improve typing, bringing PHPStan to --level=9. For details check the [UPGRADING guide](https://github.com/spekulatius/PHPScraper/blob/master/UPGRADING.md#from-1x-to-2x). 25 | - [#188](https://github.com/spekulatius/PHPScraper/issues/188): Support PHPStan for Windows Users 26 | - [#185](https://github.com/spekulatius/PHPScraper/issues/185): Adding PHP 8.3 to test pipeline 27 | - [#184](https://github.com/spekulatius/PHPScraper/issues/184): Adding PHPStan GitHub Action. Thank you @nadar! 
28 | - [#183](https://github.com/spekulatius/PHPScraper/issues/183): Switch from Goutte to BrowserKit 29 | - [#182](https://github.com/spekulatius/PHPScraper/issues/182): Drop PHP 7.3 and 7.4 30 | - [#174](https://github.com/spekulatius/PHPScraper/issues/174): Fix local testing 31 | - [#173](https://github.com/spekulatius/PHPScraper/issues/173): Fix README example 32 | - [#171](https://github.com/spekulatius/PHPScraper/issues/171): Various PHPStan improvements 33 | - [#169](https://github.com/spekulatius/PHPScraper/issues/169): Adding `` extraction 34 | 35 | ## 1.0.2 (2022-12-15) 36 | 37 | - [#167](https://github.com/spekulatius/PHPScraper/issues/167): Updating CHANGELOG.md 38 | - [#166](https://github.com/spekulatius/PHPScraper/issues/166): Minor tidy ups in comments 39 | - [#165](https://github.com/spekulatius/PHPScraper/issues/165): Adding PHP 8.2 to test workflow 40 | - [#160](https://github.com/spekulatius/PHPScraper/issues/160): Allow complete interface for HttpClient instead of only one class. 41 | 42 | ## 1.0.1 (2022-12-02) 43 | 44 | - [#156](https://github.com/spekulatius/PHPScraper/issues/156): Tidy up: Make file naming more intuitive and fix comments 45 | - [#154](https://github.com/spekulatius/PHPScraper/issues/154): Expose GoutteClient as an accessible property 46 | 47 | ## 1.0.0 (2022-11-24) 48 | 49 | - [#151](https://github.com/spekulatius/PHPScraper/issues/151): Migrate website into separate repo. 50 | - [#150](https://github.com/spekulatius/PHPScraper/issues/150): Switch namespaces. See [UPGRADING](https://github.com/spekulatius/PHPScraper/blob/master/UPGRADING.md) for more details. 51 | - [#147](https://github.com/spekulatius/PHPScraper/issues/147): Prepare for v1.0 52 | 53 | ## 0.13.0 (2022-11-21) 54 | 55 | - [#146](https://github.com/spekulatius/PHPScraper/issues/146): Implement plain text file/URL parsing. 56 | 57 | ## 0.12.0 (2022-11-10) 58 | 59 | - [#142](https://github.com/spekulatius/PHPScraper/issues/142): Implement feed parsing. 
60 | - [#145](https://github.com/spekulatius/PHPScraper/issues/145): Re-enable previously deactivated tests 61 | 62 | ## 0.11.0 (2022-11-01) 63 | 64 | - [#137](https://github.com/spekulatius/PHPScraper/issues/137): Fix download bug and improve testing 65 | 66 | ## 0.10.0 (2022-11-01) 67 | 68 | - [#136](https://github.com/spekulatius/PHPScraper/issues/136): Expand set of URL-related methods 69 | 70 | ## 0.9.0 (2022-10-28) 71 | 72 | - [#79](https://github.com/spekulatius/PHPScraper/issues/79): Replace URL lib. Sub-domain support dropped. 73 | 74 | ## 0.8.0 (2022-10-27) 75 | 76 | - Maintenance: [Split Core lib](https://github.com/spekulatius/PHPScraper/commit/2ca34caae75e634442daf9c4f886060e41ba8911) for better understandability. 77 | 78 | ## 0.7.0 (2022-10-14) 79 | 80 | - [Generalize Configuration API](https://github.com/spekulatius/PHPScraper/commit/e19baeb19658fbc4846c24eb597876f54c6012a3) for better usability. 81 | - [Proxy Support](https://github.com/spekulatius/PHPScraper/commit/326bdff4430a326bdb08f6af8452f148250c7784) 82 | 83 | ## 0.6.0 (2022-07-14) 84 | 85 | - [#77](https://github.com/spekulatius/PHPScraper/issues/77): Upgrade to allow Symfony 6 86 | 87 | ## 0.5.0 (2022-08-16) 88 | 89 | - Add [`rel`-interpretation](https://github.com/spekulatius/PHPScraper/commit/47d6f8a0f6adf49de31b691b98ea472a4a382b9f) to link methods. 90 | - Add support for BYO-HTML: [`setContent`](https://github.com/spekulatius/PHPScraper/commit/9c50d145f280732e26ecf83c8d2978c07466dfcd). 91 | - Improve typing support 92 | - [Add Lists](https://github.com/spekulatius/PHPScraper/commit/0aac52853ab394d9f38b004e401c5fbec328e017) 93 | 94 | ## 0.4.0 (2022-08-16) 95 | 96 | - Add [keyword scoring](https://github.com/spekulatius/PHPScraper/commit/e91bce24e4b53d9a1ef19b3f1ded97627eb2076e) in. 97 | 98 | ## 0.3.0 (2022-06-20) 99 | 100 | - Add [keyword extraction](https://github.com/spekulatius/PHPScraper/commit/9d20004ead5b9e8350a03fa6fc4de1477b19bd4c) lib in. 
101 | 102 | ## 0.2.0 (2022-06-20) 103 | 104 | - Adding [support for `internalLinks` & `externalLinks`](https://github.com/spekulatius/PHPScraper/commit/193f422f206b7a10586463fff4a7f9dcc9e896f9). 105 | 106 | ## 0.1.0 (2022-05-04) 107 | 108 | - Start testing using PHPUnit. 109 | - Drop keeping own copy of current URL. 110 | - Initial commit with basic functionality. -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | This page contains guidelines for contributing to this project. Please 4 | review these guidelines before submitting any pull requests. 5 | 6 | ## Pull Requests 7 | 8 | The pull request process differs for new features and bugs. Before sending a 9 | pull request for a new feature, you should first create an issue with 10 | `[Proposal]` in the title. The proposal should describe the new feature, as well 11 | as implementation ideas. The proposal will then be reviewed and either approved 12 | or denied. Once a proposal is approved, a pull request may be created 13 | implementing the new feature. Pull requests which do not follow this guideline 14 | will be closed immediately. 15 | 16 | Pull requests for bugs may be sent without creating any proposal issue. If you 17 | believe that you know of a solution for a bug that has been filed on GitHub, 18 | please leave a comment detailing your proposed fix. 19 | 20 | ### Feature Requests 21 | 22 | If you have an idea for a new feature you would like to see added, you may 23 | create an issue on GitHub with `[Request]` in the title. The feature request 24 | will then be reviewed. 25 | 26 | ## Coding Guidelines 27 | 28 | This project follows the PSR-0, PSR-1, and PSR-2 coding standards. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | PHP Scraper: a web utility for PHP 6 | 7 | 8 |

9 | 10 | Unit Tests 11 | 12 | 13 | Total Downloads 14 | 15 | 16 | Latest Version 17 | 18 | 19 | License 20 | 21 |

22 |

23 | For full documentation, visit phpscraper.de. 24 |

25 |

26 | 27 | PHPScraper is a versatile web-utility for PHP. Its primary objective is to streamline the process of extracting information from websites, allowing you to focus on accomplishing tasks without getting caught up in the complexities of selectors, data structure preparation, and conversion. 28 | 29 | Under the hood, it uses 30 | 31 | - [BrowserKit](https://symfony.com/doc/current/components/browser_kit.html) (formerly [Goutte](https://github.com/FriendsOfPHP/Goutte)) to access the web 32 | - [League/URI](https://github.com/thephpleague/uri) to process URLs 33 | - [donatello-za/rake-php-plus](https://github.com/donatello-za/rake-php-plus) to extract and analyze keywords 34 | 35 | See [composer.json](https://github.com/spekulatius/PHPScraper/blob/master/composer.json) for more details. 36 | 37 | 38 | :timer_clock: PHPScraper in 5 Minutes explained 39 | ----------------------------------------------- 40 | 41 | Here are a few impressions of the way the library works. More examples are on the [project website](https://phpscraper.de/examples/scrape-website-title.html). 42 | 43 | ### Basics: Flexible Calling as an Attribute or Method 44 | 45 | All scraping functionality can be accessed either as a function call or a property call. For example, the title can be accessed in two ways: 46 | 47 | ```php 48 | // Prep 49 | $web = new \Spekulatius\PHPScraper\PHPScraper; 50 | $web->go('https://google.com'); 51 | 52 | // Returns "Google" 53 | echo $web->title; 54 | 55 | // Also returns "Google" 56 | echo $web->title(); 57 | ``` 58 | 59 | ### :battery: Batteries included: Meta data, Links, Images, Headings, Content, Keywords, ... 60 | 61 | Many common use cases are covered already. You can find prepared extractors for various HTML tags, including interesting attributes. You can filter and combine these to your needs. 
In some cases there is an option to get a simple or detailed version, here in the case of `linksWithDetails`: 62 | 63 | ```PHP 64 | $web = new \Spekulatius\PHPScraper\PHPScraper; 65 | 66 | // Contains: 67 | // 68 | // 69 | // 70 | // 71 | $web->go('https://test-pages.phpscraper.de/links/image-urls.html'); 72 | 73 | // Get the first link on the page and print the result 74 | print_r($web->linksWithDetails[0]); 75 | // [ 76 | // 'url' => 'https://placekitten.com/456/500', 77 | // 'protocol' => 'https', 78 | // 'text' => '', 79 | // 'title' => null, 80 | // 'target' => null, 81 | // 'rel' => 'ugc', 82 | // 'image' => [ 83 | // 'https://placekitten.com/456/400', 84 | // 'https://placekitten.com/456/300' 85 | // ], 86 | // 'isNofollow' => false, 87 | // 'isUGC' => true, 88 | // 'isSponsored' => false, 89 | // 'isMe' => false, 90 | // 'isNoopener' => false, 91 | // 'isNoreferrer' => false, 92 | // ] 93 | ``` 94 | 95 | If there aren't any matching elements (here links) on the page, an empty array will be returned. If a method normally returns a string it might return `null`. Details such as `follow_redirects`, etc. are optional configuration parameters (see below). 
96 | 97 | Most of the DOM should be covered using these methods: 98 | 99 | - several [meta-tags](https://phpscraper.de/examples/scrape-meta-tags.html) and other [``-information](https://phpscraper.de/examples/scrape-header-tags.html) 100 | - [Social-Media information](https://phpscraper.de/examples/scrape-social-media-meta-tags.html) like Twitter Card and Facebook Open Graph 101 | - Content: [Headings](https://phpscraper.de/examples/headings.html), [Outline](https://phpscraper.de/examples/outline.html), [Texts](https://phpscraper.de/examples/paragraphs.html) and [Lists](https://phpscraper.de/examples/lists.html) 102 | - [Images](https://phpscraper.de/examples/scrape-images.html) 103 | - [Links](https://phpscraper.de/examples/scrape-links.html) 104 | - [Keywords](https://phpscraper.de/examples/extract-keywords.html) 105 | 106 | **A full list of methods with example code can be found on [phpscraper.de](https://phpscraper.de). Further examples are in the [tests](https://github.com/spekulatius/PHPScraper/tree/master/tests).** 107 | 108 | 109 | ### Download Files 110 | 111 | Besides processing the content on the page itself, you can download files using `fetchAsset`: 112 | 113 | ```php 114 | // Absolute URL 115 | $csvString = $web->fetchAsset('https://test-pages.phpscraper.de/test.csv'); 116 | 117 | // Relative URL after navigation 118 | $csvString = $web 119 | ->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html') 120 | ->fetchAsset('/test.csv'); 121 | ``` 122 | 123 | You will only need to write the content into a file or cloud storage. 124 | 125 | 126 | ### Process the RSS feeds, `sitemap.xml`, etc. 127 | 128 | PHPScraper can assist in collecting feeds such as [RSS feeds, `sitemap.xml`-entries and static search indexes](https://phpscraper.de/examples/scrape-feeds.html). This can be useful when deciding on the next page to crawl or building up a list of pages on a website. 
129 | 130 | Here we are processing the sitemap into a set of [`FeedEntry`-DTOs](https://github.com/spekulatius/PHPScraper/blob/master/src/DataTransferObjects/FeedEntry.php): 131 | 132 | ```php 133 | (new \Spekulatius\PHPScraper\PHPScraper) 134 | ->go('https://phpscraper.de') 135 | ->sitemap 136 | 137 | // array(131) { 138 | // [0]=> 139 | // object(Spekulatius\PHPScraper\DataTransferObjects\FeedEntry)#165 (3) { 140 | // ["title"]=> 141 | // string(0) "" 142 | // ["description"]=> 143 | // string(0) "" 144 | // ["link"]=> 145 | // string(22) "https://phpscraper.de/" 146 | // } 147 | // [1]=> 148 | // ... 149 | ``` 150 | 151 | Whenever post-processing is applied, you can fall back to the underlying `*Raw`-methods. 152 | 153 | 154 | ### Process CSV-, XML- and JSON files and URLs 155 | 156 | PHPScraper comes out of the box with file / URL processing methods for CSV-, XML- and JSON: 157 | 158 | - `parseJson` 159 | - `parseXml` 160 | - `parseCsv` 161 | - `parseCsvWithHeader` (generates an asso. array using the first row) 162 | 163 | Each method can process both strings as well as URLs: 164 | 165 | ```php 166 | // Parse JSON into array: 167 | $json = $web->parseJson('[{"title": "PHP Scraper: a web utility for PHP", "url": "https://phpscraper.de"}]'); 168 | // [ 169 | // 'title' => 'PHP Scraper: a web utility for PHP', 170 | // 'url' => 'https://phpscraper.de' 171 | // ] 172 | 173 | // Fetch and parse CSV into a simple array: 174 | $csv = $web->parseCsv('https://test-pages.phpscraper.de/test.csv'); 175 | // [ 176 | // ['date', 'value'], 177 | // ['1945-02-06', 4.20], 178 | // ['1952-03-11', 42], 179 | // ] 180 | 181 | // Fetch and parse CSV with first row as header into an asso. 
array structure: 182 | $csv = $web->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv'); 183 | // [ 184 | // ['date' => '1945-02-06', 'value' => 4.20], 185 | // ['date' => '1952-03-11', 'value' => 42], 186 | // ] 187 | ``` 188 | 189 | Additional CSV parsing parameters such as separator, enclosure and escape are possible. 190 | 191 | 192 | ### There is more! 193 | 194 | There are plenty of examples on the [PHPScraper website](https://phpscraper.de) and in the [tests](https://github.com/spekulatius/PHPScraper/tree/master/tests). 195 | 196 | Check the [`playground.php`](https://github.com/spekulatius/PHPScraper/blob/master/playground.php) if you prefer learning by doing. You can get it up and running with: 197 | 198 | ```bash 199 | $ git clone git@github.com:spekulatius/PHPScraper.git && cd PHPScraper && composer update 200 | ``` 201 | 202 | :muscle: Roadmap 203 | ---------------- 204 | 205 | The future development is organized into [milestones](https://github.com/spekulatius/PHPScraper/milestones?direction=asc&sort=title). Releases follow [semver](https://semver.org/). 206 | 207 | ### v1: [Building the first stable version](https://github.com/spekulatius/PHPScraper/milestone/4?closed=1) 208 | 209 | - Improve documentation and examples. 210 | - Organize code better (move websites into separate repos, etc.) 211 | - Add support for feeds and some typical file types. 212 | 213 | ### v2: Service Upgrade: 214 | 215 | - Switch from Goutte to [Symfony BrowserKit](https://symfony.com/doc/current/components/browser_kit.html). Goutte has been archived. 216 | 217 | ### v3: [Expand the functionality and cover more 'types'](https://github.com/spekulatius/PHPScraper/milestone/5) 218 | 219 | - Expand to parse a wider range of types, elements, embeds, etc. 
220 | - Improve performance with caching and concurrent fetching of assets 221 | - Minor improvements for parsing methods 222 | 223 | ### v4: [Expand to provide more guidance on building custom scrapers on top of PHPScraper](https://github.com/spekulatius/PHPScraper/milestone/6) 224 | 225 | TBC. 226 | 227 | 228 | :heart_eyes: Sponsors 229 | --------------------- 230 | 231 | PHPScraper is sponsored by: 232 | 233 | 234 | 235 | With your support, PHPScraper can become the *PHP swiss army knife for the web*. If you find PHPScraper useful to your work, please consider a [sponsorship](https://github.com/sponsors/spekulatius) or [donation](https://www.buymeacoffee.com/spekulatius). Thank you :muscle: 236 | 237 | 238 | :gear: Configuration (optional) 239 | ------------------------------- 240 | 241 | If needed, you can use the following configuration options: 242 | 243 | ### User Agent 244 | 245 | You can set the browser agent using `setConfig`: 246 | 247 | ```php 248 | $web->setConfig([ 249 | 'agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0' 250 | ]); 251 | ``` 252 | 253 | It defaults to `Mozilla/5.0 (compatible; PHP Scraper/1.x; +https://phpscraper.de)`. 254 | 255 | ### Proxy Support 256 | 257 | You can configure proxy support with `setConfig`: 258 | 259 | ```php 260 | $web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']); 261 | ``` 262 | 263 | ### Timeout 264 | 265 | You can set the `timeout` using `setConfig`: 266 | 267 | ```php 268 | $web->setConfig(['timeout' => 15]); 269 | ``` 270 | 271 | Setting the timeout to zero will disable it. 272 | 273 | ### Disabling SSL 274 | 275 | While not recommended, it might be required to disable SSL checks. You can do so using: 276 | 277 | ```php 278 | $web->setConfig(['disable_ssl' => true]); 279 | ``` 280 | 281 | You can call `setConfig` multiple times. It stores the config and merges it with previous settings. This should be kept in mind in the unlikely use-case when unsetting values. 
282 | 283 | 284 | :rocket: Installation with Composer 285 | ----------------------------------- 286 | 287 | ```bash 288 | composer require spekulatius/phpscraper 289 | ``` 290 | 291 | After the installation, the package will be picked up by the Composer autoloader. If you are using a common PHP application or framework such as Laravel or Symfony you can start scraping now :rocket: 292 | 293 | If not or you are building a standalone-scraper, please include the autoloader in `vendor/` at the top of your file: 294 | 295 | ```php 296 | 2 | 15 | 16 | 17 | 18 | ./tests 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /pint.json: -------------------------------------------------------------------------------- 1 | { 2 | "preset": "laravel", 3 | "rules": { 4 | "simplified_null_return": true, 5 | "braces": false, 6 | "new_with_braces": { 7 | "anonymous_class": false, 8 | "named_class": false 9 | }, 10 | "concat_space": false, 11 | "ordered_traits": false 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /playground.php: -------------------------------------------------------------------------------- 1 | paths([ 12 | __DIR__.'/src', 13 | ]); 14 | 15 | $rectorConfig->rules([ 16 | InlineConstructorDefaultToPropertyRector::class, 17 | ]); 18 | 19 | $rectorConfig->sets([ 20 | // LevelSetList::UP_TO_PHP_82, 21 | // SetList::CODE_QUALITY, 22 | SetList::DEAD_CODE, 23 | SetList::TYPE_DECLARATION, 24 | ]); 25 | }; 26 | -------------------------------------------------------------------------------- /src/Core.php: -------------------------------------------------------------------------------- 1 | $data 26 | **/ 27 | public static function fromArray(array $data): self 28 | { 29 | // Convert to an object and return the instance. 30 | return new self( 31 | $data['title'] ?? '', 32 | $data['description'] ?? 
'', 33 | $data['link'] 34 | ); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/PHPScraper.php: -------------------------------------------------------------------------------- 1 | core = new Core; 38 | 39 | // And set the config. 40 | $this->setConfig($config); 41 | } 42 | 43 | /** 44 | * Sets the config, generates the required Clients and updates the core with the new clients. 45 | * 46 | * @param PHPScraperConfig $config 47 | */ 48 | public function setConfig(array $config = []): self 49 | { 50 | // Define the default values 51 | $defaults = [ 52 | // We assume that we want to follow any redirects, in reason. 53 | 'follow_redirects' => true, 54 | 'follow_meta_refresh' => true, 55 | 'max_redirects' => 5, 56 | 57 | /** 58 | * Agent can be overwritten using: 59 | * 60 | * ```php 61 | * $web->setConfig(['agent' => 'My Agent']); 62 | * ``` 63 | */ 64 | 'agent' => 'Mozilla/5.0 (compatible; PHP Scraper/1.x; +https://phpscraper.de)', 65 | 66 | /** 67 | * Setting the Proxy 68 | * 69 | * ```php 70 | * $web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']); 71 | * ``` 72 | */ 73 | 'proxy' => null, 74 | 75 | /** 76 | * Timeout in seconds. 77 | * 78 | * ```php 79 | * $web->setConfig(['timeout' => 15]); 80 | * ``` 81 | */ 82 | 'timeout' => 10, 83 | 84 | /** 85 | * Disable SSL (not recommended unless really needed). 86 | * 87 | * @var bool 88 | */ 89 | 'disable_ssl' => false, 90 | ]; 91 | 92 | // Add the defaults in 93 | $this->config = array_merge($defaults, $config); 94 | 95 | // Symfony HttpClient 96 | $httpClient = SymfonyHttpClient::create([ 97 | 'proxy' => $this->config['proxy'], 98 | 'timeout' => $this->config['timeout'], 99 | 'verify_host' => ! $this->config['disable_ssl'], 100 | 'verify_peer' => ! $this->config['disable_ssl'], 101 | ]); 102 | 103 | // BrowserKit Client and set some config needed for it. 
104 | $client = new HttpBrowser($httpClient); 105 | $client->followRedirects($this->config['follow_redirects']); 106 | $client->followMetaRefresh($this->config['follow_meta_refresh']); 107 | $client->setMaxRedirects($this->config['max_redirects']); 108 | $client->setServerParameter('HTTP_USER_AGENT', $this->config['agent']); 109 | 110 | // Set the client on the core. 111 | $this->core->setClient($client); 112 | $this->core->setHttpClient($httpClient); 113 | 114 | return $this; 115 | } 116 | 117 | /** 118 | * Catch calls to properties and process them accordingly. 119 | * 120 | * @return mixed 121 | */ 122 | public function __get(string $name) 123 | { 124 | // We are assuming that all calls for properties actually method calls... 125 | return $this->__call($name); 126 | } 127 | 128 | /** 129 | * Catches the method calls and tries to satisfy them. 130 | * 131 | * @param array $arguments 132 | * @return mixed 133 | */ 134 | public function __call(string $name, array $arguments = []) 135 | { 136 | $result = $this->core->$name(...$arguments); 137 | 138 | // Did we get a Core class element? Keep this. 139 | if ($result instanceof Core) { 140 | $this->core = $result; 141 | 142 | return $this; 143 | } 144 | 145 | // Otherwise: just return whatever the core returned. 
146 | return $result; 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/UsesBrowserKit.php: -------------------------------------------------------------------------------- 1 | client = $client; 38 | 39 | return $this; 40 | } 41 | 42 | /** 43 | * Overwrites the httpClient 44 | */ 45 | public function setHttpClient(HttpClientInterface $httpClient): self 46 | { 47 | $this->httpClient = $httpClient; 48 | 49 | return $this; 50 | } 51 | 52 | /** 53 | * Retrieve the client 54 | * 55 | * @return \Symfony\Component\BrowserKit\HttpBrowser $client 56 | */ 57 | public function client(): HttpBrowser 58 | { 59 | return $this->client; 60 | } 61 | 62 | /** 63 | * Any URL-related methods are in `UsesUrls.php`. 64 | **/ 65 | 66 | /** 67 | * Navigates to a new page using an URL. 68 | */ 69 | public function go(string $url): self 70 | { 71 | // Keep it around for internal processing. 72 | $this->currentPage = $this->client->request('GET', $url); 73 | 74 | return $this; 75 | } 76 | 77 | /** 78 | * Allows to set HTML content to process. 79 | * 80 | * This is intended to be used as a work-around, if you already have the DOM. 81 | */ 82 | public function setContent(string $url, string $content): self 83 | { 84 | // Overwrite the current page with a fresh Crawler instance of the content. 85 | $this->currentPage = new Crawler($content, $url); 86 | 87 | return $this; 88 | } 89 | 90 | /** 91 | * Fetch an asset from a given absolute or relative URL 92 | */ 93 | public function fetchAsset(string $url): string 94 | { 95 | return $this 96 | ->httpClient 97 | ->request( 98 | 'GET', 99 | ($this->currentPage === null) ? 
$url : (string) $this->makeUrlAbsolute($url), 100 | ) 101 | ->getContent(); 102 | } 103 | 104 | /** 105 | * Click a link (either with title or url) 106 | * 107 | * @param string $titleOrUrl 108 | */ 109 | public function clickLink($titleOrUrl): self 110 | { 111 | // If the string starts with http just go to it - we assume it's an URL 112 | if (\stripos($titleOrUrl, 'http') === 0) { 113 | // Go to a URL 114 | $this->go($titleOrUrl); 115 | } else { 116 | // Find link based on the title 117 | $link = $this->currentPage->selectLink($titleOrUrl)->link(); 118 | 119 | // Click the link and store the DOMCrawler object 120 | $this->currentPage = $this->client->click($link); 121 | } 122 | 123 | return $this; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/UsesContent.php: -------------------------------------------------------------------------------- 1 | filterFirstText('//title'); 25 | } 26 | 27 | public function charset(): ?string 28 | { 29 | return $this->filterFirstExtractAttribute('//meta[@charset]', ['charset']); 30 | } 31 | 32 | public function contentType(): ?string 33 | { 34 | return $this->filterFirstExtractAttribute('//meta[@http-equiv="Content-type"]', ['content']); 35 | } 36 | 37 | public function canonical(): ?string 38 | { 39 | return $this->filterFirstExtractAttribute('//link[@rel="canonical"]', ['href']); 40 | } 41 | 42 | public function viewportString(): ?string 43 | { 44 | return $this->filterFirstContent('//meta[@name="viewport"]'); 45 | } 46 | 47 | public function viewport(): array 48 | { 49 | return is_null($this->viewportString()) ? 
[] : (array) \preg_split('/,\s*/', $this->viewportString()); 50 | } 51 | 52 | public function csrfToken(): ?string 53 | { 54 | return $this->filterFirstExtractAttribute('//meta[@name="csrf-token"]', ['content']); 55 | } 56 | 57 | public function baseHref(): ?string 58 | { 59 | return $this->filterFirstExtractAttribute('//base', ['href']); 60 | } 61 | 62 | /** 63 | * Get the header collected as an array 64 | * 65 | * @return array{charset: mixed, contentType: mixed, viewport: mixed, canonical: mixed, csrfToken: mixed} 66 | */ 67 | public function headers(): array 68 | { 69 | return [ 70 | 'charset' => $this->charset(), 71 | 'contentType' => $this->contentType(), 72 | 'viewport' => $this->viewport(), 73 | 'canonical' => $this->canonical(), 74 | 'csrfToken' => $this->csrfToken(), 75 | ]; 76 | } 77 | 78 | public function author(): ?string 79 | { 80 | return $this->filterFirstContent('//meta[@name="author"]'); 81 | } 82 | 83 | public function image(): ?string 84 | { 85 | return $this->makeUrlAbsolute($this->filterFirstContent('//meta[@name="image"]')); 86 | } 87 | 88 | public function keywordString(): ?string 89 | { 90 | return $this->filterFirstContent('//meta[@name="keywords"]'); 91 | } 92 | 93 | public function keywords(): array 94 | { 95 | return is_null($this->keywordString()) ? 
[] : (array) \preg_split('/,\s*/', $this->keywordString()); 96 | } 97 | 98 | public function description(): ?string 99 | { 100 | return $this->filterFirstContent('//meta[@name="description"]'); 101 | } 102 | 103 | /** 104 | * Get the meta collected as an array 105 | * 106 | * @return array{author: mixed, image: mixed, keywords: mixed, description: mixed} 107 | */ 108 | public function metaTags(): array 109 | { 110 | return [ 111 | 'author' => $this->author(), 112 | 'image' => $this->image(), 113 | 'keywords' => $this->keywords(), 114 | 'description' => $this->description(), 115 | ]; 116 | } 117 | 118 | /** 119 | * Gets all Twitter-Card attributes (`twitter:`) as an array 120 | * 121 | * @return array 122 | */ 123 | public function twitterCard(): array 124 | { 125 | $data = $this 126 | ->filter('//meta[contains(@name, "twitter:")]') 127 | ->extract(['name', 'content']); 128 | 129 | // Prepare the data 130 | $result = []; 131 | foreach ($data as $set) { 132 | $result[(string) $set[0]] = (string) $set[1]; 133 | } 134 | 135 | return $result; 136 | } 137 | 138 | /** 139 | * Gets any OpenGraph attributes (`og:`) as an array 140 | * 141 | * @return array 142 | */ 143 | public function openGraph(): array 144 | { 145 | $data = $this 146 | ->filter('//meta[contains(@property, "og:")]') 147 | ->extract(['property', 'content']); 148 | 149 | // Prepare the data 150 | $result = []; 151 | foreach ($data as $set) { 152 | $result[(string) $set[0]] = (string) $set[1]; 153 | } 154 | 155 | return $result; 156 | } 157 | 158 | public function h1(): array 159 | { 160 | return $this->filterExtractAttributes('//h1', ['_text']); 161 | } 162 | 163 | public function h2(): array 164 | { 165 | return $this->filterExtractAttributes('//h2', ['_text']); 166 | } 167 | 168 | public function h3(): array 169 | { 170 | return $this->filterExtractAttributes('//h3', ['_text']); 171 | } 172 | 173 | public function h4(): array 174 | { 175 | return $this->filterExtractAttributes('//h4', ['_text']); 176 | } 
/**
 * Text content of every `h5` element on the current page.
 */
public function h5(): array
{
    $attributes = ['_text'];

    return $this->filterExtractAttributes('//h5', $attributes);
}

/**
 * Text content of every `h6` element on the current page.
 */
public function h6(): array
{
    $attributes = ['_text'];

    return $this->filterExtractAttributes('//h6', $attributes);
}

/**
 * Collects the headings of all six levels.
 *
 * The result is a list with six entries: index 0 holds the `h1()` result,
 * index 5 the `h6()` result.
 *
 * @return array
 */
public function headings(): array
{
    $levels = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];

    return array_map(fn (string $level): array => $this->{$level}(), $levels);
}

/**
 * Collects every ordered and unordered list of the current page.
 *
 * Each entry contains:
 *  - `type`: the tag name of the list element,
 *  - `children`: the child nodes of the list element,
 *  - `children_plain`: the text content split on line breaks, trimmed, with
 *    falsy lines (e.g. empty strings) removed and re-indexed.
 */
public function lists(): array
{
    $found = [];

    /** @var \DOMElement $node */
    foreach ($this->currentPage->filter('ol, ul') as $node) {
        $lines = array_map('trim', explode("\n", $node->textContent));

        $found[] = [
            'type' => $node->tagName,
            'children' => $node->childNodes,
            'children_plain' => array_values(array_filter($lines)),
        ];
    }

    return $found;
}

/**
 * Only the `ol` entries of `lists()`, re-indexed.
 *
 * @return array
 **/
public function orderedLists(): array
{
    $ordered = array_filter(
        $this->lists(),
        fn ($entry): bool => $entry['type'] === 'ol'
    );

    return array_values($ordered);
}

/**
 * Only the `ul` entries of `lists()`, re-indexed.
 *
 * @return array
 **/
public function unorderedLists(): array
{
    $unordered = array_filter(
        $this->lists(),
        fn ($entry): bool => $entry['type'] === 'ul'
    );

    return array_values($unordered);
}

/**
 * Trimmed text of every `p` element, including paragraphs that end up empty.
 *
 * @return array
 **/
public function paragraphs(): array
{
    $texts = $this->filterExtractAttributes('//p', ['_text']);

    return array_map('trim', $texts);
}

/**
 * Get the paragraphs of the page excluding empty paragraphs.
*/
public function cleanParagraphs(): array
{
    return array_values(array_filter(
        $this->paragraphs(),
        fn ($paragraph): bool => $paragraph !== ''
    ));
}

/**
 * Parses the heading outline (h1 to h6) of the web-page.
 *
 * Each entry is an array with the keys `tag` (the node name) and `content`
 * (the node text as extracted, not trimmed).
 *
 * @return array
 */
public function outline(): array
{
    $result = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6', ['_name', '_text']);

    foreach ($result as $index => $array) {
        $result[$index] = array_combine(['tag', 'content'], (array) $array);
    }

    return $result;
}

/**
 * Parses the content outline of the web-page: headings (h1 to h6) and paragraphs.
 *
 * Each entry is an array with the keys `tag` (the node name) and `content`
 * (the trimmed node text). Elements without text are kept with an empty `content`.
 *
 * @return array
 */
public function outlineWithParagraphs(): array
{
    $result = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6|//p', ['_name', '_text']);

    foreach ($result as $index => $array) {
        $result[$index] = array_combine(['tag', 'content'], (array) $array);
        $result[$index]['content'] = trim((string) $result[$index]['content']);
    }

    return $result;
}

/**
 * Parses the content outline of the web-page (headings and paragraphs) and
 * leaves out every element that has no text.
 *
 * Each entry is an array with the keys `tag` (the node name) and `content`
 * (the trimmed node text). The result is a re-indexed list.
 */
public function cleanOutlineWithParagraphs(): array
{
    $elementsNameAndText = $this->filterExtractAttributes('//h1|//h2|//h3|//h4|//h5|//h6|//p', ['_name', '_text']);

    $result = [];

    /** @var array $nameAndText */
    foreach ($elementsNameAndText as $nameAndText) {
        // Trim first so whitespace-only elements count as empty, matching `cleanParagraphs()`.
        $content = trim((string) $nameAndText[1]);

        // Element has no text: leave it out entirely instead of keeping the raw `[name, text]` row.
        if ($content === '') {
            continue;
        }

        $result[] = [
            'tag' => $nameAndText[0],
            'content' => $content,
        ];
    }

    return $result;
}

/**
 * Internal method to prepare the content for the keyword analysis
 * done by the rake-based methods calling it.
 *
 * Uses:
 *
 *  - Title
 *  - Headings
 *  - Paragraphs/Content
 *  - Link anchors and Titles
 *  - Alt Texts of Images
 *  - Meta Author, Description and Keywords
 *
 * Entries may be `null` when the related element or attribute is missing.
 *
 * @see https://github.com/Donatello-za/rake-php-plus
 * @see https://phpscraper.de/examples/extract-keywords.html
 * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example
 *
 * @return array
 */
protected function prepContent(): array
{
    // Collect content strings
    $content = array_merge(
        // Website title
        [$this->title()],

        // Paragraphs
        $this->paragraphs(),

        // Various meta tags
        [
            $this->author(),
            $this->description(),
            implode(' ', $this->keywords()),
        ]
    );

    // Add headings. This must append: the array union operator (`+`) only adds
    // keys that are missing on the left side, which drops list entries.
    foreach ($this->headings() as $headings) {
        $content = array_merge($content, array_values($headings));
    }

    // Add link anchors and link titles
    foreach ($this->linksWithDetails() as $link) {
        $content[] = $link['text'];
        $content[] = $link['title'];
    }

    // Add image alt texts
    foreach ($this->imagesWithDetails() as $image) {
        $content[] = $image['alt'];
    }

    return $content;
}

/**
 * Gets a set of keywords based on the rake approach.
371 | * 372 | * Uses: 373 | * 374 | * - Title 375 | * - Headings 376 | * - Paragraphs/Content 377 | * - Link anchors and Titles 378 | * - Alt Texts of Images 379 | * - Meta Title, Description and Keywords 380 | * 381 | * @see https://github.com/Donatello-za/rake-php-plus 382 | * @see https://phpscraper.de/examples/extract-keywords.html 383 | * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example 384 | * 385 | * @param string $locale (default: 'en_US') 386 | */ 387 | public function contentKeywords($locale = 'en_US'): array 388 | { 389 | // Extract the keyword phrases and return a sorted array 390 | return RakePlus::create(implode(' ', $this->prepContent()), $locale) 391 | ->sort('asc') 392 | ->get(); 393 | } 394 | 395 | /** 396 | * Gets a set of keywords with scores based on the rake approach 397 | * 398 | * Uses: 399 | * 400 | * - Title 401 | * - Headings 402 | * - Paragraphs/Content 403 | * - Link anchors and Titles 404 | * - Alt Texts of Images 405 | * - Meta Title, Description and Keywords 406 | * 407 | * @see https://github.com/Donatello-za/rake-php-plus 408 | * @see https://phpscraper.de/examples/extract-keywords.html 409 | * @see https://github.com/spekulatius/phpscraper-keyword-scraping-example 410 | * 411 | * @param string $locale (default: 'en_US') 412 | */ 413 | public function contentKeywordsWithScores($locale = 'en_US'): array 414 | { 415 | // Extract the keyword phrases and return a sorted array 416 | return RakePlus::create(implode(' ', $this->prepContent()), $locale) 417 | ->sortByScore('desc') 418 | ->scores(); 419 | } 420 | 421 | /** 422 | * Get all links on the page as absolute URLs 423 | * 424 | * @see https://github.com/spekulatius/link-scraping-test-beautifulsoup-vs-phpscraper 425 | */ 426 | public function links(): array 427 | { 428 | $links = $this->filter('//a')->links(); 429 | 430 | // Generate a list of all image entries 431 | $result = []; 432 | foreach ($links as $link) { 433 | $result[] = $link->getUri(); 434 | } 435 
| 436 | return $result; 437 | } 438 | 439 | /** 440 | * Get all internal links (same root or sub-domain) on the page as absolute URLs 441 | */ 442 | public function internalLinks(): array 443 | { 444 | // Get the current host - to compare against for internal links 445 | $currentRootDomain = $this->currentHost(); 446 | 447 | // Filter the array 448 | return array_values(array_filter( 449 | $this->links(), 450 | function ($link) use (&$currentRootDomain): bool { 451 | $linkRootDomain = Uri::new($link)->getHost(); 452 | 453 | return $currentRootDomain === $linkRootDomain; 454 | } 455 | )); 456 | } 457 | 458 | /** 459 | * Get all external links on the page as absolute URLs 460 | */ 461 | public function externalLinks(): array 462 | { 463 | // Diff the array 464 | return array_values(array_diff( 465 | $this->links(), 466 | $this->internalLinks() 467 | )); 468 | } 469 | 470 | /** 471 | * Get all links on the page with commonly interesting details 472 | */ 473 | public function linksWithDetails(): array 474 | { 475 | /** @var array<\DOMElement> $links */ 476 | $links = $this->filter('//a'); 477 | 478 | // Generate a list of all image entries 479 | $result = []; 480 | 481 | foreach ($links as $link) { 482 | // Check if the anchor is only an image. If so, wrap it into DomCrawler\Image to get the Uri. 483 | $image = []; 484 | 485 | /** @var \DOMElement $childNode */ 486 | foreach ($link->childNodes as $childNode) { 487 | if ($childNode->nodeName === 'img') { 488 | $image[] = (new DomCrawlerImage($childNode, $this->currentBaseHost()))->getUri(); 489 | } 490 | } 491 | 492 | // Collect commonly interesting attributes and URL 493 | $rel = $link->getAttribute('rel'); 494 | 495 | // Generate the proper uri using the Symfony's link class 496 | $uri = (new DomCrawlerLink($link, $this->currentBaseHost()))->getUri(); 497 | 498 | // Prepare the result set. 499 | $entry = [ 500 | 'url' => $uri, 501 | 'protocol' => str_contains($uri, ':') ? 
explode(':', $uri)[0] : null, 502 | 'text' => trim($link->nodeValue ?? ''), 503 | 'title' => $link->getAttribute('title') === '' ? null : $link->getAttribute('title'), 504 | 'target' => $link->getAttribute('target') === '' ? null : $link->getAttribute('target'), 505 | 'rel' => ($rel === '') ? null : strtolower($rel), 506 | 'image' => $image, 507 | 'isNofollow' => ($rel === '') ? false : str_contains($rel, 'nofollow'), 508 | 'isUGC' => ($rel === '') ? false : str_contains($rel, 'ugc'), 509 | 'isSponsored' => ($rel === '') ? false : str_contains($rel, 'sponsored'), 510 | 'isMe' => ($rel === '') ? false : str_contains($rel, 'me'), 511 | 'isNoopener' => ($rel === '') ? false : str_contains($rel, 'noopener'), 512 | 'isNoreferrer' => ($rel === '') ? false : str_contains($rel, 'noreferrer'), 513 | ]; 514 | 515 | $result[] = $entry; 516 | } 517 | 518 | return $result; 519 | } 520 | 521 | /** 522 | * Get all images on the page with absolute URLs 523 | */ 524 | public function images(): array 525 | { 526 | // Generate a list of all image entries 527 | $result = []; 528 | 529 | $images = $this->filter('//img')->images(); 530 | 531 | /** @var \Symfony\Component\DomCrawler\Image $image */ 532 | foreach ($images as $image) { 533 | $result[] = $image->getUri(); 534 | } 535 | 536 | return $result; 537 | } 538 | 539 | /** 540 | * Get all images on the page with commonly interesting details 541 | */ 542 | public function imagesWithDetails(): array 543 | { 544 | // Generate a list of all image entries 545 | $result = []; 546 | 547 | /** @var array<\DOMElement> $images */ 548 | $images = $this->filter('//img'); 549 | 550 | foreach ($images as $image) { 551 | // Collect the URL and commonly interesting attributes 552 | $result[] = [ 553 | // Re-generate the proper uri using the Symfony's image class 554 | 'url' => (new DomCrawlerImage($image, $this->currentBaseHost()))->getUri(), 555 | 'alt' => $image->getAttribute('alt'), 556 | 'width' => $image->getAttribute('width') === '' ? 
null : $image->getAttribute('width'), 557 | 'height' => $image->getAttribute('height') === '' ? null : $image->getAttribute('height'), 558 | ]; 559 | } 560 | 561 | return $result; 562 | } 563 | } 564 | -------------------------------------------------------------------------------- /src/UsesFeeds.php: -------------------------------------------------------------------------------- 1 | currentBaseHost() . '/sitemap.xml'; 15 | } 16 | 17 | /** 18 | * Resolves the sitemap and returns an array with raw data. 19 | * 20 | * @return array $sitemap 21 | */ 22 | public function sitemapRaw(?string $url = null): array 23 | { 24 | return $this->parseXml($this->fetchAsset($url ?? $this->sitemapUrl())); 25 | } 26 | 27 | /** 28 | * Resolves the sitemap and returns an array of `FeedEntry`-DTOs. 29 | * 30 | * @todo Support for text-only sitemaps, split versions, image-sitemaps, etc.? 31 | * 32 | * @return array $sitemap 33 | */ 34 | public function sitemap(?string $url = null): array 35 | { 36 | return array_map( 37 | // Create the generic DTO for each 38 | fn ($entry): FeedEntry => FeedEntry::fromArray([ 39 | 'title' => '', 40 | 'description' => '', 41 | 'link' => $entry['loc'], 42 | ]), 43 | 44 | // Fetch the sitemap URL, parse it and select the `url` section. 45 | $this->sitemapRaw($url)['url'] 46 | ); 47 | } 48 | 49 | /** 50 | * Returns the usual location (URL) for the static search index. 51 | */ 52 | public function searchIndexUrl(): string 53 | { 54 | return $this->currentBaseHost() . '/index.json'; 55 | } 56 | 57 | /** 58 | * Returns an array of the parsed search index JSON. 59 | * 60 | * @return array $searchIndex 61 | */ 62 | public function searchIndexRaw(?string $url = null): array 63 | { 64 | return $this->parseJson($this->fetchAsset($url ?? $this->searchIndexUrl())); 65 | } 66 | 67 | /** 68 | * Resolves the search index and returns an array of `\Spekulatius\PHPScraper\DataTransferObjects\FeedEntry`. 
69 | * 70 | * @return array $searchIndex 71 | */ 72 | public function searchIndex(?string $url = null): array 73 | { 74 | return array_map( 75 | // Create the generic DTO for each 76 | fn ($entry): FeedEntry => FeedEntry::fromArray([ 77 | 'title' => $entry['title'], 78 | 'description' => $entry['snippet'], 79 | 'link' => $entry['link'], 80 | ]), 81 | 82 | // Fetch the sitemap URL, parse it and select the `url` section. 83 | $this->searchIndexRaw($url) 84 | ); 85 | } 86 | 87 | /** 88 | * Compiles a list of RSS urls based on the -tags on the current page. 89 | * 90 | * @return array 91 | */ 92 | public function rssUrls(): array 93 | { 94 | $urls = $this->filterExtractAttributes('//link[@type="application/rss+xml"]', ['href']); 95 | 96 | return array_map(fn ($url): string => (string) $this->makeUrlAbsolute($url), $urls); 97 | } 98 | 99 | /** 100 | * Fetches a given set of RSS feeds and returns one array with raw data. 101 | * 102 | * @return array $rss 103 | */ 104 | public function rssRaw(?string ...$urls): array 105 | { 106 | return array_map( 107 | fn ($url) => $this->parseXml($this->fetchAsset((string) $url)), 108 | $urls === [] ? $this->rssUrls() : $urls 109 | ); 110 | } 111 | 112 | /** 113 | * Fetches a given set of RSS feeds and returns one array with raw data. 114 | * 115 | * @return array $rss 116 | */ 117 | public function rss(?string ...$urls): array 118 | { 119 | return array_map( 120 | // Create the generic DTO for each 121 | fn ($entry): FeedEntry => FeedEntry::fromArray([ 122 | 'title' => $entry['title'], 123 | 'link' => $entry['link']['@attributes']['href'], 124 | ]), 125 | 126 | // Fetch the rss URLs, parse it and select the `url` section. 127 | $this->rssRaw(...$urls)[0]['entry'] 128 | ); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/UsesFileParsers.php: -------------------------------------------------------------------------------- 1 | str_getcsv($line, $separator ?? ',', $enclosure ?? 
'"', $escape ?? '\\'), 21 | explode("\n", $csvString) 22 | ); 23 | 24 | // While technically 'valid', a single string isn't overly useful and likely not actually a CSV but an URL. 25 | if (count($csv) === 1 && count($csv[0]) === 1) { 26 | throw new \Exception('Does not look CSV-like'); 27 | } 28 | } catch (\Exception $e) { 29 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 30 | } 31 | 32 | return $csv; 33 | } 34 | 35 | /** 36 | * Decode CSV and cast types. 37 | * 38 | * @return array $data 39 | */ 40 | public function csvDecode( 41 | string $csvString, 42 | ?string $separator = null, 43 | ?string $enclosure = null, 44 | ?string $escape = null 45 | ): array { 46 | try { 47 | $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); 48 | 49 | // Cast native and custom types 50 | $csv = array_map( 51 | fn ($line): array => array_map( 52 | fn ($cell) => $this->castType($cell), 53 | $line 54 | ), 55 | $csv 56 | ); 57 | } catch (\Exception $e) { 58 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 59 | } 60 | 61 | return $csv; 62 | } 63 | 64 | /** 65 | * Util to decode a CSV string to asso. array. 66 | * 67 | * @return array $data 68 | */ 69 | public function csvDecodeWithHeaderRaw( 70 | string $csvString, 71 | ?string $separator = null, 72 | ?string $enclosure = null, 73 | ?string $escape = null 74 | ): array { 75 | try { 76 | $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); 77 | 78 | $header = array_shift($csv); 79 | 80 | // Combine the rows with the header entry. 81 | array_walk( 82 | $csv, 83 | function (&$row, $key, $header): void { 84 | $row = array_combine($header, $row); 85 | }, 86 | $header 87 | ); 88 | } catch (\Exception $e) { 89 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 90 | } 91 | 92 | return $csv; 93 | } 94 | 95 | /** 96 | * Decode a CSV string to asso. array and cast types. 
97 | * 98 | * @return array $data 99 | */ 100 | public function csvDecodeWithHeader( 101 | string $csvString, 102 | ?string $separator = null, 103 | ?string $enclosure = null, 104 | ?string $escape = null 105 | ): array { 106 | try { 107 | $csv = $this->csvDecodeWithHeaderRaw($csvString, $separator, $enclosure, $escape); 108 | 109 | // Cast native and custom types 110 | foreach ($csv as $idx => $row) { 111 | foreach ($row as $key => $value) { 112 | $csv[$idx][$key] = $this->castType($value); 113 | } 114 | } 115 | } catch (\Exception $e) { 116 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 117 | } 118 | 119 | return $csv; 120 | } 121 | 122 | /** 123 | * Helper method to cast types 124 | */ 125 | public function castType(string $entry): int|float|string 126 | { 127 | // Looks like an int? 128 | if ($entry == (int) $entry) { 129 | return (int) $entry; 130 | } 131 | 132 | // Looks like a float? 133 | if ($entry == (float) $entry) { 134 | return (float) $entry; 135 | } 136 | 137 | return $entry; 138 | } 139 | 140 | /** 141 | * Parses a given CSV string or fetches the URL and parses it. 142 | * 143 | * @return array $data 144 | */ 145 | public function parseCsv( 146 | ?string $csvStringOrUrl = null, 147 | ?string $separator = null, 148 | ?string $enclosure = null, 149 | ?string $escape = null 150 | ): array { 151 | // Check if we got either a current page or at least a URL string to process 152 | if ($csvStringOrUrl === null && $this->currentPage === null) { 153 | throw new \Exception('You can not call parseCsv() without parameter or initial navigation.'); 154 | } 155 | 156 | try { 157 | // If we have a string, let's try to parse the CSV from this. 158 | if ($csvStringOrUrl !== null) { 159 | // Simple: Try to parse what we have been given 160 | try { 161 | $result = $this->csvDecode($csvStringOrUrl, $separator, $enclosure, $escape); 162 | } catch (\Exception $e) { 163 | // We don't do anything if it fails - likely we have an URL. 
Let's continue below. 164 | } 165 | } 166 | 167 | /** 168 | * We fetch the content and process it, if we haven't got a CSV as a string. 169 | * 170 | * This is a work-around to allow for: 171 | * 172 | * - `$web->parseCsv('https://...')`. 173 | * - `$web->go('...')->parseCsv()`. 174 | */ 175 | $result = $result ?? $this->csvDecode( 176 | // Fetch the resource either using $csvStringOrUrl 177 | $this->fetchAsset( 178 | // Fallback on the current URL, if needed and possible (`go` was used before). 179 | $csvStringOrUrl ?? $this->currentUrl() 180 | ), 181 | $separator, 182 | $enclosure, 183 | $escape 184 | ); 185 | } catch (\Exception $e) { 186 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 187 | } 188 | 189 | return (array) $result; 190 | } 191 | 192 | /** 193 | * Parses a given CSV string into an asso. with headers or fetches the URL and parses it. 194 | * 195 | * @return array $data 196 | */ 197 | public function parseCsvWithHeader( 198 | ?string $csvStringOrUrl = null, 199 | ?string $separator = null, 200 | ?string $enclosure = null, 201 | ?string $escape = null 202 | ): array { 203 | // Check if we got either a current page or at least a URL string to process 204 | if ($csvStringOrUrl === null && $this->currentPage === null) { 205 | throw new \Exception('You can not call parseCsvWithHeader() without parameter or initial navigation.'); 206 | } 207 | 208 | try { 209 | // If we have a string, let's try to parse the CSV from this. 210 | if ($csvStringOrUrl !== null) { 211 | // Simple: Try to parse what we have been given 212 | try { 213 | $result = $this->csvDecodeWithHeader($csvStringOrUrl, $separator, $enclosure, $escape); 214 | } catch (\Exception $e) { 215 | // We don't do anything if it fails - likely we have an URL. Let's continue below. 216 | } 217 | } 218 | 219 | /** 220 | * We fetch the content and process it, if we haven't got a CSV as a string. 
221 | * 222 | * This is a work-around to allow for: 223 | * 224 | * - `$web->parseCsvWithHeader('https://...')`. 225 | * - `$web->go('...')->parseCsvWithHeader()`. 226 | */ 227 | $result = $result ?? $this->csvDecodeWithHeader( 228 | // Fetch the resource either using $csvStringOrUrl 229 | $this->fetchAsset( 230 | // Fallback on the current URL, if needed and possible (`go` was used before). 231 | $csvStringOrUrl ?? $this->currentUrl() 232 | ), 233 | $separator, 234 | $enclosure, 235 | $escape 236 | ); 237 | } catch (\Exception $e) { 238 | throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); 239 | } 240 | 241 | return (array) $result; 242 | } 243 | 244 | /** 245 | * Parses a given JSON string or fetches the URL and parses it. 246 | * 247 | * @return array $data 248 | */ 249 | public function parseJson(?string $jsonStringOrUrl = null): array 250 | { 251 | // Check if we got either a current page or at least a URL string to process 252 | if ($jsonStringOrUrl === null && $this->currentPage === null) { 253 | throw new \Exception('You can not call parseJson() without parameter or initial navigation.'); 254 | } 255 | 256 | try { 257 | // If we have a string, let's try to parse the JSON from this. 258 | if ($jsonStringOrUrl !== null) { 259 | // Simple: Try to parse what we have been given 260 | try { 261 | $result = json_decode($jsonStringOrUrl, true, 512, JSON_THROW_ON_ERROR); 262 | } catch (\Exception $e) { 263 | // We don't do anything if it fails - likely we have an URL. Let's continue below. 264 | } 265 | } 266 | 267 | /** 268 | * We fetch the content and process it, if we haven't got a JSON as a string. 269 | * 270 | * This is a work-around to allow for: 271 | * 272 | * - `$web->parseJson('https://...')`. 273 | * - `$web->go('...')->parseJson()`. 274 | */ 275 | $result = $result ?? 
json_decode( 276 | // Fetch the resource either using $jsonStringOrUrl 277 | $this->fetchAsset( 278 | // Fallback on the current URL, if needed and possible (`go` was used before). 279 | $jsonStringOrUrl ?? $this->currentUrl() 280 | ), 281 | true, 282 | 512, 283 | JSON_THROW_ON_ERROR 284 | ); 285 | } catch (\Exception $e) { 286 | throw new \Exception('Failed to parse JSON: ' . $e->getMessage()); 287 | } 288 | 289 | return (array) $result; 290 | } 291 | 292 | /** 293 | * Parses a given XML string or fetches the URL and parses it. 294 | * 295 | * @return array $data 296 | */ 297 | public function parseXml(?string $xmlStringOrUrl = null): array 298 | { 299 | // Check if we got either a current page or at least a URL string to process 300 | if ($xmlStringOrUrl === null && $this->currentPage === null) { 301 | throw new \Exception('You can not call parseXml() without parameter or initial navigation.'); 302 | } 303 | 304 | try { 305 | // Try to parse the XML. If it works we have got an XML string. 306 | if ($xmlStringOrUrl !== null) { 307 | try { 308 | $result = $this->xmlDecode($xmlStringOrUrl); 309 | } catch (\Exception $e) { 310 | // Do nothing, we just want to try it if it works. 311 | } 312 | } 313 | 314 | /** 315 | * We fetch the content and process it, if we haven't got a XML as a string. 316 | * 317 | * This is a work-around to allow for: 318 | * 319 | * - `$web->parseXml('https://...')`. 320 | * - `$web->go('...')->parseXml()`. 321 | */ 322 | $result = $result ?? $this->xmlDecode($this->fetchAsset( 323 | $xmlStringOrUrl ?? $this->currentUrl() 324 | )); 325 | } catch (\Exception $e) { 326 | throw new \Exception('Failed to parse XML: ' . 
$e->getMessage()); 327 | } 328 | 329 | return $result; 330 | } 331 | 332 | protected function xmlDecode(string $xmlString): array 333 | { 334 | // XML parser 335 | $xml = simplexml_load_string(trim($xmlString), 'SimpleXMLElement', LIBXML_NOCDATA); 336 | 337 | // Convert XML to JSON and then to an associative array 338 | return (array) json_decode(json_encode($xml, JSON_THROW_ON_ERROR), true, 512, JSON_THROW_ON_ERROR); 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /src/UsesUrls.php: -------------------------------------------------------------------------------- 1 | currentPage === null) { 22 | throw new \Exception('You can not access the URL before your first navigation using `go`.'); 23 | } 24 | 25 | return (string) $this->currentPage->getUri(); 26 | } 27 | 28 | /** 29 | * Returns the current host 30 | * 31 | * @return string|null $host 32 | */ 33 | public function currentHost(): ?string 34 | { 35 | return Uri::new($this->currentUrl())->getHost(); 36 | } 37 | 38 | /** 39 | * Returns the current host as defined in `` or the current host. 40 | * 41 | * @return string $baseUrl 42 | */ 43 | public function currentBaseHost(): string 44 | { 45 | $uri = Uri::new($this->baseHref() ?? $this->currentUrl()); 46 | 47 | return $uri->getScheme() . '://' . $uri->getHost(); 48 | } 49 | 50 | /** 51 | * Converts a current URL to be absolute based on or current page. 52 | * 53 | * @return ?string $absoluteUrl 54 | */ 55 | public function makeUrlAbsolute(?string $url = null, ?string $baseUrl = null): ?string 56 | { 57 | // Allow to pass null through 58 | if ($url === null || $this->currentPage === null) { 59 | return null; 60 | } 61 | 62 | // Resolve the Url using one of the provided/set base href. 63 | return (string) UriResolver::resolve( 64 | Http::new($url), 65 | Http::new($baseUrl ?? $this->baseHref() ?? 
$this->currentBaseHost()), 66 | ); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/UsesXPathFilters.php: -------------------------------------------------------------------------------- 1 | currentPage->filterXPath($query); 15 | } 16 | 17 | /** 18 | * Filters the current page by a xPath-query and returns the first one, or null. 19 | */ 20 | public function filterFirst(string $query): ?Crawler 21 | { 22 | $filteredNodes = $this->filter($query); 23 | 24 | return ($filteredNodes->count() === 0) ? null : $filteredNodes->first(); 25 | } 26 | 27 | /** 28 | * Filters the current page by a xPath-query and returns the first ones content, or null. 29 | */ 30 | public function filterFirstText(string $query): ?string 31 | { 32 | $filteredNodes = $this->filter($query); 33 | 34 | return ($filteredNodes->count() === 0) ? null : $filteredNodes->first()->text(); 35 | } 36 | 37 | /** 38 | * Filters the current page by a xPath-query and returns the textual content as array. 39 | * 40 | * @return array 41 | */ 42 | public function filterTexts(string $query): array 43 | { 44 | return $this->filterExtractAttributes($query, ['_text']); 45 | } 46 | 47 | /** 48 | * Filters the current page by a xPath-query and returns the selected attributes as array. 49 | * 50 | * @param array $attributes 51 | * @return array 52 | */ 53 | public function filterExtractAttributes(string $query, array $attributes): array 54 | { 55 | $filteredNodes = $this->filter($query); 56 | 57 | return ($filteredNodes->count() === 0) ? [] : $filteredNodes->extract($attributes); 58 | } 59 | 60 | /** 61 | * Filters the current page by a xPath-query and returns the selected attributes of the first match. 62 | * 63 | * @param array $attributes 64 | */ 65 | public function filterFirstExtractAttribute(string $query, array $attributes): ?string 66 | { 67 | $filteredNodes = $this->filter($query); 68 | 69 | return ($filteredNodes->count() === 0) ? 
null : $filteredNodes->first()->extract($attributes)[0]; 70 | } 71 | 72 | /** 73 | * Returns the content attribute for the first result of the query, or null. 74 | */ 75 | public function filterFirstContent(string $query): ?string 76 | { 77 | return $this->filterFirstExtractAttribute($query, ['content']); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /tests/BaseHrefTest.php: -------------------------------------------------------------------------------- 1 | `-extraction. 7 | * 8 | * If you are looking for any URL-related tests check `UrlTest.php`. 9 | */ 10 | class BaseHrefTest extends \PHPUnit\Framework\TestCase 11 | { 12 | /** 13 | * @test 14 | */ 15 | public function testMissingBaseHref() 16 | { 17 | $web = new \Spekulatius\PHPScraper\PHPScraper; 18 | 19 | // Navigate to the test page. 20 | $web->go('https://test-pages.phpscraper.de/meta/missing.html'); 21 | 22 | // Check the baseHref as not given (null) 23 | $this->assertNull($web->baseHref); 24 | } 25 | 26 | /** 27 | * @test 28 | */ 29 | public function testBaseHref() 30 | { 31 | $web = new \Spekulatius\PHPScraper\PHPScraper; 32 | 33 | // Navigate to the test page. 34 | // Contains: 35 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html'); 36 | 37 | // Check the baseHref 38 | $this->assertSame( 39 | 'https://test-pages-with-base-href.phpscraper.de/', 40 | $web->baseHref 41 | ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tests/CanonicalTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // null if there isn't a canonical set. 18 | $this->assertNull($web->canonical); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithCanonical() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 
29 | // It contains: 30 | $web->go('https://test-pages.phpscraper.de/navigation/1.html'); 31 | 32 | // Check the canonical 33 | $this->assertSame( 34 | 'https://test-pages.phpscraper.de/navigation/2.html', 35 | $web->canonical 36 | ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/CoreTest.php: -------------------------------------------------------------------------------- 1 | go('https://phpscraper.de'); 16 | 17 | // Both the method call as well as property call should return the same... 18 | $this->assertSame($web->title, $web->title()); 19 | } 20 | 21 | /** 22 | * Test if our local variable is updated correctly. 23 | * 24 | * @test 25 | */ 26 | public function testChangeOfCurrentPage() 27 | { 28 | $web = new \Spekulatius\PHPScraper\PHPScraper; 29 | 30 | // 1. Navigate to test page 31 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 32 | 33 | // Both the method call as well as property call should return the same... 34 | $this->assertSame( 35 | 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html', 36 | $web->currentUrl 37 | ); 38 | $this->assertSame( 39 | 'Lorem Ipsum', 40 | $web->title 41 | ); 42 | 43 | // 2. Leave the current page and head on to the next one. 44 | $web->go('https://phpscraper.de'); 45 | 46 | // We should have navigated. 47 | $this->assertSame( 48 | 'https://phpscraper.de', 49 | $web->currentUrl 50 | ); 51 | 52 | // Shouldn't match, because we surfed on... 53 | $this->assertNotSame( 54 | 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html', 55 | $web->currentUrl 56 | ); 57 | $this->assertNotSame( 58 | 'Lorem Ipsum', 59 | $web->title 60 | ); 61 | } 62 | 63 | /** 64 | * Calls should be chainable and easy to access. 65 | * 66 | * @test 67 | */ 68 | public function testBasicChainability() 69 | { 70 | // Testing env: First h1: "We are testing here & elsewhere!" 
71 | $url = 'https://test-pages.phpscraper.de/meta/html-entities.html'; 72 | 73 | // Test 1: Create, navigate to the test page. 74 | $web = new \Spekulatius\PHPScraper\PHPScraper; 75 | $web->go($url); 76 | 77 | // Check the h1 78 | $this->assertSame( 79 | 'We are testing here & elsewhere!', 80 | $web->h1[0] 81 | ); 82 | 83 | // Test 2: Chained 84 | $this->assertSame( 85 | 'We are testing here & elsewhere!', 86 | 87 | // Chained 88 | (new \Spekulatius\PHPScraper\PHPScraper) 89 | ->go($url) 90 | ->h1[0] 91 | ); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /tests/CustomSelectorTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/content/selectors.html'); 15 | 16 | // Ensure we got the test page. 17 | $this->assertSame( 18 | 'Selector Tests', 19 | $web->title 20 | ); 21 | 22 | // Trigger failing test. 23 | try { 24 | $web->filterFirstText("//[@id='by-id']"); 25 | } catch (\Exception $e) { 26 | $this->assertSame( 27 | 'DOMXPath::query(): Invalid expression', 28 | $e->getMessage() 29 | ); 30 | } 31 | } 32 | 33 | /** 34 | * @test 35 | */ 36 | public function testSelectionBasedOnId() 37 | { 38 | // Navigate to test page 39 | $web = new \Spekulatius\PHPScraper\PHPScraper; 40 | $web->go('https://test-pages.phpscraper.de/content/selectors.html'); 41 | 42 | // Ensure we got the test page. 43 | $this->assertSame( 44 | 'Selector Tests', 45 | $web->title 46 | ); 47 | 48 | // Select content using `->text()` 49 | $this->assertSame( 50 | 'Content by ID', 51 | $web->filterFirstText("//*[@id='by-id']") 52 | ); 53 | } 54 | 55 | /** 56 | * @test 57 | */ 58 | public function testSelectionBasedOnTag() 59 | { 60 | // Navigate to test page 61 | $web = new \Spekulatius\PHPScraper\PHPScraper; 62 | $web->go('https://test-pages.phpscraper.de/content/selectors.html'); 63 | 64 | // Ensure we got the test page. 
65 | $this->assertSame( 66 | 'Selector Tests', 67 | $web->title 68 | ); 69 | 70 | // Select single string using first and chain `->text()` 71 | $this->assertSame( 72 | 'Selector Tests (h1)', 73 | $web->filterFirst('//h1')->text() 74 | ); 75 | 76 | // Select as array using `filterTexts`: 77 | $this->assertSame( 78 | ['Selector Tests (h1)'], 79 | $web->filterTexts('//h1') 80 | ); 81 | } 82 | 83 | /** 84 | * @test 85 | */ 86 | public function testSelectionBasedOnClass() 87 | { 88 | // Navigate to test page 89 | $web = new \Spekulatius\PHPScraper\PHPScraper; 90 | $web->go('https://test-pages.phpscraper.de/content/selectors.html'); 91 | 92 | // Ensure we got the test page. 93 | $this->assertSame( 94 | 'Selector Tests', 95 | $web->title 96 | ); 97 | 98 | // Select without `->text()` and using the filterTexts-method instead. 99 | $this->assertSame( 100 | ['Content by Class 1', 'Content by Class 2'], 101 | $web->filterTexts("//*[@class='by-class']") 102 | ); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /tests/DownloadTest.php: -------------------------------------------------------------------------------- 1 | expectException(\Symfony\Component\HttpClient\Exception\ClientException::class); 15 | $this->expectExceptionMessage('HTTP/2 404 returned for "https://phpscraper.de/broken-url"'); 16 | 17 | $web->fetchAsset('https://phpscraper.de/broken-url'); 18 | } 19 | 20 | /** 21 | * @test 22 | */ 23 | public function testDownload() 24 | { 25 | // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). 
26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | $xmlString = $web->fetchAsset('https://phpscraper.de/sitemap.xml'); 28 | 29 | // Convert XML to array 30 | // Credit: https://stackoverflow.com/a/20431742 31 | $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); 32 | $array = json_decode((string) json_encode($xml), true); 33 | 34 | $urls = array_map( 35 | fn ($url) => $url['loc'], 36 | $array['url'] 37 | ); 38 | 39 | $this->assertContains( 40 | 'https://phpscraper.de/', 41 | $urls 42 | ); 43 | } 44 | 45 | /** 46 | * We should support both absolute and relative URLs. 47 | * 48 | * Here we use the sitemap test page as a reference. 49 | * 50 | * @test 51 | */ 52 | public function testDifferentUrlTypes() 53 | { 54 | $web = new \Spekulatius\PHPScraper\PHPScraper; 55 | 56 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 57 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 58 | 59 | // Test 1: Absolute URL 60 | $this->assertSame( 61 | $web->fetchAsset($web->sitemapUrl), 62 | $web->fetchAsset($web->currentBaseHost . '/custom_sitemap.xml'), 63 | ); 64 | 65 | // Test 2: Relative URL 66 | $this->assertSame( 67 | $web->fetchAsset($web->sitemapUrl), 68 | $web->fetchAsset('/custom_sitemap.xml'), 69 | ); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /tests/FeedRssTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 18 | 19 | // This page shouldn't contain any RSS feeds. 20 | $this->assertEmpty($web->rssUrls); 21 | } 22 | 23 | /** 24 | * @test 25 | */ 26 | public function testRssUrls() 27 | { 28 | $web = new \Spekulatius\PHPScraper\PHPScraper; 29 | 30 | // Navigate to the test page. 31 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 32 | 33 | // Did we get the expected result? Any URLs should be made absolute. 
34 | $this->assertSame([ 35 | 'https://test-pages.phpscraper.de/absolute.xml', 36 | 'https://test-pages.phpscraper.de/relative.xml', 37 | ], $web->rssUrls); 38 | } 39 | 40 | /** 41 | * Tests if we can use a custom URL instead of an identified one. 42 | * 43 | * @test 44 | */ 45 | public function testCustomRssUrl() 46 | { 47 | $web = new \Spekulatius\PHPScraper\PHPScraper; 48 | 49 | // Navigate to the test page. 50 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 51 | 52 | // Using a custom URL should always be allowed. 53 | // Both files are the same. 54 | // One URL isn't linked from the feeds.html and therefore is custom. 55 | $this->assertSame( 56 | $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml'), 57 | $web->rssRaw('https://test-pages.phpscraper.de/relative.xml') 58 | ); 59 | } 60 | 61 | /** 62 | * We should support both absolute and relative URLs. 63 | * 64 | * @test 65 | */ 66 | public function testDifferentRssUrlTypes() 67 | { 68 | $web = new \Spekulatius\PHPScraper\PHPScraper; 69 | 70 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 71 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 72 | 73 | // Test 1: Absolute URL 74 | $this->assertSame( 75 | $web->rssRaw($web->rssUrls[0]), 76 | $web->rssRaw($web->currentBaseHost . '/custom_rss.xml'), 77 | ); 78 | 79 | // Test 2: Relative URL 80 | $this->assertSame( 81 | $web->rssRaw($web->rssUrls[0]), 82 | $web->rssRaw('/custom_rss.xml'), 83 | ); 84 | } 85 | 86 | /** 87 | * Tests the raw parsing. 88 | * 89 | * @test 90 | */ 91 | public function testRssRawContent() 92 | { 93 | $web = new \Spekulatius\PHPScraper\PHPScraper; 94 | 95 | // Navigate to the test page. 96 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 97 | 98 | // The raw RSS is rather unwieldy to work with. Let's put it in a var before testing stuff. 
99 | $rssRaw = $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml')[0]['entry']; 100 | 101 | // Ensure the structure is a nested array 102 | $this->assertIsArray($rssRaw); 103 | $this->assertIsArray($rssRaw[4]); 104 | 105 | // Check some entries to ensure the parsing works (expected value first, actual second). 106 | $this->assertSame( 107 | 'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/', 108 | $rssRaw[4]['link']['@attributes']['href'] 109 | ); 110 | $this->assertSame( 111 | 'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/', 112 | $rssRaw[2]['link']['@attributes']['href'] 113 | ); 114 | $this->assertSame( 115 | 'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/', 116 | $rssRaw[0]['link']['@attributes']['href'] 117 | ); 118 | } 119 | 120 | /** 121 | * Tests the DTO creation. 122 | * 123 | * @test 124 | */ 125 | public function testRss() 126 | { 127 | $web = new \Spekulatius\PHPScraper\PHPScraper; 128 | 129 | // Navigate to the test page. 130 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 131 | 132 | // The raw RSS is rather unwieldy to work with (hence we actually use the DTOs). 133 | $rss = $web->rss('https://test-pages.phpscraper.de/custom_rss.xml'); 134 | 135 | // Check the count 136 | $this->assertCount(37, $rss); 137 | 138 | // Check some entries to ensure the parsing works (expected value first, actual second). 139 | // Set 1 140 | $this->assertInstanceOf(FeedEntry::class, $rss[4]); 141 | $this->assertSame( 142 | 'How I Built My First Browser Extension', 143 | $rss[4]->title 144 | ); 145 | $this->assertSame( 146 | 'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/', 147 | $rss[4]->link 148 | ); 149 | 150 | // Set 2 151 | $this->assertInstanceOf(FeedEntry::class, $rss[2]); 152 | $this->assertSame( 153 | 'How to Use Pug on Netlify?', 154 | $rss[2]->title 
155 | ); 156 | $this->assertSame( 157 | 'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/', 158 | $rss[2]->link 159 | ); 160 | 161 | // Set 3 162 | $this->assertInstanceOf(FeedEntry::class, $rss[0]); 163 | $this->assertSame( 164 | 'Startup Name Check: Experiences of the First week', 165 | $rss[0]->title 166 | ); 167 | $this->assertSame( 168 | 'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/', 169 | $rss[0]->link 170 | ); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /tests/FeedSearchIndexTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/feeds.html'); 18 | 19 | // Did we get the expected `/index.json`? 20 | $this->assertSame( 21 | 'https://test-pages.phpscraper.de/index.json', 22 | $web->searchIndexUrl 23 | ); 24 | } 25 | 26 | /** 27 | * Tests if the default search index path is applied. 28 | * 29 | * @test 30 | */ 31 | public function testDefaultSearchIndexUrl() 32 | { 33 | $web = new \Spekulatius\PHPScraper\PHPScraper; 34 | 35 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 36 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 37 | 38 | // `searchIndexUrl` should be the default. 39 | $this->assertSame( 40 | $web->searchIndexRaw(), 41 | $web->searchIndexRaw($web->searchIndexUrl), 42 | ); 43 | } 44 | 45 | /** 46 | * The `custom_index.json` and `index.json` are the same. 47 | * 48 | * So we compare the two results to ensure the custom URL feature works. 49 | * 50 | * @test 51 | */ 52 | public function testCustomSearchIndexUrl() 53 | { 54 | $web = new \Spekulatius\PHPScraper\PHPScraper; 55 | 56 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 57 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 58 | 59 | // We should always allow for custom urls. 
60 | $this->assertSame( 61 | $web->searchIndexRaw($web->searchIndexUrl), 62 | $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'), 63 | ); 64 | } 65 | 66 | /** 67 | * We should support both absolute and relative URLs. 68 | * 69 | * @test 70 | */ 71 | public function testDifferentSearchIndexUrlTypes() 72 | { 73 | $web = new \Spekulatius\PHPScraper\PHPScraper; 74 | 75 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 76 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 77 | 78 | // Test 1: Absolute URL 79 | $this->assertSame( 80 | $web->searchIndexRaw($web->searchIndexUrl), 81 | $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'), 82 | ); 83 | 84 | // Test 2: Relative URL 85 | $this->assertSame( 86 | $web->searchIndexRaw($web->searchIndexUrl), 87 | $web->searchIndexRaw('/custom_index.json'), 88 | ); 89 | } 90 | 91 | /** 92 | * Tests the raw parsing. 93 | * 94 | * @test 95 | */ 96 | public function testSearchIndexRaw() 97 | { 98 | $web = new \Spekulatius\PHPScraper\PHPScraper; 99 | 100 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 101 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 102 | 103 | // Get the raw searchIndex and store it. 104 | $searchIndexRaw = $web->searchIndexRaw; 105 | 106 | // Ensure the structure is a nested array 107 | $this->assertIsArray($searchIndexRaw); 108 | $this->assertIsArray($searchIndexRaw[42]); 109 | 110 | // Did we get the expected `/index.json`? It should contain 60 entries. 111 | $this->assertCount(60, $searchIndexRaw); 112 | 113 | // Check some data to ensure the parsing actually worked. 
114 | $this->assertSame( 115 | 'https://pastablelists.com/en/counties-of-croatia', 116 | $searchIndexRaw[4]['link'] 117 | ); 118 | $this->assertSame( 119 | 'https://pastablelists.com/en/municipalities-of-macedonia', 120 | $searchIndexRaw[2]['link'] 121 | ); 122 | $this->assertSame( 123 | 'https://pastablelists.com/en/counties-and-municipalities-of-lithuania', 124 | $searchIndexRaw[0]['link'] 125 | ); 126 | } 127 | 128 | /** 129 | * Tests the DTO creation. 130 | * 131 | * @test 132 | */ 133 | public function testSearchIndex() 134 | { 135 | $web = new \Spekulatius\PHPScraper\PHPScraper; 136 | 137 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 138 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 139 | 140 | // Get the searchIndex and store it. 141 | $searchIndex = $web->searchIndex; 142 | 143 | // Did we get the expected `/index.json`? It should contain 60 entries. 144 | $this->assertSame(60, count($searchIndex)); 145 | 146 | // Check some data to ensure the parsing actually worked: 147 | // Set 1 148 | $this->assertTrue($searchIndex[4] instanceof FeedEntry); 149 | $this->assertSame( 150 | 'List of the Counties of Croatia', 151 | $searchIndex[4]->title, 152 | ); 153 | $this->assertSame( 154 | 'List of the Counties of Croatia ready for copy and paste or export.', 155 | $searchIndex[4]->description, 156 | ); 157 | $this->assertSame( 158 | 'https://pastablelists.com/en/counties-of-croatia', 159 | $searchIndex[4]->link, 160 | ); 161 | 162 | // Set 2 163 | $this->assertTrue($searchIndex[2] instanceof FeedEntry); 164 | $this->assertSame( 165 | 'List of the Municipalities of Macedonia', 166 | $searchIndex[2]->title, 167 | ); 168 | $this->assertSame( 169 | 'List of the Municipalities of Macedonia ready for copy and paste or export.', 170 | $searchIndex[2]->description, 171 | ); 172 | $this->assertSame( 173 | 'https://pastablelists.com/en/municipalities-of-macedonia', 174 | $searchIndex[2]->link, 175 | ); 176 | 177 | // 
Set 3 178 | $this->assertTrue($searchIndex[0] instanceof FeedEntry); 179 | $this->assertSame( 180 | 'List of the Counties and Municipalities of Lithuania', 181 | $searchIndex[0]->title, 182 | ); 183 | $this->assertSame( 184 | 'List of the Counties and Municipalities of Lithuania, ready for copy and paste or export.', 185 | $searchIndex[0]->description, 186 | ); 187 | $this->assertSame( 188 | 'https://pastablelists.com/en/counties-and-municipalities-of-lithuania', 189 | $searchIndex[0]->link, 190 | ); 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /tests/FeedSitemapTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/feeds.html'); 18 | 19 | // Did we get the expected `/sitemap.xml`? 20 | $this->assertSame( 21 | 'https://test-pages.phpscraper.de/sitemap.xml', 22 | $web->sitemapUrl 23 | ); 24 | } 25 | 26 | /** 27 | * Tests if the default sitemap path is applied. 28 | * 29 | * @test 30 | */ 31 | public function testDefaultSitemapUrl() 32 | { 33 | $web = new \Spekulatius\PHPScraper\PHPScraper; 34 | 35 | // Navigate to the test page. As the URL is guessed, it's only about the base URL. 36 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 37 | 38 | // The sitemapUrl should be the default. 39 | $this->assertSame( 40 | $web->sitemapRaw(), 41 | $web->sitemapRaw($web->sitemapUrl), 42 | ); 43 | } 44 | 45 | /** 46 | * The files `sitemap.xml` and `custom_sitemap.xml` are the same and used to ensure the custom URL feature works. 47 | * 48 | * @test 49 | */ 50 | public function testCustomSitemapUrl() 51 | { 52 | $web = new \Spekulatius\PHPScraper\PHPScraper; 53 | 54 | // Navigate to the test page. As the URL is guessed, it's only about the base URL. 55 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 56 | 57 | // We should always allow for custom paths. 
58 | $this->assertSame( 59 | $web->sitemapRaw($web->sitemapUrl), 60 | $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'), 61 | ); 62 | } 63 | 64 | /** 65 | * We should support both absolute and relative URLs. 66 | * 67 | * @test 68 | */ 69 | public function testDifferentSitemapUrlTypes() 70 | { 71 | $web = new \Spekulatius\PHPScraper\PHPScraper; 72 | 73 | // Navigate to the test page. As the URL is predefined, it's only about the base URL. 74 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 75 | 76 | // Test 1: Absolute URL 77 | $this->assertSame( 78 | $web->sitemapRaw($web->sitemapUrl), 79 | $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'), 80 | ); 81 | 82 | // Test 2: Relative URL 83 | $this->assertSame( 84 | $web->sitemapRaw($web->sitemapUrl), 85 | $web->sitemapRaw('/custom_sitemap.xml'), 86 | ); 87 | } 88 | 89 | /** 90 | * Ensure we can parse the sitemap in itself (XML). 91 | * 92 | * @test 93 | */ 94 | public function testSitemapRaw() 95 | { 96 | $web = new \Spekulatius\PHPScraper\PHPScraper; 97 | 98 | // Navigate to the test page. As the URL is guessed, it's only about the base URL. 99 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 100 | 101 | // Get the sitemap and store it. 102 | $sitemapRaw = $web->sitemapRaw; 103 | 104 | // Check the count 105 | $this->assertSame(129, count($sitemapRaw['url'])); 106 | 107 | // Check some entries to ensure the parsing works as expected. 108 | $this->assertSame( 109 | 'https://phpscraper.de/apis/linkedin.html', 110 | $sitemapRaw['url'][4]['loc'], 111 | ); 112 | $this->assertSame( 113 | 'https://phpscraper.de/de/apis/zalando.html', 114 | $sitemapRaw['url'][20]['loc'], 115 | ); 116 | } 117 | 118 | /** 119 | * Tests the DTO creation. 120 | * 121 | * @test 122 | */ 123 | public function testSitemap() 124 | { 125 | $web = new \Spekulatius\PHPScraper\PHPScraper; 126 | 127 | // Navigate to the test page. As the URL is guessed, it's only about the base URL. 
128 | $web->go('https://test-pages.phpscraper.de/meta/feeds.html'); 129 | 130 | // Get the sitemap and store it. 131 | $sitemap = $web->sitemap; 132 | 133 | // Check the count 134 | $this->assertSame(129, count($sitemap)); 135 | 136 | // Check some samples. 137 | $this->assertTrue($sitemap[42] instanceof FeedEntry); 138 | $this->assertSame( 139 | 'https://phpscraper.de/apis/linkedin.html', 140 | $sitemap[4]->link, 141 | ); 142 | $this->assertSame( 143 | 'https://phpscraper.de/de/apis/zalando.html', 144 | $sitemap[20]->link 145 | ); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /tests/HeadingTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/no-meta.html'); 16 | 17 | // Check the missing headers (h1 actually exists on the page). 18 | $this->assertSame([], $web->h2); 19 | $this->assertSame([], $web->h3); 20 | $this->assertSame([], $web->h4); 21 | $this->assertSame([], $web->h5); 22 | $this->assertSame([], $web->h6); 23 | } 24 | 25 | /** 26 | * @test 27 | */ 28 | public function testWithHTMLEntity() 29 | { 30 | $web = new \Spekulatius\PHPScraper\PHPScraper; 31 | 32 | // Navigate to the test page. 
33 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html'); 34 | 35 | // Check the h1 36 | $this->assertSame( 37 | 'We are testing here & elsewhere!', 38 | $web->h1[0] 39 | ); 40 | 41 | // h2s 42 | $this->assertSame(2, count($web->h2)); 43 | $this->assertSame([ 44 | 'Cat & Mouse', 45 | 'Mouse & Cat', 46 | ], $web->h2); 47 | 48 | // Collection of headings 49 | $this->assertSame( 50 | [ 51 | ['We are testing here & elsewhere!'], 52 | ['Cat & Mouse', 'Mouse & Cat'], 53 | ['1', '2', '3'], 54 | ['Not so important heading'], 55 | [], 56 | [], 57 | ], 58 | $web->headings 59 | ); 60 | } 61 | 62 | /** 63 | * @test 64 | */ 65 | public function testLoremIpsum() 66 | { 67 | $web = new \Spekulatius\PHPScraper\PHPScraper; 68 | 69 | // Navigate to the test page. 70 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 71 | 72 | // Check the h1 73 | $this->assertSame( 74 | 'We are testing here!', 75 | $web->h1[0] 76 | ); 77 | 78 | // h2s 79 | $this->assertSame(2, count($web->h2)); 80 | $this->assertSame([ 81 | 'h2s are headings too.', 82 | 'h2s are headings too.', 83 | ], $web->h2); 84 | } 85 | 86 | /** 87 | * @test 88 | */ 89 | public function testGermanUmlaute() 90 | { 91 | $web = new \Spekulatius\PHPScraper\PHPScraper; 92 | 93 | // Navigate to the test page. 94 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 95 | 96 | // Check the h1 97 | $this->assertSame( 98 | 'We are testing here ä ü ö!', 99 | $web->h1[0] 100 | ); 101 | 102 | // h2s 103 | $this->assertSame(2, count($web->h2)); 104 | $this->assertSame([ 105 | 'Täst, ehm, test!', 106 | 'Weiter testen, Müller!', 107 | ], $web->h2); 108 | } 109 | 110 | /** 111 | * @test 112 | */ 113 | public function testChineseCharacters() 114 | { 115 | $web = new \Spekulatius\PHPScraper\PHPScraper; 116 | 117 | // Navigate to the test page. 
118 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 119 | 120 | // Check the h1 121 | $this->assertSame( 122 | 'We are testing here! 加油!', 123 | $web->h1[0] 124 | ); 125 | 126 | // h2s 127 | $this->assertSame(2, count($web->h2)); 128 | $this->assertSame(['加油!', '加油 #1!'], $web->h2); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /tests/ImageTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // No images -> an empty array is expected. 18 | $this->assertSame([], $web->images); 19 | $this->assertSame([], $web->imagesWithDetails); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function testLoremIpsum() 26 | { 27 | $web = new \Spekulatius\PHPScraper\PHPScraper; 28 | 29 | // Navigate to the test page. 30 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 31 | 32 | // Navigate to the test page. This page contains two images (cat.jpg). 33 | $this->assertSame(2, count($web->images)); 34 | 35 | // Check the simple list 36 | $this->assertSame([ 37 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 38 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 39 | ], $web->images); 40 | 41 | // Check the expected data 42 | $this->assertSame([ 43 | [ 44 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 45 | 'alt' => 'absolute path', 46 | 'width' => null, 47 | 'height' => null, 48 | ], 49 | [ 50 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 51 | 'alt' => 'relative path', 52 | 'width' => null, 53 | 'height' => null, 54 | ], 55 | ], $web->imagesWithDetails); 56 | } 57 | 58 | /** 59 | * @test 60 | */ 61 | public function testGermanUmlaute() 62 | { 63 | $web = new \Spekulatius\PHPScraper\PHPScraper; 64 | 65 | // Navigate to the test page. 
66 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 67 | 68 | // Check the h1 69 | $this->assertSame( 70 | 'We are testing here ä ü ö!', 71 | $web->h1[0] 72 | ); 73 | 74 | // Check the number of images 75 | $this->assertSame(2, count($web->images)); 76 | 77 | // Check the simple list 78 | $this->assertSame([ 79 | 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg', 80 | 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg', 81 | ], $web->images); 82 | 83 | // Check the expected data 84 | $this->assertSame([ 85 | [ 86 | 'url' => 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg', 87 | 'alt' => 'absolute path', 88 | 'width' => null, 89 | 'height' => null, 90 | ], 91 | [ 92 | 'url' => 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg', 93 | 'alt' => 'relative path', 94 | 'width' => null, 95 | 'height' => null, 96 | ], 97 | ], $web->imagesWithDetails); 98 | } 99 | 100 | /** 101 | * @test 102 | */ 103 | public function testChineseCharacters() 104 | { 105 | $web = new \Spekulatius\PHPScraper\PHPScraper; 106 | 107 | // Navigate to the test page. 
108 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 109 | 110 | // Check the number of images 111 | $this->assertSame(2, count($web->images)); 112 | 113 | // Check the simple list 114 | $this->assertSame([ 115 | 'https://test-pages.phpscraper.de/assets/貓.jpg', 116 | 'https://test-pages.phpscraper.de/assets/貓.jpg', 117 | ], $web->images); 118 | 119 | // Check the expected data 120 | $this->assertSame([ 121 | [ 122 | 'url' => 'https://test-pages.phpscraper.de/assets/貓.jpg', 123 | 'alt' => 'absolute path', 124 | 'width' => null, 125 | 'height' => null, 126 | ], 127 | [ 128 | 'url' => 'https://test-pages.phpscraper.de/assets/貓.jpg', 129 | 'alt' => 'relative path', 130 | 'width' => null, 131 | 'height' => null, 132 | ], 133 | ], $web->imagesWithDetails); 134 | } 135 | 136 | /** 137 | * @test 138 | */ 139 | public function testBaseHref() 140 | { 141 | $web = new \Spekulatius\PHPScraper\PHPScraper; 142 | 143 | // Navigate to the test page. 144 | $web->go('https://test-pages.phpscraper.de/images/base-href.html'); 145 | 146 | // Check the number of images 147 | $this->assertSame(2, count($web->images)); 148 | 149 | // Base set: 150 | $this->assertSame([ 151 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 152 | 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 153 | ], $web->images); 154 | 155 | // Detail set: 156 | $this->assertSame([ 157 | [ 158 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 159 | 'alt' => 'absolute path with base href', 160 | 'width' => null, 161 | 'height' => null, 162 | ], 163 | [ 164 | 'url' => 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 165 | 'alt' => 'relative path with base href', 166 | 'width' => null, 167 | 'height' => null, 168 | ], 169 | ], $web->imagesWithDetails); 170 | } 171 | 172 | /** 173 | * @test 174 | */ 175 | public function testWidth() 176 | { 177 | $web = new \Spekulatius\PHPScraper\PHPScraper; 178 | 179 | // Navigate to the test page. 
180 | $web->go('https://test-pages.phpscraper.de/images/width.html'); 181 | 182 | // Check the number of images 183 | $this->assertSame(3, count($web->images)); 184 | 185 | // Check the expected data 186 | $this->assertSame([ 187 | [ 188 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 189 | 'alt' => 'no width', 190 | 'width' => null, 191 | 'height' => null, 192 | ], 193 | [ 194 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 195 | 'alt' => 'width at 1200px', 196 | 'width' => '1200px', 197 | 'height' => null, 198 | ], 199 | [ 200 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 201 | 'alt' => 'width at 100rem', 202 | 'width' => '100rem', 203 | 'height' => null, 204 | ], 205 | ], $web->imagesWithDetails); 206 | } 207 | 208 | /** 209 | * @test 210 | */ 211 | public function testHeight() 212 | { 213 | $web = new \Spekulatius\PHPScraper\PHPScraper; 214 | 215 | // Navigate to the test page. 216 | $web->go('https://test-pages.phpscraper.de/images/height.html'); 217 | 218 | // Check the number of images 219 | $this->assertSame(3, count($web->images)); 220 | 221 | // Check the expected data 222 | $this->assertSame([ 223 | [ 224 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 225 | 'alt' => 'no height', 226 | 'width' => null, 227 | 'height' => null, 228 | ], 229 | [ 230 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 231 | 'alt' => 'height at 1200px', 232 | 'width' => null, 233 | 'height' => '1200px', 234 | ], 235 | [ 236 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 237 | 'alt' => 'height at 100rem', 238 | 'width' => null, 239 | 'height' => '100rem', 240 | ], 241 | ], $web->imagesWithDetails); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /tests/KeywordTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/content/keywords.html'); 17 | 18 | // Check the 
keywords on this case... 19 | $keywords = $web->contentKeywords; 20 | 21 | // A selected list of keywords to expect 22 | $shouldKeywords = [ 23 | '1960s', 24 | 'added', 25 | 'adopted lorem ipsum', 26 | 'advertisements', 27 | 'aldus employed', 28 | 'corrupted version', 29 | 'graphic', 30 | 'improper latin', 31 | 'introduced', 32 | 'keyword extraction tests', 33 | 'test', 34 | 'microsoft word', 35 | 'english wikipedia', 36 | 'lorem ipsum', 37 | 'lorem ipsum text', 38 | ]; 39 | 40 | // Check if all are part of the output (strict comparison: no loose type juggling) 41 | foreach ($shouldKeywords as $keyword) { 42 | $this->assertTrue( 43 | in_array($keyword, $keywords, true), 44 | sprintf('"%s" is missing', $keyword) 45 | ); 46 | } 47 | } 48 | 49 | /** 50 | * @test 51 | */ 52 | public function testKeywordExtractionWithScores() 53 | { 54 | $web = new \Spekulatius\PHPScraper\PHPScraper; 55 | 56 | // Navigate to the test page. 57 | // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum" 58 | $web->go('https://test-pages.phpscraper.de/content/keywords.html'); 59 | 60 | // Check the keywords on this case... 
61 | $keywords = $web->contentKeywordsWithScores; 62 | 63 | // A selected list of keywords to expect 64 | $shouldKeywords = [ 65 | 'added' => 1.0, 66 | 'adopted lorem ipsum' => 11.0, 67 | 'advertisements' => 1.0, 68 | 'aldus employed' => 4.0, 69 | 'corrupted version' => 4.0, 70 | 'graphic' => 1.0, 71 | 'improper latin' => 4.0, 72 | 'introduced' => 1.0, 73 | 'keyword extraction tests' => 9.0, 74 | 'test' => 1.0, 75 | 'microsoft word' => 5.3333333333333, 76 | 'english wikipedia' => 4.0, 77 | 'lorem ipsum' => 8.0, 78 | 'lorem ipsum text' => 11.0, 79 | ]; 80 | 81 | // Check if all are part of the output with the expected score 82 | foreach ($shouldKeywords as $keyword => $score) { 83 | // Has the same score 84 | $this->assertSame( 85 | round($keywords[$keyword], 8), 86 | round($score, 8), 87 | sprintf('Score for "%s" is incorrect', $keyword) 88 | ); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /tests/LinkTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/links/no-links.html'); 16 | 17 | // No links -> an empty array is expected. 18 | $this->assertSame([], $web->links); 19 | $this->assertSame([], $web->linksWithDetails); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function testTarget() 26 | { 27 | $web = new \Spekulatius\PHPScraper\PHPScraper; 28 | 29 | // Navigate to the test page. 
30 | $web->go('https://test-pages.phpscraper.de/links/target.html'); 31 | 32 | // Check the number of links 33 | $this->assertSame(6, count($web->links)); 34 | 35 | // Check the simple links list 36 | $this->assertSame([ 37 | 'https://placekitten.com/408/287', 38 | 'https://placekitten.com/444/333', 39 | 'https://placekitten.com/444/321', 40 | 'https://placekitten.com/408/287', 41 | 'https://placekitten.com/444/333', 42 | 'https://placekitten.com/444/321', 43 | ], $web->links); 44 | 45 | // Check the complex links list 46 | $this->assertSame([ 47 | [ 48 | 'url' => 'https://placekitten.com/408/287', 49 | 'protocol' => 'https', 50 | 'text' => 'external kitten', 51 | 'title' => null, 52 | 'target' => '_blank', 53 | 'rel' => null, 54 | 'image' => [], 55 | 'isNofollow' => false, 56 | 'isUGC' => false, 57 | 'isSponsored' => false, 58 | 'isMe' => false, 59 | 'isNoopener' => false, 60 | 'isNoreferrer' => false, 61 | ], [ 62 | 'url' => 'https://placekitten.com/444/333', 63 | 'protocol' => 'https', 64 | 'text' => 'external kitten', 65 | 'title' => null, 66 | 'target' => '_blank', 67 | 'rel' => null, 68 | 'image' => [], 69 | 'isNofollow' => false, 70 | 'isUGC' => false, 71 | 'isSponsored' => false, 72 | 'isMe' => false, 73 | 'isNoopener' => false, 74 | 'isNoreferrer' => false, 75 | ], [ 76 | 'url' => 'https://placekitten.com/444/321', 77 | 'protocol' => 'https', 78 | 'text' => 'external kitten', 79 | 'title' => null, 80 | 'target' => '_blank', 81 | 'rel' => null, 82 | 'image' => [], 83 | 'isNofollow' => false, 84 | 'isUGC' => false, 85 | 'isSponsored' => false, 86 | 'isMe' => false, 87 | 'isNoopener' => false, 88 | 'isNoreferrer' => false, 89 | ], [ 90 | 'url' => 'https://placekitten.com/408/287', 91 | 'protocol' => 'https', 92 | 'text' => 'external kitten', 93 | 'title' => null, 94 | 'target' => 'kitten', 95 | 'rel' => null, 96 | 'image' => [], 97 | 'isNofollow' => false, 98 | 'isUGC' => false, 99 | 'isSponsored' => false, 100 | 'isMe' => false, 101 | 'isNoopener' => false, 
102 | 'isNoreferrer' => false, 103 | ], [ 104 | 'url' => 'https://placekitten.com/444/333', 105 | 'protocol' => 'https', 106 | 'text' => 'external kitten', 107 | 'title' => null, 108 | 'target' => 'kitten', 109 | 'rel' => null, 110 | 'image' => [], 111 | 'isNofollow' => false, 112 | 'isUGC' => false, 113 | 'isSponsored' => false, 114 | 'isMe' => false, 115 | 'isNoopener' => false, 116 | 'isNoreferrer' => false, 117 | ], [ 118 | 'url' => 'https://placekitten.com/444/321', 119 | 'protocol' => 'https', 120 | 'text' => 'external kitten', 121 | 'title' => null, 122 | 'target' => 'kitten', 123 | 'rel' => null, 124 | 'image' => [], 125 | 'isNofollow' => false, 126 | 'isUGC' => false, 127 | 'isSponsored' => false, 128 | 'isMe' => false, 129 | 'isNoopener' => false, 130 | 'isNoreferrer' => false, 131 | ], 132 | ], $web->linksWithDetails); 133 | } 134 | 135 | /** 136 | * @test 137 | */ 138 | public function testRel() 139 | { 140 | $web = new \Spekulatius\PHPScraper\PHPScraper; 141 | 142 | // Navigate to the test page. 143 | // This page contains several links with different rel attributes. 
144 | $web->go('https://test-pages.phpscraper.de/links/rel.html'); 145 | 146 | // Check the number of links 147 | $this->assertSame(5, count($web->links)); 148 | 149 | // Check the simple links list 150 | $this->assertSame([ 151 | 'https://placekitten.com/432/287', 152 | 'https://placekitten.com/456/287', 153 | 'https://placekitten.com/345/287', 154 | 'https://placekitten.com/345/287', 155 | 'https://placekitten.com/345/222', 156 | ], $web->links); 157 | 158 | // Check the complex links list 159 | $this->assertSame([ 160 | [ 161 | 'url' => 'https://placekitten.com/432/287', 162 | 'protocol' => 'https', 163 | 'text' => 'external kitten', 164 | 'title' => null, 165 | 'target' => null, 166 | 'rel' => 'nofollow', 167 | 'image' => [], 168 | 'isNofollow' => true, 169 | 'isUGC' => false, 170 | 'isSponsored' => false, 171 | 'isMe' => false, 172 | 'isNoopener' => false, 173 | 'isNoreferrer' => false, 174 | ], [ 175 | 'url' => 'https://placekitten.com/456/287', 176 | 'protocol' => 'https', 177 | 'text' => 'external kitten', 178 | 'title' => null, 179 | 'target' => null, 180 | 'rel' => 'ugc', 181 | 'image' => [], 182 | 'isNofollow' => false, 183 | 'isUGC' => true, 184 | 'isSponsored' => false, 185 | 'isMe' => false, 186 | 'isNoopener' => false, 187 | 'isNoreferrer' => false, 188 | ], [ 189 | 'url' => 'https://placekitten.com/345/287', 190 | 'protocol' => 'https', 191 | 'text' => 'external kitten', 192 | 'title' => null, 193 | 'target' => null, 194 | 'rel' => 'nofollow ugc', 195 | 'image' => [], 196 | 'isNofollow' => true, 197 | 'isUGC' => true, 198 | 'isSponsored' => false, 199 | 'isMe' => false, 200 | 'isNoopener' => false, 201 | 'isNoreferrer' => false, 202 | ], [ 203 | 'url' => 'https://placekitten.com/345/287', 204 | 'protocol' => 'https', 205 | 'text' => 'external kitten', 206 | 'title' => null, 207 | 'target' => null, 208 | 'rel' => 'noopener', 209 | 'image' => [], 210 | 'isNofollow' => false, 211 | 'isUGC' => false, 212 | 'isSponsored' => false, 213 | 'isMe' => false, 
214 | 'isNoopener' => true, 215 | 'isNoreferrer' => false, 216 | ], [ 217 | 'url' => 'https://placekitten.com/345/222', 218 | 'protocol' => 'https', 219 | 'text' => 'external kitten', 220 | 'title' => null, 221 | 'target' => null, 222 | 'rel' => 'noreferrer', 223 | 'image' => [], 224 | 'isNofollow' => false, 225 | 'isUGC' => false, 226 | 'isSponsored' => false, 227 | 'isMe' => false, 228 | 'isNoopener' => false, 229 | 'isNoreferrer' => true, 230 | ], 231 | ], $web->linksWithDetails); 232 | } 233 | 234 | /** 235 | * @test 236 | */ 237 | public function testBaseHref() 238 | { 239 | $web = new \Spekulatius\PHPScraper\PHPScraper; 240 | 241 | // Navigate to the test page. 242 | $web->go('https://test-pages.phpscraper.de/links/base-href.html'); 243 | 244 | // Check the number of links 245 | $this->assertSame(3, count($web->links)); 246 | 247 | // Check the simple links list 248 | $this->assertSame([ 249 | 'https://placekitten.com/408/287', 250 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 251 | 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 252 | ], $web->links); 253 | 254 | // Check the complex links list 255 | $this->assertSame([ 256 | [ 257 | 'url' => 'https://placekitten.com/408/287', 258 | 'protocol' => 'https', 259 | 'text' => 'external kitten', 260 | 'title' => 'external path with base href', 261 | 'target' => null, 262 | 'rel' => null, 263 | 'image' => [], 264 | 'isNofollow' => false, 265 | 'isUGC' => false, 266 | 'isSponsored' => false, 267 | 'isMe' => false, 268 | 'isNoopener' => false, 269 | 'isNoreferrer' => false, 270 | ], [ 271 | 'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 272 | 'protocol' => 'https', 273 | 'text' => 'absolute path to cat', 274 | 'title' => 'absolute internal path with base href', 275 | 'target' => null, 276 | 'rel' => null, 277 | 'image' => [], 278 | 'isNofollow' => false, 279 | 'isUGC' => false, 280 | 'isSponsored' => false, 281 | 'isMe' => false, 282 | 'isNoopener' => false, 283 | 'isNoreferrer' 
=> false, 284 | ], [ 285 | 'url' => 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 286 | 'protocol' => 'https', 287 | 'text' => 'relative cat', 288 | 'title' => 'relative path with base href', 289 | 'target' => null, 290 | 'rel' => null, 291 | 'image' => [], 292 | 'isNofollow' => false, 293 | 'isUGC' => false, 294 | 'isSponsored' => false, 295 | 'isMe' => false, 296 | 'isNoopener' => false, 297 | 'isNoreferrer' => false, 298 | ], 299 | ], $web->linksWithDetails); 300 | } 301 | 302 | /** 303 | * @test 304 | */ 305 | public function testImageUrl() 306 | { 307 | $web = new \Spekulatius\PHPScraper\PHPScraper; 308 | 309 | // Navigate to the test page. 310 | $web->go('https://test-pages.phpscraper.de/links/image-url.html'); 311 | 312 | // Check the number of links 313 | $this->assertSame(3, count($web->links)); 314 | 315 | // Check the complex links list 316 | $this->assertSame([ 317 | [ 318 | 'url' => 'https://placekitten.com/432/500', 319 | 'protocol' => 'https', 320 | 'text' => '', 321 | 'title' => null, 322 | 'target' => null, 323 | 'rel' => 'nofollow', 324 | 'image' => [ 325 | 'https://placekitten.com/432/287', 326 | ], 327 | 'isNofollow' => true, 328 | 'isUGC' => false, 329 | 'isSponsored' => false, 330 | 'isMe' => false, 331 | 'isNoopener' => false, 332 | 'isNoreferrer' => false, 333 | ], [ 334 | 'url' => 'https://placekitten.com/456/500', 335 | 'protocol' => 'https', 336 | 'text' => '', 337 | 'title' => null, 338 | 'target' => null, 339 | 'rel' => 'ugc', 340 | 'image' => [ 341 | 'https://placekitten.com/456/400', 342 | 'https://placekitten.com/456/300', 343 | ], 344 | 'isNofollow' => false, 345 | 'isUGC' => true, 346 | 'isSponsored' => false, 347 | 'isMe' => false, 348 | 'isNoopener' => false, 349 | 'isNoreferrer' => false, 350 | ], [ 351 | 'url' => 'https://placekitten.com/345/500', 352 | 'protocol' => 'https', 353 | 'text' => 'This is image', 354 | 'title' => null, 355 | 'target' => null, 356 | 'rel' => 'nofollow ugc', 357 | 'image' => [ 358 
| 'https://placekitten.com/345/287', 359 | ], 360 | 'isNofollow' => true, 361 | 'isUGC' => true, 362 | 'isSponsored' => false, 363 | 'isMe' => false, 364 | 'isNoopener' => false, 365 | 'isNoreferrer' => false, 366 | ], 367 | ], $web->linksWithDetails); 368 | } 369 | 370 | /** 371 | * @test 372 | */ 373 | public function testInternalLinks() 374 | { 375 | $web = new \Spekulatius\PHPScraper\PHPScraper; 376 | 377 | // Navigate to the test page. 378 | $web->go('https://test-pages.phpscraper.de/links/base-href.html'); 379 | 380 | // Check the internal links list 381 | $this->assertSame( 382 | ['https://test-pages.phpscraper.de/assets/cat.jpg'], 383 | $web->internalLinks 384 | ); 385 | } 386 | 387 | /** 388 | * @test 389 | */ 390 | public function testExternalLinks() 391 | { 392 | $web = new \Spekulatius\PHPScraper\PHPScraper; 393 | 394 | // Navigate to the test page. 395 | $web->go('https://test-pages.phpscraper.de/links/base-href.html'); 396 | 397 | // Check the external links list 398 | $this->assertSame( 399 | [ 400 | 'https://placekitten.com/408/287', 401 | 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 402 | ], 403 | $web->externalLinks 404 | ); 405 | } 406 | } 407 | -------------------------------------------------------------------------------- /tests/ListsTest.php: -------------------------------------------------------------------------------- 1 | Example 1: Unordered List 18 | *
    19 | *
  • Unordered list item 1
  • 20 | *
  • Unordered list item 2
  • 21 | *
  • Unordered list item with HTML
  • 22 | *
23 | * 24 | *

Example 2: Ordered List

25 | *
    26 | *
  1. Ordered list item 1
  2. 27 | *
  3. Ordered list item 2
  4. 28 | *
  5. Ordered list item with HTML
  6. 29 | *
30 | */ 31 | $web->go('https://test-pages.phpscraper.de/content/lists.html'); 32 | 33 | // Check all lists are recognized 34 | $this->assertSame(count($web->lists), 2); 35 | $this->assertSame(count($web->unorderedLists), 1); 36 | $this->assertSame(count($web->orderedLists), 1); 37 | 38 | // Check the contents 39 | $this->assertSame([ 40 | 'Ordered list item 1', 41 | 'Ordered list item 2', 42 | 'Ordered list item with HTML', 43 | ], $web->orderedLists[0]['children_plain']); 44 | 45 | $this->assertSame([ 46 | 'Unordered list item 1', 47 | 'Unordered list item 2', 48 | 'Unordered list item with HTML', 49 | ], $web->unorderedLists[0]['children_plain']); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/MetaAuthorTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/meta/missing.html'); 16 | 17 | // Check the author as not given (null) 18 | $this->assertNull($web->author); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithHTMLEntity() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html'); 30 | 31 | // Check the author 32 | $this->assertSame( 33 | 'Cat & Mouse', 34 | $web->author 35 | ); 36 | } 37 | 38 | /** 39 | * @test 40 | */ 41 | public function testLoremIpsum() 42 | { 43 | $web = new \Spekulatius\PHPScraper\PHPScraper; 44 | 45 | // Navigate to the test page. 46 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 47 | 48 | // Check the author 49 | $this->assertSame( 50 | 'Lorem ipsum', 51 | $web->author 52 | ); 53 | } 54 | 55 | /** 56 | * @test 57 | */ 58 | public function testGermanUmlaute() 59 | { 60 | $web = new \Spekulatius\PHPScraper\PHPScraper; 61 | 62 | // Navigate to the test page. 
63 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 64 | 65 | // Check the author 66 | $this->assertSame( 67 | 'Müller', 68 | $web->author 69 | ); 70 | } 71 | 72 | /** 73 | * @test 74 | */ 75 | public function testChineseCharacters() 76 | { 77 | $web = new \Spekulatius\PHPScraper\PHPScraper; 78 | 79 | // Navigate to the test page. 80 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 81 | 82 | // Check the author 83 | $this->assertSame( 84 | '貓', 85 | $web->author 86 | ); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /tests/MetaCharsetTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Check the charset as not given (null) 18 | $this->assertNull($web->charset); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithCharset() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 30 | 31 | // Check the charset 32 | $this->assertSame( 33 | 'utf-8', 34 | $web->charset 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/MetaContentTypeTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Check the contentType as not given (null) 18 | $this->assertNull($web->contentType); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithContentType() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 
29 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 30 | 31 | // Check the contentType 32 | $this->assertSame( 33 | 'text/html; charset=utf-8', 34 | $web->contentType 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/MetaCsrfTokenTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Check the csrfToken as not given (null) 18 | $this->assertNull($web->csrfToken); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithCsrfToken() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | // Contains: 30 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 31 | 32 | // Check the csrfToken 33 | $this->assertSame( 34 | 'token', 35 | $web->csrfToken 36 | ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/MetaDescriptionTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Check the description as not given (null) 18 | $this->assertNull($web->description); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithHTMLEntity() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html'); 30 | 31 | // Check the description 32 | $this->assertSame( 33 | 'Cat & Mouse', 34 | $web->description 35 | ); 36 | } 37 | 38 | /** 39 | * @test 40 | */ 41 | public function testLoremIpsum() 42 | { 43 | $web = new \Spekulatius\PHPScraper\PHPScraper; 44 | 45 | // Navigate to the test page. 
46 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 47 | 48 | // Check the description 49 | $this->assertSame( 50 | 'Lorem ipsum dolor etc.', 51 | $web->description 52 | ); 53 | } 54 | 55 | /** 56 | * @test 57 | */ 58 | public function testGermanUmlaute() 59 | { 60 | $web = new \Spekulatius\PHPScraper\PHPScraper; 61 | 62 | // Navigate to the test page. 63 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 64 | 65 | // Check the description 66 | $this->assertSame( 67 | 'Eine deutsche Beschreibung mit Umlauten: ä ü ö', 68 | $web->description 69 | ); 70 | } 71 | 72 | /** 73 | * @test 74 | */ 75 | public function testChineseCharacters() 76 | { 77 | $web = new \Spekulatius\PHPScraper\PHPScraper; 78 | 79 | // Navigate to the test page. 80 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 81 | 82 | // Check the description 83 | $this->assertSame( 84 | 'A description with Chinese Characters: 加油', 85 | $web->description 86 | ); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /tests/MetaImageTest.php: -------------------------------------------------------------------------------- 1 | go('https://peterthaleikis.com'); 16 | 17 | // Both the method call as well as property call should return the same... 18 | $this->assertSame($web->image(), $web->image); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testMissingImage() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | $web->go('https://test-pages.phpscraper.de/meta/missing.html'); 30 | 31 | // Check the absolute image path 32 | $this->assertNull($web->image); 33 | } 34 | 35 | /** 36 | * @test 37 | */ 38 | public function testAbsolutePath() 39 | { 40 | $web = new \Spekulatius\PHPScraper\PHPScraper; 41 | 42 | // Navigate to the test page. 
43 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path.html'); 44 | 45 | // Check the absolute image path 46 | $this->assertSame('https://test-pages.phpscraper.de/assets/cat.jpg', $web->image); 47 | } 48 | 49 | /** 50 | * @test 51 | */ 52 | public function testRelativePath() 53 | { 54 | $web = new \Spekulatius\PHPScraper\PHPScraper; 55 | 56 | // Navigate to the test page. 57 | $web->go('https://test-pages.phpscraper.de/meta/image/relative-path.html'); 58 | 59 | // Check the relative image path should be converted into an absolute path. 60 | $this->assertSame( 61 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 62 | $web->image 63 | ); 64 | } 65 | 66 | /** 67 | * @test 68 | */ 69 | public function testAbsolutePathWithBaseHref() 70 | { 71 | $web = new \Spekulatius\PHPScraper\PHPScraper; 72 | 73 | // Navigate to the test page. 74 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html'); 75 | 76 | // Check the absolute image path 77 | $this->assertSame( 78 | 'https://test-pages.phpscraper.de/assets/cat.jpg', 79 | $web->image 80 | ); 81 | } 82 | 83 | /** 84 | * @test 85 | */ 86 | public function testRelativePathBaseHref() 87 | { 88 | $web = new \Spekulatius\PHPScraper\PHPScraper; 89 | 90 | // Navigate to the test page. 91 | $web->go('https://test-pages.phpscraper.de/meta/image/relative-path-with-base-href.html'); 92 | 93 | // Check the relative image path 94 | $this->assertSame( 95 | 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg', 96 | $web->image 97 | ); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /tests/MetaKeywordsTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // null if there aren't any keywords set. 18 | $this->assertNull($web->keywordString); 19 | 20 | // Empty array if there aren't any keywords set. 
21 | $this->assertTrue(is_iterable($web->keywords)); 22 | $this->assertTrue(empty($web->keywords)); 23 | } 24 | 25 | /** 26 | * @test 27 | */ 28 | public function testNoSpaces() 29 | { 30 | $web = new \Spekulatius\PHPScraper\PHPScraper; 31 | 32 | // Navigate to the test page. 33 | $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-no-spaces.html'); 34 | 35 | // Check the keywords on this case... 36 | $this->assertSame('one,two,three', $web->keywordString); 37 | $this->assertSame(['one', 'two', 'three'], $web->keywords); 38 | } 39 | 40 | /** 41 | * @test 42 | */ 43 | public function testSpaces() 44 | { 45 | $web = new \Spekulatius\PHPScraper\PHPScraper; 46 | 47 | // Navigate to the test page. 48 | $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-spaces.html'); 49 | 50 | // Check the keywords on this case... 51 | $this->assertSame('one, two, three', $web->keywordString); 52 | $this->assertSame(['one', 'two', 'three'], $web->keywords); 53 | } 54 | 55 | /** 56 | * @test 57 | */ 58 | public function testIrregularSpaces() 59 | { 60 | $web = new \Spekulatius\PHPScraper\PHPScraper; 61 | 62 | // Navigate to the test page. 63 | $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-irregular-spaces.html'); 64 | 65 | // Check the keywords on this case... 66 | $this->assertSame('one, two, three', $web->keywordString); 67 | $this->assertSame(['one', 'two', 'three'], $web->keywords); 68 | } 69 | 70 | /** 71 | * @test 72 | */ 73 | public function testWithHTMLEntity() 74 | { 75 | $web = new \Spekulatius\PHPScraper\PHPScraper; 76 | 77 | // Navigate to the test page. 78 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html'); 79 | 80 | // Check the keywords 81 | $this->assertSame(['Cat & Mouse', 'Mouse & Cat'], $web->keywords); 82 | } 83 | 84 | /** 85 | * @test 86 | */ 87 | public function testLoremIpsum() 88 | { 89 | $web = new \Spekulatius\PHPScraper\PHPScraper; 90 | 91 | // Navigate to the test page. 
92 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 93 | 94 | // Check the keywords 95 | $this->assertSame(['Lorem', 'ipsum', 'dolor'], $web->keywords); 96 | } 97 | 98 | /** 99 | * @test 100 | */ 101 | public function testGermanUmlaute() 102 | { 103 | $web = new \Spekulatius\PHPScraper\PHPScraper; 104 | 105 | // Navigate to the test page. 106 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 107 | 108 | // Check the keywords 109 | $this->assertSame(['keywords', 'schlüsselwörter'], $web->keywords); 110 | } 111 | 112 | /** 113 | * @test 114 | */ 115 | public function testChineseCharacters() 116 | { 117 | $web = new \Spekulatius\PHPScraper\PHPScraper; 118 | 119 | // Navigate to the test page. 120 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 121 | 122 | // Check the keywords 123 | $this->assertSame(['加油', '貓'], $web->keywords); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /tests/MetaViewportTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // null if there isn't a viewport set. 18 | $this->assertNull($web->viewportString); 19 | 20 | // Empty array if there aren't any viewports set. 21 | $this->assertTrue(is_iterable($web->viewport)); 22 | $this->assertTrue(empty($web->viewport)); 23 | } 24 | 25 | /** 26 | * @test 27 | */ 28 | public function testWithViewport() 29 | { 30 | $web = new \Spekulatius\PHPScraper\PHPScraper; 31 | 32 | // Navigate to the test page. 
33 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 34 | 35 | // Check the viewport 36 | $this->assertSame( 37 | 'width=device-width, initial-scale=1, shrink-to-fit=no, maximum-scale=1, user-scalable=no', 38 | $web->viewportString 39 | ); 40 | $this->assertSame( 41 | ['width=device-width', 'initial-scale=1', 'shrink-to-fit=no', 'maximum-scale=1', 'user-scalable=no'], 42 | $web->viewport 43 | ); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tests/NavigationTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/navigation/1.html'); 16 | 17 | // Check the title to see if we are actually at the right page... 18 | $this->assertSame('Page #1', $web->h1[0]); 19 | 20 | // Navigate to test page #2 using the absolute link. 21 | $web->clickLink('2 absolute'); 22 | 23 | // Check the title and URL to see if we actually moved... 24 | $this->assertSame('Page #2', $web->h1[0]); 25 | $this->assertSame($web->currentUrl, 'https://test-pages.phpscraper.de/navigation/2.html'); 26 | } 27 | 28 | /** 29 | * @test 30 | */ 31 | public function testSurfWithRelativeLink() 32 | { 33 | $web = new \Spekulatius\PHPScraper\PHPScraper; 34 | 35 | // Navigate to test page #1. 36 | $web->go('https://test-pages.phpscraper.de/navigation/1.html'); 37 | 38 | // Check the title to see if we are actually at the right page... 39 | $this->assertSame('Page #1', $web->h1[0]); 40 | 41 | // Navigate to test page #2 using the relative link. 42 | $web->clickLink('2 relative'); 43 | 44 | // Check the title and URL to see if we actually moved... 45 | $this->assertSame('Page #2', $web->h1[0]); 46 | $this->assertSame($web->currentUrl, 'https://test-pages.phpscraper.de/navigation/2.html'); 47 | } 48 | 49 | /** 50 | * Test navigation using an anchor text. 
51 | * 52 | * @test 53 | */ 54 | public function testLeavePageByText() 55 | { 56 | $web = new \Spekulatius\PHPScraper\PHPScraper; 57 | 58 | // Navigate to test page #2. 59 | $web->go('https://test-pages.phpscraper.de/navigation/2.html'); 60 | 61 | // Check the title to see if we are actually at the right page... 62 | $this->assertSame('Page #2', $web->h1[0]); 63 | 64 | // Click the link with the text: 65 | $web->clickLink('external link'); 66 | 67 | // Check the URL 68 | $this->assertSame('https://peterthaleikis.com/', $web->currentUrl); 69 | } 70 | 71 | /** 72 | * Test if we can navigate out using a redirect. 73 | * 74 | * @test 75 | */ 76 | public function testLeavePageWithRedirect() 77 | { 78 | $web = new \Spekulatius\PHPScraper\PHPScraper; 79 | 80 | // Navigate to test page #2. 81 | $web->go('https://test-pages.phpscraper.de/navigation/2.html'); 82 | 83 | // Check the title to see if we are actually at the right page... 84 | $this->assertSame('Page #2', $web->h1[0]); 85 | 86 | // Click the link with the text: 87 | $web->clickLink('external link with redirect'); 88 | 89 | // Check the URL 90 | $this->assertSame('https://peterthaleikis.com/', $web->currentUrl); 91 | } 92 | 93 | /** 94 | * Test if we can navigate out. 95 | * 96 | * @test 97 | */ 98 | public function testLeavePageByURL() 99 | { 100 | $web = new \Spekulatius\PHPScraper\PHPScraper; 101 | 102 | // Navigate to test page #2. 103 | $web->go('https://test-pages.phpscraper.de/navigation/2.html'); 104 | 105 | // Check the title to see if we are actually at the right page... 106 | $this->assertSame('Page #2', $web->h1[0]); 107 | 108 | // Click the link by its URL: 109 | $web->clickLink('https://peterthaleikis.com/'); 110 | 111 | // Check the URL 112 | $this->assertSame('https://peterthaleikis.com/', $web->currentUrl); 113 | } 114 | 115 | /** 116 | * Test chainability of `clickLink`. 
117 | * 118 | * @test 119 | */ 120 | public function testClickLinkChainability() 121 | { 122 | $web = new \Spekulatius\PHPScraper\PHPScraper; 123 | 124 | // Navigate to a page, click a link by URL and see if we are on the expected `currentUrl`. 125 | $web 126 | ->go('https://test-pages.phpscraper.de/navigation/2.html') 127 | ->clickLink('https://peterthaleikis.com/'); 128 | 129 | // Check the URL 130 | $this->assertSame('https://peterthaleikis.com/', $web->currentUrl); 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /tests/NotFoundTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/page-does-not-exist.html'); 18 | 19 | // The built-in server returns this string. 20 | $this->assertSame('Page Not Found', $web->title); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/OpenGraphTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Empty array, because there aren't any open graph props set. 18 | $this->assertTrue(is_iterable($web->openGraph)); 19 | $this->assertTrue(empty($web->openGraph)); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function testOpenGraph() 26 | { 27 | $web = new \Spekulatius\PHPScraper\PHPScraper; 28 | 29 | // Navigate to the test page. 30 | $web->go('https://test-pages.phpscraper.de/og/example.html'); 31 | 32 | // Check elements 33 | $this->assertSame('Lorem Ipsum', $web->openGraph['og:title']); 34 | $this->assertSame('Lorem ipsum dolor etc.', $web->openGraph['og:description']); 35 | 36 | // The whole set. 
37 | $this->assertSame( 38 | [ 39 | 'og:site_name' => 'Lorem ipsum', 40 | 'og:type' => 'website', 41 | 'og:title' => 'Lorem Ipsum', 42 | 'og:description' => 'Lorem ipsum dolor etc.', 43 | 'og:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html', 44 | 'og:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 45 | ], 46 | $web->openGraph 47 | ); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tests/OutlineTest.php: -------------------------------------------------------------------------------- 1 | We are testing here! 18 | *

This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.

19 | * 20 | *

Examples

21 | *

There are numerous examples on the website. Please check them out to get more context on how scraping works.

22 | * 23 | *

Example 1

24 | *

Here would be an example.

25 | * 26 | *

Example 2

27 | *

Here would be the second example.

28 | * 29 | *

Example 3

30 | *

Here would be another example.

31 | */ 32 | $web->go('https://test-pages.phpscraper.de/content/outline.html'); 33 | 34 | // Get the content outline 35 | $this->assertSame( 36 | [ 37 | [ 38 | 'tag' => 'h1', 39 | 'content' => 'We are testing here!', 40 | ], [ 41 | 'tag' => 'h2', 42 | 'content' => 'Examples', 43 | ], [ 44 | 'tag' => 'h3', 45 | 'content' => 'Example 1', 46 | ], [ 47 | 'tag' => 'h3', 48 | 'content' => 'Example 2', 49 | ], [ 50 | 'tag' => 'h3', 51 | 'content' => 'Example 3', 52 | ], 53 | ], 54 | $web->outline 55 | ); 56 | } 57 | 58 | /** 59 | * @test 60 | */ 61 | public function outlineWithParagraphsTest() 62 | { 63 | $web = new \Spekulatius\PHPScraper\PHPScraper; 64 | 65 | /** 66 | * Navigate to the test page. This page contains: 67 | * 68 | *

We are testing here!

69 | *

This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.

70 | * 71 | *

Examples

72 | *

There are numerous examples on the website. Please check them out to get more context on how scraping works.

73 | * 74 | *

Example 1

75 | *

Here would be an example.

76 | * 77 | *

Example 2

78 | *

Here would be the second example.

79 | * 80 | *

Example 3

81 | *

Here would be another example.

82 | * 83 | * 84 | *

85 | */ 86 | $web->go('https://test-pages.phpscraper.de/content/outline.html'); 87 | 88 | // Get the content outline 89 | $this->assertSame( 90 | [ 91 | [ 92 | 'tag' => 'h1', 93 | 'content' => 'We are testing here!', 94 | ], [ 95 | 'tag' => 'p', 96 | 'content' => 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.', 97 | ], [ 98 | 'tag' => 'h2', 99 | 'content' => 'Examples', 100 | ], [ 101 | 'tag' => 'p', 102 | 'content' => 'There are numerous examples on the website. Please check them out to get more context on how scraping works.', 103 | ], [ 104 | 'tag' => 'h3', 105 | 'content' => 'Example 1', 106 | ], [ 107 | 'tag' => 'p', 108 | 'content' => 'Here would be an example.', 109 | ], [ 110 | 'tag' => 'h3', 111 | 'content' => 'Example 2', 112 | ], [ 113 | 'tag' => 'p', 114 | 'content' => 'Here would be the second example.', 115 | ], [ 116 | 'tag' => 'h3', 117 | 'content' => 'Example 3', 118 | ], [ 119 | 'tag' => 'p', 120 | 'content' => 'Here would be another example.', 121 | ], [ 122 | 'tag' => 'p', 123 | 'content' => '', 124 | ], 125 | ], 126 | $web->outlineWithParagraphs 127 | ); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /tests/ParagraphsTest.php: -------------------------------------------------------------------------------- 1 | We are testing here! 18 | *

This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.

19 | * 20 | *

Examples

21 | *

There are numerous examples on the website. Please check them out to get more context on how scraping works.

22 | * 23 | *

Example 1

24 | *

Here would be an example.

25 | * 26 | *

Example 2

27 | *

Here would be the second example.

28 | * 29 | *

Example 3

30 | *

Here would be another example.

31 | * 32 | * 33 | *

34 | */ 35 | $web->go('https://test-pages.phpscraper.de/content/outline.html'); 36 | 37 | // Get the paragraphs 38 | $this->assertSame([ 39 | 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.', 40 | 'There are numerous examples on the website. Please check them out to get more context on how scraping works.', 41 | 'Here would be an example.', 42 | 'Here would be the second example.', 43 | 'Here would be another example.', 44 | '', 45 | ], $web->paragraphs); 46 | } 47 | 48 | /** 49 | * @test 50 | */ 51 | public function cleanParagraphTest() 52 | { 53 | $web = new \Spekulatius\PHPScraper\PHPScraper; 54 | 55 | /** 56 | * Navigate to the test page. This page contains: 57 | * 58 | *

We are testing here!

59 | *

This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.

60 | * 61 | *

Examples

62 | *

There are numerous examples on the website. Please check them out to get more context on how scraping works.

63 | * 64 | *

Example 1

65 | *

Here would be an example.

66 | * 67 | *

Example 2

68 | *

Here would be the second example.

69 | * 70 | *

Example 3

71 | *

Here would be another example.

72 | * 73 | * 74 | *

75 | */ 76 | $web->go('https://test-pages.phpscraper.de/content/outline.html'); 77 | 78 | // Get the cleaned up paragraphs 79 | $this->assertSame([ 80 | 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.', 81 | 'There are numerous examples on the website. Please check them out to get more context on how scraping works.', 82 | 'Here would be an example.', 83 | 'Here would be the second example.', 84 | 'Here would be another example.', 85 | ], $web->cleanParagraphs); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /tests/ParserCsvTest.php: -------------------------------------------------------------------------------- 1 | parseCsv(); 19 | } catch (\Exception $e) { 20 | // Did we get the expected exception? 21 | $this->assertSame( 22 | 'You can not call parseCsv() without parameter or initial navigation.', 23 | $e->getMessage() 24 | ); 25 | } 26 | 27 | // This tests ensures an exception is thrown, if no context is given. 28 | // Context means either it's been navigated before (URL context) or get something to (fetch +) parse 29 | try { 30 | $web = new \Spekulatius\PHPScraper\PHPScraper; 31 | $web->parseCsvWithHeader(); 32 | } catch (\Exception $e) { 33 | // Did we get the expected exception? 
34 | $this->assertSame( 35 | 'You can not call parseCsvWithHeader() without parameter or initial navigation.', 36 | $e->getMessage() 37 | ); 38 | } 39 | } 40 | 41 | /** 42 | * @test 43 | */ 44 | public function testCsvDecodeRaw() 45 | { 46 | $web = new \Spekulatius\PHPScraper\PHPScraper; 47 | 48 | // Only decoding 49 | $this->assertSame( 50 | [ 51 | ['date', 'value'], 52 | ['1945-02-06', '4.20'], 53 | ['1952-03-11', '42'], 54 | ], 55 | $web->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), 56 | ); 57 | 58 | // Fetching and decoding 59 | $this->assertSame( 60 | [ 61 | ['date', 'value'], 62 | ['1945-02-06', '4.20'], 63 | ['1952-03-11', '42'], 64 | ], 65 | $web->csvDecodeRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), 66 | ); 67 | } 68 | 69 | /** 70 | * @test 71 | */ 72 | public function testCsvDecode() 73 | { 74 | $web = new \Spekulatius\PHPScraper\PHPScraper; 75 | 76 | // Only decoding 77 | $this->assertSame( 78 | [ 79 | ['date', 'value'], 80 | ['1945-02-06', 4.20], 81 | ['1952-03-11', 42], 82 | ], 83 | $web->csvDecode("date,value\n1945-02-06,4.20\n1952-03-11,42"), 84 | ); 85 | 86 | // Fetching and decoding 87 | $this->assertSame( 88 | [ 89 | ['date', 'value'], 90 | ['1945-02-06', 4.20], 91 | ['1952-03-11', 42], 92 | ], 93 | $web->csvDecode($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), 94 | ); 95 | } 96 | 97 | /** 98 | * Test with pipe as separator, enclosure and escape. 
99 | * 100 | * @test 101 | */ 102 | public function testCsvDecodeAndCustomEncoding() 103 | { 104 | $web = new \Spekulatius\PHPScraper\PHPScraper; 105 | 106 | $this->assertSame( 107 | [ 108 | ['date', 'value'], 109 | ['1945-02-06', 4.20], 110 | ['1952-03-11', 42], 111 | ['\\'], 112 | ], 113 | $web->csvDecode( 114 | "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\", 115 | '|', 116 | '"', 117 | '\\' 118 | ) 119 | ); 120 | } 121 | 122 | /** 123 | * @test 124 | */ 125 | public function testCsvDecodeWithHeaderRaw() 126 | { 127 | $web = new \Spekulatius\PHPScraper\PHPScraper; 128 | 129 | // Only decoding 130 | $this->assertSame( 131 | [ 132 | ['date' => '1945-02-06', 'value' => '4.20'], 133 | ['date' => '1952-03-11', 'value' => '42'], 134 | ], 135 | $web->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), 136 | ); 137 | 138 | // Fetching and decoding 139 | $this->assertSame( 140 | [ 141 | ['date' => '1945-02-06', 'value' => '4.20'], 142 | ['date' => '1952-03-11', 'value' => '42'], 143 | ], 144 | $web->csvDecodeWithHeaderRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), 145 | ); 146 | } 147 | 148 | /** 149 | * @test 150 | */ 151 | public function testCsvDecodeWithHeaderAndCasting() 152 | { 153 | $web = new \Spekulatius\PHPScraper\PHPScraper; 154 | 155 | $this->assertSame( 156 | [ 157 | ['date' => '1945-02-06', 'value' => 4.20], 158 | ['date' => '1952-03-11', 'value' => 42], 159 | ], 160 | $web->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42"), 161 | ); 162 | } 163 | 164 | /** 165 | * Test with header, pipe as separator, and enclosure. 
166 | * 167 | * @test 168 | */ 169 | public function testCsvDecodeWithHeaderAndCustomEncoding() 170 | { 171 | $web = new \Spekulatius\PHPScraper\PHPScraper; 172 | 173 | $this->assertSame( 174 | [ 175 | ['date' => '1945-02-06', 'value' => 4.20], 176 | ['date' => '1952-03-11', 'value' => 42], 177 | ], 178 | 179 | $web->csvDecodeWithHeader( 180 | "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"", 181 | '|', 182 | '"', 183 | '\\' 184 | ) 185 | ); 186 | } 187 | 188 | /** 189 | * Check the plumbing: Test the various ways to call `parseCsv()`. 190 | * 191 | * @test 192 | */ 193 | public function testDifferentCsvCalls() 194 | { 195 | // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). 196 | $web = new \Spekulatius\PHPScraper\PHPScraper; 197 | 198 | // For the reference we are using a simple CSV and parse it. This matches the hosted CSV. 199 | $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42"; 200 | $csvData = [['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42]]; 201 | 202 | // Case 1: Passing a CSV string in. 203 | $this->assertSame( 204 | // Pass the CSV Data as reference in. 205 | $csvData, 206 | 207 | // Parse the $csvString directly. 208 | (new \Spekulatius\PHPScraper\PHPScraper) 209 | ->parseCsv($csvString) 210 | ); 211 | 212 | // Case 2: `go` + `parseCsv()` 213 | $this->assertSame( 214 | // Pass the CSV Data as reference in. 215 | $csvData, 216 | 217 | // Chained call using a CSV file as URL. 218 | (new \Spekulatius\PHPScraper\PHPScraper) 219 | ->go('https://test-pages.phpscraper.de/test.csv') 220 | ->parseCsv() 221 | ); 222 | 223 | // Case 3: `parseCsv()` with absolute URL. 224 | $this->assertSame( 225 | // Pass the CSV Data as reference in.
226 | $csvData, 227 | 228 | // Pass the absolute URL to `parseCsv()` 229 | (new \Spekulatius\PHPScraper\PHPScraper) 230 | ->parseCsv('https://test-pages.phpscraper.de/test.csv') 231 | ); 232 | 233 | // Case 4: `go` + `parseCsv()` with relative URL. 234 | $this->assertSame( 235 | // Pass the CSV Data as reference in. 236 | $csvData, 237 | 238 | // The 'go' sets the base URL for the following relative path. 239 | (new \Spekulatius\PHPScraper\PHPScraper) 240 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 241 | ->parseCsv('/test.csv') 242 | ); 243 | 244 | // Case 5: `go` with base URL + `go` with relative URL + `parseCsv()`. 245 | // 5.1. Ensure the final URL is correct. 246 | $this->assertSame( 247 | 'https://test-pages.phpscraper.de/test.csv', 248 | 249 | // The first 'go' sets the base URL for the following `go` with relative URL. 250 | (new \Spekulatius\PHPScraper\PHPScraper) 251 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 252 | ->go('/test.csv') 253 | ->currentUrl() 254 | ); 255 | 256 | // 5.2. Ensure the parsed CSV is correct. 257 | $this->assertSame( 258 | // Pass the CSV Data as reference in. 259 | $csvData, 260 | 261 | // The first 'go' sets the base URL for the following `go` with relative URL. 262 | (new \Spekulatius\PHPScraper\PHPScraper) 263 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 264 | ->go('/test.csv') 265 | ->parseCsv() 266 | ); 267 | 268 | // Case 6: With encoding params 269 | $this->assertSame( 270 | // Pass the CSV Data as reference in. 271 | $csvData, 272 | 273 | // The first 'go' sets the base URL for the following `go` with relative URL. 274 | (new \Spekulatius\PHPScraper\PHPScraper) 275 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 276 | ->go('/test-custom.csv') 277 | ->parseCsv(null, '|', '"') 278 | ); 279 | 280 | // Case 7: With encoding params and (relative) URL 281 | $this->assertSame( 282 | // Pass the CSV Data as reference in.
283 | $csvData, 284 | 285 | // The first 'go' sets the base URL for the following `go` with relative URL. 286 | (new \Spekulatius\PHPScraper\PHPScraper) 287 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 288 | ->parseCsv('/test-custom.csv', '|', '"') 289 | ); 290 | } 291 | 292 | /** 293 | * Check the pluming: Test the various ways to call `parseCsvWithHeader()`. 294 | * 295 | * @test 296 | */ 297 | public function testDifferentCsvWithHeaderCalls() 298 | { 299 | // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). 300 | $web = new \Spekulatius\PHPScraper\PHPScraper; 301 | 302 | // For the reference we are using a simple CSV and parse it. This matches the hosted CSV. 303 | $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42"; 304 | $csvData = [ 305 | ['date' => '1945-02-06', 'value' => 4.20], 306 | ['date' => '1952-03-11', 'value' => 42], 307 | ]; 308 | 309 | // Case 1: Passing in an CSV string in. 310 | $this->assertSame( 311 | // Pass the CSV Data as reference in. 312 | $csvData, 313 | 314 | // Parse the $csvString directly. 315 | (new \Spekulatius\PHPScraper\PHPScraper) 316 | ->parseCsvWithHeader($csvString) 317 | ); 318 | 319 | // Case 2: `parseCsvWithHeader()` 320 | $this->assertSame( 321 | // Pass the CSV Data as reference in. 322 | $csvData, 323 | 324 | // Chained call using a CSV file as URL. 325 | (new \Spekulatius\PHPScraper\PHPScraper) 326 | ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv') 327 | ); 328 | 329 | // Case 2: `go` + `parseCsvWithHeader()` 330 | $this->assertSame( 331 | // Pass the CSV Data as reference in. 332 | $csvData, 333 | 334 | // Chained call using a CSV file as URL. 335 | (new \Spekulatius\PHPScraper\PHPScraper) 336 | ->go('https://test-pages.phpscraper.de/test.csv') 337 | ->parseCsvWithHeader() 338 | ); 339 | 340 | // Case 3: `parseCsvWithHeader()` with absolute URL. 341 | $this->assertSame( 342 | // Pass the CSV Data as reference in. 
343 | $csvData, 344 | 345 | // Pass the absolutely URL to `parseCsvWithHeader()` 346 | (new \Spekulatius\PHPScraper\PHPScraper) 347 | ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv') 348 | ); 349 | 350 | // Case 4: `go` + `parseCsvWithHeader()` with relative URL. 351 | $this->assertSame( 352 | // Pass the CSV Data as reference in. 353 | $csvData, 354 | 355 | // The 'go' sets the base URL for the following relative path. 356 | (new \Spekulatius\PHPScraper\PHPScraper) 357 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 358 | ->parseCsvWithHeader('/test.csv') 359 | ); 360 | 361 | // Case 5: `go` with base URL + `go` with relative URL + `parseCsvWithHeader()`. 362 | // 5.1. Ensure the final URL is correct. 363 | $this->assertSame( 364 | 'https://test-pages.phpscraper.de/test.csv', 365 | 366 | // The first 'go' sets the base URL for the following `go` with relative URL. 367 | (new \Spekulatius\PHPScraper\PHPScraper) 368 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 369 | ->go('/test.csv') 370 | ->currentUrl() 371 | ); 372 | 373 | // 5.2. Ensure the parsed CSV is correct. 374 | $this->assertSame( 375 | // Pass the CSV Data as reference in. 376 | $csvData, 377 | 378 | // The first 'go' sets the base URL for the following `go` with relative URL. 379 | (new \Spekulatius\PHPScraper\PHPScraper) 380 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 381 | ->go('/test.csv') 382 | ->parseCsvWithHeader() 383 | ); 384 | 385 | // Case 6: With encoding params 386 | $this->assertSame( 387 | // Pass the CSV Data as reference in. 388 | $csvData, 389 | 390 | // The first 'go' sets the base URL for the following `go` with relative URL. 
391 | (new \Spekulatius\PHPScraper\PHPScraper) 392 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 393 | ->go('/test-custom.csv') 394 | ->parseCsvWithHeader(null, '|', '"') 395 | ); 396 | 397 | // Case 7: With encoding params and (relative) URL 398 | $this->assertSame( 399 | // Pass the CSV Data as reference in. 400 | $csvData, 401 | 402 | // The first 'go' sets the base URL for the following `go` with relative URL. 403 | (new \Spekulatius\PHPScraper\PHPScraper) 404 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 405 | ->parseCsvWithHeader('/test-custom.csv', '|', '"') 406 | ); 407 | } 408 | } 409 | -------------------------------------------------------------------------------- /tests/ParserJsonTest.php: -------------------------------------------------------------------------------- 1 | parseJson(); 19 | } catch (\Exception $e) { 20 | // Did we get the expected exception? 21 | $this->assertSame( 22 | 'You can not call parseJson() without parameter or initial navigation.', 23 | $e->getMessage() 24 | ); 25 | } 26 | } 27 | 28 | /** 29 | * Test the various ways to call `parseJson()`. 30 | * 31 | * @test 32 | */ 33 | public function testDifferentJsonCalls() 34 | { 35 | // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). 36 | $web = new \Spekulatius\PHPScraper\PHPScraper; 37 | 38 | // For the reference we are using a simple JSON and parse it. 39 | $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json'); 40 | $jsonData = json_decode($jsonString, true); 41 | 42 | // Case 1: Passing in an JSON string in. 43 | $this->assertSame( 44 | // Pass the JSON Data as reference in. 45 | $jsonData, 46 | 47 | // Parse the $jsonString directly. 48 | (new \Spekulatius\PHPScraper\PHPScraper) 49 | ->parseJson($jsonString) 50 | ); 51 | 52 | // Case 2: `go` + `parseJson()` 53 | $this->assertSame( 54 | // Pass the JSON Data as reference in. 
55 | $jsonData, 56 | 57 | // Chained call using a JSON file as URL. 58 | (new \Spekulatius\PHPScraper\PHPScraper) 59 | ->go('https://test-pages.phpscraper.de/index.json') 60 | ->parseJson() 61 | ); 62 | 63 | // Case 3: `parseJson()` with absolute URL. 64 | $this->assertSame( 65 | // Pass the JSON Data as reference in. 66 | $jsonData, 67 | 68 | // Pass the absolutely URL to `parseJson()` 69 | (new \Spekulatius\PHPScraper\PHPScraper) 70 | ->parseJson('https://test-pages.phpscraper.de/index.json') 71 | ); 72 | 73 | // Case 4: `go` + `parseJson()` with relative URL. 74 | $this->assertSame( 75 | // Pass the JSON Data as reference in. 76 | $jsonData, 77 | 78 | // The 'go' sets the base URL for the following relative path. 79 | (new \Spekulatius\PHPScraper\PHPScraper) 80 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 81 | ->parseJson('/index.json') 82 | ); 83 | 84 | // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`. 85 | // 5.1. Ensure the final URL is correct. 86 | $this->assertSame( 87 | 'https://test-pages.phpscraper.de/index.json', 88 | 89 | // The first 'go' sets the base URL for the following `go` with relative URL. 90 | (new \Spekulatius\PHPScraper\PHPScraper) 91 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 92 | ->go('/index.json') 93 | ->currentUrl() 94 | ); 95 | 96 | // 5.2. Ensure the parsed JSON is correct. 97 | $this->assertSame( 98 | // Pass the JSON Data as reference in. 99 | $jsonData, 100 | 101 | // The first 'go' sets the base URL for the following `go` with relative URL. 
102 | (new \Spekulatius\PHPScraper\PHPScraper) 103 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 104 | ->go('/index.json') 105 | ->parseJson() 106 | ); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /tests/ParserXmlTest.php: -------------------------------------------------------------------------------- 1 | parseXml(); 19 | } catch (\Exception $e) { 20 | // Did we get the expected exception? 21 | $this->assertSame( 22 | 'You can not call parseXml() without parameter or initial navigation.', 23 | $e->getMessage() 24 | ); 25 | } 26 | } 27 | 28 | /** 29 | * @test 30 | */ 31 | public function testDifferentXmlCalls() 32 | { 33 | // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). 34 | $web = new \Spekulatius\PHPScraper\PHPScraper; 35 | 36 | // For the reference we are using a simple XML and parse it. 37 | $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml'); 38 | $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); 39 | $xmlData = json_decode((string) json_encode($xml), true); 40 | 41 | // Case 1: Passing in an XML string in. 42 | $this->assertSame( 43 | // Pass the XML Data as reference in. 44 | $xmlData, 45 | 46 | // Parse the XML string directly. 47 | (new \Spekulatius\PHPScraper\PHPScraper) 48 | ->parseXml($xmlString) 49 | ); 50 | 51 | // Case 2: `go` + `parseXml()` 52 | $this->assertSame( 53 | // Pass the XML Data as reference in. 54 | $xmlData, 55 | 56 | // Chained call with XML as URL 57 | (new \Spekulatius\PHPScraper\PHPScraper) 58 | ->go('https://test-pages.phpscraper.de/sitemap.xml') 59 | ->parseXml() 60 | ); 61 | 62 | // Case 3: `parseXml()` with absolute URL. 63 | $this->assertSame( 64 | // Pass the XML Data as reference in. 
65 | $xmlData, 66 | 67 | // Pass the absolute URL to `parseXml()` 68 | (new \Spekulatius\PHPScraper\PHPScraper) 69 | ->parseXml('https://test-pages.phpscraper.de/sitemap.xml') 70 | ); 71 | 72 | // Case 4: `go` + `parseXml()` with relative URL. 73 | $this->assertSame( 74 | // Pass the XML Data as reference in. 75 | $xmlData, 76 | 77 | // The 'go' sets the base URL for the following relative path. 78 | (new \Spekulatius\PHPScraper\PHPScraper) 79 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 80 | ->parseXml('/sitemap.xml') 81 | ); 82 | 83 | // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`. 84 | // 5.1. Ensure the final URL is correct. 85 | $this->assertSame( 86 | 'https://test-pages.phpscraper.de/sitemap.xml', 87 | 88 | // The first 'go' sets the base URL for the following `go` with relative URL. 89 | (new \Spekulatius\PHPScraper\PHPScraper) 90 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 91 | ->go('/sitemap.xml') 92 | ->currentUrl() 93 | ); 94 | 95 | // 5.2. Ensure the parsed XML is correct. 96 | $this->assertSame( 97 | // Pass the XML Data as reference in. 98 | $xmlData, 99 | 100 | // The first 'go' sets the base URL for the following `go` with relative URL.
101 | (new \Spekulatius\PHPScraper\PHPScraper) 102 | ->go('https://test-pages.phpscraper.de/meta/feeds.html') 103 | ->go('/sitemap.xml') 104 | ->parseXml() 105 | ); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tests/RedirectTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de'); 16 | 17 | $this->assertNotSame( 18 | $web->currentUrl, 19 | 'https://test-pages.phpscraper.de/' 20 | ); 21 | $this->assertSame( 22 | $web->currentUrl, 23 | 'https://phpscraper.de/' 24 | ); 25 | } 26 | 27 | /** 28 | * @test 29 | */ 30 | public function testDisabledRedirect() 31 | { 32 | $web = new \Spekulatius\PHPScraper\PHPScraper; 33 | 34 | $web->setConfig([ 35 | 'follow_redirects' => false, 36 | 'follow_meta_refresh' => false, 37 | 'max_redirects' => -1, 38 | ]); 39 | 40 | // Navigate to the test page: This redirects to phpscraper.de 41 | $web->go('https://test-pages.phpscraper.de'); 42 | 43 | $this->assertSame( 44 | 'https://test-pages.phpscraper.de', 45 | $web->currentUrl, 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/TitleTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Check the title as not given (null) 18 | $this->assertNull($web->title); 19 | } 20 | 21 | /** 22 | * @test 23 | */ 24 | public function testWithHTMLEntity() 25 | { 26 | $web = new \Spekulatius\PHPScraper\PHPScraper; 27 | 28 | // Navigate to the test page. 29 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html'); 30 | 31 | // Check the title 32 | $this->assertSame( 33 | 'Cat & Mouse', 34 | $web->title 35 | ); 36 | } 37 | 38 | /** 39 | * @test 40 | */ 41 | public function testLoremIpsum() 42 | { 43 | $web = new \Spekulatius\PHPScraper\PHPScraper; 44 | 45 | // Navigate to the test page. 
46 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html'); 47 | 48 | // Check the title 49 | $this->assertSame( 50 | 'Lorem Ipsum', 51 | $web->title 52 | ); 53 | } 54 | 55 | /** 56 | * @test 57 | */ 58 | public function testGermanUmlaute() 59 | { 60 | $web = new \Spekulatius\PHPScraper\PHPScraper; 61 | 62 | // Navigate to the test page. 63 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html'); 64 | 65 | // Check the title 66 | $this->assertSame( 67 | 'A page with plenty of German umlaute everywhere (ä ü ö)', 68 | $web->title 69 | ); 70 | } 71 | 72 | /** 73 | * @test 74 | */ 75 | public function testChineseCharacters() 76 | { 77 | $web = new \Spekulatius\PHPScraper\PHPScraper; 78 | 79 | // Navigate to the test page. 80 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html'); 81 | 82 | // Check the title 83 | $this->assertSame( 84 | 'Page with Chinese Characters all over the place (加油)', 85 | $web->title 86 | ); 87 | } 88 | 89 | /** 90 | * @test 91 | */ 92 | public function testLongTitle() 93 | { 94 | $web = new \Spekulatius\PHPScraper\PHPScraper; 95 | 96 | // Navigate to the test page. 97 | $web->go('https://test-pages.phpscraper.de/title/long-title.html'); 98 | 99 | // Check the title 100 | $this->assertSame( 101 | 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed mollis purus id ex consectetur facilisis. In gravida sodales nisl a consequat. Aenean ipsum sem, congue et rhoncus a, feugiat eget enim. Duis ut malesuada neque. Nam justo est, interdum eu massa in, volutpat vestibulum libero. Mauris a varius mauris, in vulputate ligula. Nulla rhoncus eget purus a sodales. Nulla facilisi. Proin purus purus, sodales non dolor in, lobortis elementum augue. Nulla sagittis, ex eu placerat varius, nulla mi rutrum odio, sit amet lacinia ipsum urna nec massa. 
Quisque posuere mauris id condimentum viverra.', 102 | $web->title 103 | ); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /tests/TwitterCardTest.php: -------------------------------------------------------------------------------- 1 | go('https://test-pages.phpscraper.de/meta/missing.html'); 16 | 17 | // Empty array, because there aren't any twitter cards props set. 18 | $this->assertTrue(is_iterable($web->twitterCard)); 19 | $this->assertTrue(empty($web->twitterCard)); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function testTwitterCard() 26 | { 27 | $web = new \Spekulatius\PHPScraper\PHPScraper; 28 | 29 | // Navigate to the test page. 30 | $web->go('https://test-pages.phpscraper.de/twittercard/example.html'); 31 | 32 | // Check elements 33 | $this->assertSame('summary_large_image', $web->twitterCard['twitter:card']); 34 | $this->assertSame('Lorem Ipsum', $web->twitterCard['twitter:title']); 35 | 36 | // The whole set. 37 | $this->assertSame( 38 | [ 39 | 'twitter:card' => 'summary_large_image', 40 | 'twitter:title' => 'Lorem Ipsum', 41 | 'twitter:description' => 'Lorem ipsum dolor etc.', 42 | 'twitter:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html', 43 | 'twitter:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg', 44 | ], 45 | $web->twitterCard 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/UrlTest.php: -------------------------------------------------------------------------------- 1 | assertNull($web->makeUrlAbsolute(null)); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function validateUriTest() 26 | { 27 | $web = new \Spekulatius\PHPScraper\PHPScraper; 28 | 29 | // We use any URL for this. 30 | $web->go('https://test-pages.phpscraper.de/content/lists.html'); 31 | 32 | // Ensure the URL is set correctly. 
33 | $this->assertSame( 34 | 'https://test-pages.phpscraper.de/content/lists.html', 35 | $web->currentUrl 36 | ); 37 | 38 | // Ensure the host is parsed correctly. 39 | $this->assertSame( 40 | 'test-pages.phpscraper.de', 41 | $web->currentHost 42 | ); 43 | 44 | // Ensure the host with protocol is parsed correctly. 45 | $this->assertSame( 46 | 'https://test-pages.phpscraper.de', 47 | $web->currentBaseHost 48 | ); 49 | } 50 | 51 | /** 52 | * @test 53 | */ 54 | public function testCurrentBaseHostWithBase() 55 | { 56 | $web = new \Spekulatius\PHPScraper\PHPScraper; 57 | 58 | // Navigate to the test page. 59 | // Contains: 60 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html'); 61 | 62 | // Check the base href being passed through the current base host. 63 | $this->assertSame( 64 | 'https://test-pages-with-base-href.phpscraper.de', 65 | $web->currentBaseHost 66 | ); 67 | } 68 | 69 | /** 70 | * Basic processing of the URLs. 71 | * 72 | * @test 73 | */ 74 | public function testMakeUrlAbsolute() 75 | { 76 | $web = new \Spekulatius\PHPScraper\PHPScraper; 77 | 78 | // Navigate to test page: This sets the base URL. 79 | $web->go('https://phpscraper.de'); 80 | 81 | // Test variations of paths to be processed 82 | // With leading slash 83 | $this->assertSame( 84 | 'https://phpscraper.de/index.html', 85 | $web->makeUrlAbsolute('/index.html'), 86 | ); 87 | 88 | // Without leading slash 89 | $this->assertSame( 90 | 'https://phpscraper.de/index.html', 91 | $web->makeUrlAbsolute('index.html'), 92 | ); 93 | 94 | // Paths are considered. 95 | $this->assertSame( 96 | 'https://phpscraper.de/test/index.html', 97 | $web->makeUrlAbsolute('test/index.html'), 98 | ); 99 | 100 | // Absolutely URLs are untouched. 
101 | $this->assertSame( 102 | 'https://example.com/index.html', 103 | $web->makeUrlAbsolute('https://example.com/index.html'), 104 | ); 105 | 106 | // Protocol is considered 107 | $this->assertSame( 108 | 'http://example.com/index.html', 109 | $web->makeUrlAbsolute('http://example.com/index.html'), 110 | ); 111 | } 112 | 113 | /** 114 | * Basic processing of the URLs. 115 | * 116 | * @test 117 | */ 118 | public function testMakeUrlAbsoluteConsiderBaseHref() 119 | { 120 | $web = new \Spekulatius\PHPScraper\PHPScraper; 121 | 122 | /** 123 | * Navigate to test page: This sets the base URL. 124 | * 125 | * It contains: 126 | * 127 | * ```html 128 | * 129 | * ``` 130 | * 131 | * While it's located on `test-pages.phpscraper.de`. 132 | * 133 | * This page isn't actually used. It's purely to set the context. 134 | */ 135 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html'); 136 | 137 | // Test variations of paths to be processed 138 | // With leading slash 139 | $this->assertSame( 140 | 'https://test-pages-with-base-href.phpscraper.de/index.html', 141 | $web->makeUrlAbsolute('/index.html'), 142 | ); 143 | 144 | // Without leading slash 145 | $this->assertSame( 146 | 'https://test-pages-with-base-href.phpscraper.de/index.html', 147 | $web->makeUrlAbsolute('index.html'), 148 | ); 149 | 150 | // Paths are considered. 151 | $this->assertSame( 152 | 'https://test-pages-with-base-href.phpscraper.de/test/index.html', 153 | $web->makeUrlAbsolute('test/index.html'), 154 | ); 155 | 156 | // Absolutely URLs are untouched. 157 | $this->assertSame( 158 | 'https://example.com/index.html', 159 | $web->makeUrlAbsolute('https://example.com/index.html'), 160 | ); 161 | 162 | // Protocol is considered 163 | $this->assertSame( 164 | 'http://example.com/index.html', 165 | $web->makeUrlAbsolute('http://example.com/index.html'), 166 | ); 167 | } 168 | 169 | /** 170 | * Test if passed in hosts are considered. It trumps any base-href and current url. 
171 | * 172 | * @test 173 | */ 174 | public function testMakeUrlAbsoluteWithBaseHost() 175 | { 176 | $web = new \Spekulatius\PHPScraper\PHPScraper; 177 | 178 | // Navigate to test page: This sets the base URL. 179 | $web->go('https://phpscraper.de'); 180 | 181 | // Test variations of paths to be processed 182 | // With leading slash 183 | $this->assertSame( 184 | 'https://example.com/index.html', 185 | $web->makeUrlAbsolute('/index.html', 'https://example.com'), 186 | ); 187 | 188 | // Without leading slash 189 | $this->assertSame( 190 | 'https://example.com/index.html', 191 | $web->makeUrlAbsolute('index.html', 'https://example.com'), 192 | ); 193 | 194 | // Paths are considered. 195 | $this->assertSame( 196 | 'https://example.com/test/index.html', 197 | $web->makeUrlAbsolute('test/index.html', 'https://example.com'), 198 | ); 199 | 200 | // Absolute URLs are untouched. 201 | $this->assertSame( 202 | 'https://example.com/index.html', 203 | $web->makeUrlAbsolute('https://example.com/index.html', 'https://example-2.com/test/with/path'), 204 | ); 205 | 206 | // Protocol is considered 207 | $this->assertSame( 208 | 'http://example.com/index.html', 209 | $web->makeUrlAbsolute('http://example.com/index.html', 'https://example-2.com/test/with/path'), 210 | ); 211 | } 212 | } 213 | --------------------------------------------------------------------------------