├── .github ├── dependabot.yml └── workflows │ ├── stale.yaml │ └── test.yaml ├── .idea ├── .name ├── ElementFinder.iml ├── inspectionProfiles │ └── Project_Default.xml ├── php.xml └── vcs.xml ├── .php_cs_config.php ├── .run └── Tests.run.xml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── composer.json ├── doc └── using_css_selectors.md ├── ecs.php ├── phpunit.xml ├── rector.php └── src ├── Collection ├── ElementCollection.php ├── Filters │ └── StringFilter │ │ ├── RegexStringFilter.php │ │ └── StringFilterInterface.php ├── Modify │ └── StringModify │ │ ├── RegexReplace.php │ │ └── StringModifyInterface.php ├── ObjectCollection.php └── StringCollection.php ├── CssExpressionTranslator ├── CssExpressionTranslator.php └── CssOrXpathExpressionTranslator.php ├── DomNodeListAction ├── DomNodeListActionInterface.php └── RemoveNodes.php ├── ElementFinder.php ├── ElementFinder └── Element.php ├── ElementFinderInterface.php ├── ExpressionTranslator ├── ExpressionTranslatorInterface.php └── XpathExpression.php └── Helper ├── FormHelper.php ├── NodeHelper.php └── StringHelper.php /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: composer 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: github-actions 9 | directory: "/" 10 | schedule: 11 | interval: monthly 12 | open-pull-requests-limit: 10 13 | -------------------------------------------------------------------------------- /.github/workflows/stale.yaml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 22 * * *' 6 | 7 | jobs: 8 | stale: 9 | runs-on: ubuntu-22.04 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | 14 | steps: 15 | - name: 'Mark stale issues and PRs' 16 | uses: actions/stale@v9 17 | with: 18 | 
repo-token: ${{ secrets.GITHUB_TOKEN }} 19 | days-before-issue-stale: 10 20 | days-before-pr-stale: 10 21 | days-before-issue-close: 10 22 | days-before-pr-close: 10 23 | stale-issue-label: 'no-issue-activity' 24 | stale-issue-message: 'This issue is stale because it has been open 10 days with no activity. Remove stale label or comment or this will be closed in 10 days.' 25 | close-issue-message: 'This issue was closed because it has been stalled for 10 days with no activity.' 26 | stale-pr-label: 'no-pr-activity' 27 | stale-pr-message: 'This PR is stale because it has been open 10 days with no activity. Remove stale label or comment or this will be closed in 10 days.' 28 | close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.' 29 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | schedule: 5 | - cron: '00 1 * * 1' # At 01:00 on Mondays. 
6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | run: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | include: 19 | - php-version: '8.2' 20 | main: true 21 | - php-version: '8.3' 22 | - php-version: '8.4' 23 | nightly: true 24 | name: PHP ${{ matrix.php-version }} 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Setup PHP 30 | uses: shivammathur/setup-php@v2 31 | with: 32 | coverage: pcov 33 | php-version: ${{ matrix.php-version }} 34 | 35 | - name: Get Composer Cache Directory 36 | id: composer-cache 37 | run: echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT 38 | 39 | - uses: actions/cache@v4 40 | with: 41 | path: ${{ steps.composer-cache.outputs.dir }} 42 | key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.json') }} 43 | restore-keys: ${{ runner.os }}-composer- 44 | 45 | - name: Install dependencies 46 | run: composer install --prefer-dist 47 | 48 | - name: Run tests 49 | continue-on-error: ${{ matrix.nightly }} 50 | run: ./vendor/bin/phpunit 51 | 52 | - name: Run code style check 53 | if: ${{ matrix.main }} 54 | run: ./vendor/bin/ecs 55 | 56 | - name: Run rector 57 | if: ${{ matrix.main }} 58 | run: ./vendor/bin/rector --dry-run 59 | 60 | - name: Upload coverage reports to Codecov 61 | if: ${{ matrix.main }} 62 | uses: codecov/codecov-action@v5 63 | with: 64 | fail_ci_if_error: true 65 | files: ./.tmp/clover.xml 66 | env: 67 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 68 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | ElementFinder -------------------------------------------------------------------------------- /.idea/ElementFinder.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 
21 | 22 | 23 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 133 | -------------------------------------------------------------------------------- /.idea/php.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.php_cs_config.php: -------------------------------------------------------------------------------- 1 | in(__DIR__.'/src') 5 | ->in(__DIR__ . 
'/tests') 6 | ; 7 | 8 | return PhpCsFixer\Config::create() 9 | ->setFinder($finder) 10 | ->setRiskyAllowed(true) 11 | ->setRules([ 12 | '@PSR2' => true, 13 | 'psr4' => true, 14 | 'phpdoc_indent' => true, 15 | 'array_syntax' => ['syntax' => 'short'], 16 | 'blank_line_before_statement' => false, 17 | 'strict_comparison' => true, 18 | 'strict_param' => true, 19 | 'no_null_property_initialization' => true, 20 | 'yoda_style' => false, 21 | 'ordered_imports' => ['sortAlgorithm' => 'alpha'], 22 | 'ordered_class_elements' => [ 23 | 'use_trait', 24 | 'constant_public', 25 | 'constant_protected', 26 | 'constant_private', 27 | 'property_public', 28 | 'property_protected', 29 | 'property_private', 30 | 'construct', 31 | 'destruct', 32 | 'magic', 33 | 'phpunit', 34 | 'method_public', 35 | 'method_protected', 36 | 'method_private' 37 | ], 38 | ]); 39 | -------------------------------------------------------------------------------- /.run/Tests.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All Notable changes to `ElementFinder` will be documented in this file 3 | ## 3.0.0 [2025-02-08] 4 | - Minimum required php version is now php 8.2 5 | 6 | 7 | ## 2.0.0 [2023-01-11] 8 | - Move to php 8.1 9 | - Add `ExpressionTranslator` (merged from the [xparse/expression-translator](https://github.com/xparse/ExpressionTranslator) repository) 10 | - Add `CssExpressionTranslator` (merged from the [xparse/css-expression-translator](https://github.com/xparse/CssExpressionTranslator) repository) 11 | - Constants ElementFinder::DOCUMENT_HTML, ElementFinder::DOCUMENT_XML marked as final 12 | 13 | ## 1.0.1 [2021-06-01] 14 | - Move xparse/expression-translator to 1.0.0 15 | 16 | ## 1.0.0 [2021-05-31] 17 | - No breaking changes. 
18 | 19 | ## 0.5.1-alpha [2021-04-22] 20 | - Move to php 7.3 21 | - Add php 8.0 support 22 | 23 | ## 0.5.1 [2019-05-23] 24 | - Allow pass empty string to constructor `ElementFinder` 25 | 26 | ## 0.5.0 [2019-05-14] 27 | - Use `ElementFinderInterface` instead of `ElementFinder` class 28 | - Move to php 7.1 29 | 30 | ## 0.4.0 [2019-05-11] 31 | - Remove ElementFinder\ElementFinderModifierInterface see DomNodeListAction\DomNodeListActionInterface 32 | - Remove ElementFinder\RemoveElements see DomNodeListAction\RemoveNodes 33 | - Remove deprecated class RegexHelper 34 | - Specify types 35 | - Add final modifiers for all public methods 36 | ## 0.3.1 [2018-04-18] 37 | - Add new method: `ElementFinder::modify`. 38 | 39 | ## 0.3.0 [2018-03-22] 40 | 41 | ### Changed 42 | - #84 `ElementFinder` become immutable 43 | - #84 method `ElementFinder::remove` return `new ElementFinder()` 44 | - #84 method `ElementFinder::element` return copy of the element 45 | - Make second argument required `\Xparse\ElementFinder\Collection\StringCollection::replace` 46 | - #87 Remove exceptions from the constructor `ElementCollection`, `StringCollection`, `ObjectCollection` 47 | 48 | ### Deprecated 49 | - #95 deprecate internal method `\Xparse\ElementFinder\Helper\RegexHelper::match` 50 | 51 | ### Removed 52 | - #86 Remove deprecated method `\Xparse\ElementFinder\ElementFinder::match` 53 | - #86 Remove deprecated method `\Xparse\ElementFinder\ElementFinder::__toString` 54 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\StringCollection::getLast` 55 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\StringCollection::getFirst` 56 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\StringCollection::getItems` 57 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\StringCollection::walk` 58 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ObjectCollection::getLast` 59 | - #86 Remove deprecated method 
`\Xparse\ElementFinder\Collection\ObjectCollection::getFirst` 60 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ObjectCollection::getItems` 61 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ObjectCollection::walk` 62 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ElementCollection::getLast` 63 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ElementCollection::getFirst` 64 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ElementCollection::getItems` 65 | - #86 Remove deprecated method `\Xparse\ElementFinder\Collection\ElementCollection::walk` 66 | - #86 Remove deprecated method `` 67 | 68 | ## 0.2.1 [2017-12-27] 69 | 70 | ### Deprecated 71 | - #98 Deprecate `ElementCollection::walk` 72 | - #98 Deprecate `StringCollection::walk` 73 | - #98 Deprecate `ObjectCollection::walk` 74 | - #100 Deprecate `ElementCollection::getItems` see `ElementCollection::all` 75 | - #100 Deprecate `StringCollection::getItems` see `StringCollection::all` 76 | - #100 Deprecate `ObjectCollection::getItems` see `ObjectCollection::all` 77 | - #99 Deprecate `ElementCollection::getFirst` see `ElementCollection::first` 78 | - #99 Deprecate `StringCollection::getFirst` see `StringCollection::first` 79 | - #99 Deprecate `ObjectCollection::getFirst` see `ObjectCollection::first` 80 | - #99 Deprecate `ElementCollection::getLast` see `ElementCollection::last` 81 | - #99 Deprecate `StringCollection::getLast` see `StringCollection::last` 82 | - #99 Deprecate `ObjectCollection::getLast` see `ObjectCollection::last` 83 | 84 | ### Changed 85 | - #92 Require second parameter `StringCollection::replace` 86 | - #93 All public methods become final. 
87 | - #94 All protected methods become private 88 | 89 | 90 | ## 0.2.0 [2017-11-02] 91 | 92 | ### Deprecated 93 | - #85 Deprecate `ElementFinder::match` 94 | - #88 Deprecate `ElementFinder::__toString` 95 | 96 | ### Removed 97 | - #82 Remove method `ElementCollection::getAttributes` 98 | - #82 Remove method `ElementFinder::setExpressionTranslator()` 99 | - #82 Remove method `ElementFinder::getExpressionTranslator()` 100 | - #82 Remove method `ObjectCollection::append()` 101 | - #82 Remove method `ElementFinder::replace()` 102 | - #82 Remove method `ElementFinder::getType()` 103 | - #82 Remove method `ElementFinder::getOptions()` 104 | - #83 Remove method `ElementFinder::query` 105 | 106 | 107 | ## 0.1.0-alpha.7 [2017-08-21] 108 | ### Added 109 | - #81 Introduce new map method `StringCollection::map()`. 110 | - #48 Introduce new filter method `StringCollection::filter`. 111 | - #72 Add 3 argument to the `ElementFinder::__construct`. Now you can pass `ExpressionTranslatorInterface` 112 | 113 | ### Removed 114 | - #75 Remove `options` parameter from the `ElementFinder::__construct` 115 | 116 | ### Deprecated 117 | - #80 Deprecate `ElementCollection::getAttributes` 118 | - #72 Deprecate `ElementFinder::setExpressionTranslator()` 119 | - #72 Deprecate `ElementFinder::getExpressionTranslator()` 120 | - #66 Deprecate `ObjectCollection::append()` 121 | - #70 Deprecate `ElementFinder::replace()` 122 | - #77 Deprecate `ElementFinder::getType()` 123 | - #74 Deprecate `ElementFinder::getOptions()` 124 | 125 | 126 | 127 | ## 0.1.0-alpha.6 [2017-08-16] 128 | 129 | ### Fixed 130 | - #62 `FormHelper` return value attribute in select elements. 
131 | 132 | ### Deprecated 133 | - #58 Fire error if we try to store non string values inside `StringCollection` 134 | - #57 Deprecate method `ElementFinder::node` use `ElementFinder::element` instead 135 | 136 | ### Removed 137 | - #75 Method `ElementFinder::node` 138 | - #53 Remove `ArrayAccess` interface from the `StringCollection`, `ObjectCollection` and `ElementCollection` 139 | - #52 Replace `Iterator` with `IteratorAggregate` interface inside `StringCollection`, `ObjectCollection` and `ElementCollection` 140 | - #55 Remove (`StringCollection::prepend`,`StringCollection::addAfter`,`StringCollection::slice`,`StringCollection::extractItems`,`StringCollection::getNext`,`StringCollection::getPrevious`, `StringCollection::append`, `StringCollection::setItems`) 141 | - #55 Remove (`ObjectCollection::prepend`,`ObjectCollection::addAfter`,`ObjectCollection::slice`,`ObjectCollection::extractItems`,`ObjectCollection::getNext`,`ObjectCollection::getPrevious`,`ObjectCollection::append`,`ObjectCollection::setItems`) 142 | - #55 Remove (`ElementCollection::prepend`,`ElementCollection::addAfter`,`ElementCollection::slice`,`ElementCollection::extractItems`,`ElementCollection::getNext`,`ElementCollection::getPrevious`, `ElementCollection::append`, `ElementCollection::setItems`) 143 | - #51 Remove (`ElementCollection::map`,`ObjectCollection::map`,`StringCollection::map`) 144 | - Remove `StringCollection::item` use `StringCollection::get` instead 145 | - Remove `ObjectCollection::item` use `ObjectCollection::get` instead 146 | - Remove method `FormHelper::getDefaultFormData` use `FormHelper::getFormData` instead 147 | - #59 Remove method `ObjectCollection::replace` 148 | 149 | ### Changed 150 | - #54 Return new collection instead of modification (`StringCollection::replace`,`ObjectCollection::replace`) 151 | 152 | ### Added 153 | - #50 Add `StringCollection::unique` function 154 | - #56 Add `StringCollection::merge`, `ObjectCollection::merge` and `ElementCollection::merge` 
functions 155 | - #60 Add `StringCollection::add`,`StringCollection::get` methods 156 | - #60 Add `ObjectCollection::add`,`ObjectCollection::get` methods 157 | - #60 Add `ElementCollection::add`,`ElementCollection::get` methods 158 | 159 | ## 0.1.0-alpha.5 [2017-03-10] 160 | 161 | ### Added 162 | - strict types declaration 163 | 164 | ### Changed 165 | - all external collections where moved to appropriate ElementFinder collections 166 | 167 | ### Deprecated 168 | - ArrayAccessible methods in Collections (offsetSet, offsetExists, offsetUnset, offsetGet) 169 | - #49 deprecate `StringCollection::map`, `ObjectCollection::map`, `ElementCollection::map` use `walk` instead 170 | 171 | ### Removed 172 | - fiv/collection package 173 | 174 | ## 0.1.0-alpha.3 [2016-06-02] 175 | 176 | ### Fixed 177 | - #33 copy expression translator to child objects 178 | 179 | ### Removed 180 | - method `ElementFinder::attribute()` has been removed 181 | - method `ElementFinder::elements()` has been removed 182 | - method `ElementFinder::getNodeItems()` has been removed 183 | - method `ElementFinder::html()` has been removed 184 | - method `NodeHelper::getInnerHtml()` has been removed 185 | - method `NodeHelper::getOuterHtml()` has been removed 186 | 187 | 188 | ## 0.1.0-alpha.2 [2016-05-25] 189 | 190 | ### Added 191 | - Added `ElementFinder::query()` as an alias of `ElementFinder::node()` 192 | 193 | ### Changed 194 | - #18 Skip `XpathExpression` creation. Use `CssExpression` only when needed. 
195 | 196 | ### Deprecated 197 | - #29 `ElementFinder::getNodeItems()` 198 | - #28 Method `ElementFinder::elements()` has been renamed to `ElementFinder::element()` 199 | - #28 Method `ElementFinder::html()` has been renamed to `ElementFinder::content()` 200 | - #28 Method `ElementFinder::query()` has been renamed to `ElementFinder::executeQuery()` 201 | - #28 Method `NodeHelper::getOuterHtml()` has been renamed to `NodeHelper::getOuterContent()` 202 | - #28 Method `NodeHelper::getInnerHtml()` has been renamed to `NodeHelper::getInnerContent()` 203 | - #10 `ElementFinder::attribute()`. See `ElementFinder::value()` 204 | - #14 Remove 3 parameter inside `ElementFinder::KeyValue()` 205 | 206 | ## Version 0.0.3 207 | 208 | ### Changed 209 | - Feature #4 Use `DOMAttr::nodeValue` instead of `DOMAttr::value` 210 | - BC #7 Refactor `Helper` class. Create `FormHelper`, `NodeHelper` and `StringHelper` 211 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are **welcome** and will be fully **credited**. 4 | 5 | We accept contributions via Pull Requests on [Github](https://github.com/xparse/ElementFinder). 6 | 7 | 8 | ## Pull Requests 9 | 10 | - **Add tests!** - Your patch won't be accepted if it doesn`t have tests. 11 | 12 | - **Document any change in behaviour** - Make sure the `README.md` and any other relevant documentation are kept up-to-date. 13 | 14 | - **Consider our release cycle** - We try to follow [SemVer v2.0.0](http://semver.org/). Randomly breaking public APIs is not an option. 15 | 16 | - **Create feature branches** - Don't ask us to pull from your master branch. 17 | 18 | - **One pull request per feature** - If you want to do more than one thing, send multiple pull requests. 19 | 20 | - **Send coherent history** - Make sure each individual commit in your pull request is meaningful. 
If you had to make multiple intermediate commits while developing, please squash them before submitting. 21 | 22 | 23 | ## Running Tests 24 | 25 | ``` bash 26 | ./vendor/bin/phpunit 27 | ``` 28 | 29 | 30 | **Happy coding**! 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Ivan Shcherbak 4 | 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy 6 | > of this software and associated documentation files (the "Software"), to deal 7 | > in the Software without restriction, including without limitation the rights 8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | > copies of the Software, and to permit persons to whom the Software is 10 | > furnished to do so, subject to the following conditions: 11 | > 12 | > The above copyright notice and this permission notice shall be included in 13 | > all copies or substantial portions of the Software. 14 | > 15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | > THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ElementFinder 2 | 3 | [![Latest Version](https://img.shields.io/packagist/v/xparse/element-finder.svg?style=flat-square)](https://packagist.org/packages/xparse/element-finder) 4 | [![Software License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square)](LICENSE.md) 5 | [![CI](https://github.com/xparse/ElementFinder/actions/workflows/test.yaml/badge.svg)](https://github.com/xparse/ElementFinder/actions/workflows/test.yaml) 6 | [![codecov](https://codecov.io/github/xparse/ElementFinder/graph/badge.svg?token=yYnurpoyxA)](https://codecov.io/github/xparse/ElementFinder) 7 | [![Total Downloads](https://img.shields.io/packagist/dt/xparse/element-finder.svg?style=flat-square)](https://packagist.org/packages/xparse/element-finder) 8 | 9 | Extract data from html with elegant xpath/css expressions and prepare data with regexp in single line. 10 | 11 | ## Install 12 | 13 | Via Composer 14 | 15 | ``` bash 16 | $ composer require xparse/element-finder 17 | ``` 18 | 19 | ## Usage 20 | 21 | ``` php 22 | $page = new ElementFinder($html); 23 | $title = $page->value('//title')->first(); 24 | echo $title; 25 | ``` 26 | 27 | ## Advanced usage with regexp 28 | 29 | 30 | ``` php 31 | $page = new \Xparse\ElementFinder\ElementFinder(' 32 | 33 |
34 | 044-12-12, 35 | 258-16-16 36 |
37 | 38 |
39 | (148) 04-55-16 40 |
41 | 42 | '); 43 | 44 | $tels = $page->value('//*[@class="tels"]')->split('!,!')->replace("![^0-9]!", ''); 45 | print_r($tels->all()); 46 | 47 | /* 48 | [0] => 0441212 49 | [1] => 2581616 50 | [2] => 148045516 51 | */ 52 | 53 | 54 | ``` 55 | 56 | ## Css selectors 57 | Read this document. [Using css selectors](doc/using_css_selectors.md). 58 | 59 | ## Testing 60 | 61 | ``` bash 62 | ./vendor/bin/phpunit 63 | ``` 64 | 65 | ## Contributing 66 | 67 | Please see [CONTRIBUTING](https://github.com/xparse/ElementFinder/blob/master/CONTRIBUTING.md) for details. 68 | 69 | ## Credits 70 | 71 | - [funivan](https://github.com/funivan) 72 | - [All Contributors](https://github.com/xparse/ElementFinder/contributors) 73 | 74 | ## Xpath info 75 | - [XPath/CSS Equivalents](https://en.wikibooks.org/wiki/XPath/CSS_Equivalents) 76 | - [Choose between XPath and jQuery with an XPath-jQuery phrase book](http://www.ibm.com/developerworks/library/x-xpathjquery/) 77 | - [XPath and CSS Selectors](http://ejohn.org/blog/xpath-css-selectors/) 78 | 79 | ## License 80 | 81 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information. 
82 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xparse/element-finder", 3 | "description": "Elegant data scraping", 4 | "keywords": [ 5 | "xpath", 6 | "parser", 7 | "dom" 8 | ], 9 | "homepage": "https://github.com/xparse/ElementFinder", 10 | "license": "MIT", 11 | "authors": [ 12 | { 13 | "name": "Ivan Shcherbak", 14 | "email": "alotofall@gmail.com", 15 | "homepage": "http://funivan.com/", 16 | "role": "Developer" 17 | } 18 | ], 19 | "require": { 20 | "php": "^8.2", 21 | "ext-dom": "*", 22 | "ext-libxml": "*", 23 | "symfony/css-selector": "^7.1" 24 | }, 25 | "require-dev": { 26 | "friendsofphp/php-cs-fixer": "^3.16", 27 | "phpunit/phpunit": "^11.2.9", 28 | "rector/rector": "^2.0.8", 29 | "symplify/easy-coding-standard": "^12.1" 30 | }, 31 | "autoload": { 32 | "psr-4": { 33 | "Xparse\\ElementFinder\\": "src" 34 | } 35 | }, 36 | "autoload-dev": { 37 | "psr-4": { 38 | "Test\\Xparse\\ElementFinder\\": "./tests" 39 | } 40 | }, 41 | "config": { 42 | "optimize-autoloader": true, 43 | "sort-packages": true 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /doc/using_css_selectors.md: -------------------------------------------------------------------------------- 1 | # Using CSS selectors 2 | XPath is a very powerful query language. But sometimes you do not need this power. You just need to grab some page in a simple way - using CSS selectors. 3 | CSS selectors are widely used. They are simple. 
4 | 5 | 6 | Since version 2.0.0 the CSS translator is bundled with `xparse/element-finder`; the separate `xparse/css-expression-translator` package is no longer needed. 7 | 8 | Install it via composer: 9 | ```sh 10 | composer require xparse/element-finder 11 | ``` 12 | 13 | Configure element finder 14 | ```php 15 | $finder = new ElementFinder($html, ElementFinder::DOCUMENT_HTML, new CssExpressionTranslator()); 16 | ``` 17 | 18 | ## Example 19 | Here is a full working example: 20 | ```php 21 | 22 | require 'vendor/autoload.php'; 23 | 24 | use Xparse\ElementFinder\CssExpressionTranslator\CssExpressionTranslator; 25 | use Xparse\ElementFinder\ElementFinder; 26 | 27 | 28 | $finder = new ElementFinder('
29 | 123 30 | 321ad 31 |
', ElementFinder::DOCUMENT_HTML, new CssExpressionTranslator()); 32 | 33 | 34 | # 321ad 35 | echo $finder->content('a.test')->first(); 36 | ``` 37 | 38 | ## How it works? 39 | This library is built on top of the `symfony/css-selector` [https://github.com/symfony/css-selector](https://github.com/symfony/css-selector) 40 | 41 | ## How to select attributes with css? 42 | Add a space before the attribute name. 43 | ```php 44 | $finder->attributes('a @href'); 45 | $finder->attributes('a.test @class'); 46 | 47 | // select node text 48 | $finder->value('a.test node()'); 49 | ``` 50 | 51 | ## Limits 52 | There are some limits. 53 | - Xpath is more powerful than css. 54 | - you can't select attributes with `or` operator 55 | - fetch function result `a concat('text:', text())` 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /ecs.php: -------------------------------------------------------------------------------- 1 | paths([ 13 | __DIR__ . '/src', 14 | __DIR__ . '/tests', 15 | __FILE__, 16 | ]); 17 | 18 | $ecsConfig->rules([ 19 | NoUnusedImportsFixer::class, 20 | VoidReturnFixer::class, 21 | DeclareStrictTypesFixer::class, 22 | ]); 23 | 24 | // this way you can add sets - group of rules 25 | $ecsConfig->sets([SetList::SPACES, SetList::ARRAY, SetList::DOCBLOCK, SetList::NAMESPACES, SetList::COMMENTS, SetList::PSR_12]); 26 | }; 27 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ./tests 19 | 20 | 21 | 22 | 23 | ./src 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rector.php: -------------------------------------------------------------------------------- 1 | paths([__DIR__ . '/src', __DIR__ . 
'/tests', __FILE__]); 14 | 15 | // register a single rule 16 | $rectorConfig->rule(InlineConstructorDefaultToPropertyRector::class); 17 | $rectorConfig->rule(TypedPropertyFromStrictConstructorRector::class); 18 | $rectorConfig->rule(SeparateMultiUseImportsRector::class); 19 | $rectorConfig->importNames(); 20 | $rectorConfig->sets([LevelSetList::UP_TO_PHP_81, PHPUnitSetList::PHPUNIT_90]); 21 | }; 22 | -------------------------------------------------------------------------------- /src/Collection/ElementCollection.php: -------------------------------------------------------------------------------- 1 | 16 | */ 17 | class ElementCollection implements IteratorAggregate, Countable 18 | { 19 | /** 20 | * @var bool 21 | */ 22 | private $validated = false; 23 | 24 | /** 25 | * @param Element[] $items 26 | * @throws InvalidArgumentException 27 | */ 28 | public function __construct( 29 | private readonly array $items = [] 30 | ) { 31 | } 32 | 33 | /** 34 | * @throws InvalidArgumentException 35 | */ 36 | final public function count(): int 37 | { 38 | return \count($this->all()); 39 | } 40 | 41 | /** 42 | * @throws InvalidArgumentException 43 | */ 44 | final public function last(): ?Element 45 | { 46 | $items = $this->all(); 47 | if (\count($items) === 0) { 48 | return null; 49 | } 50 | return end($items); 51 | } 52 | 53 | /** 54 | * @throws InvalidArgumentException 55 | */ 56 | final public function first(): ?Element 57 | { 58 | $items = $this->all(); 59 | if (\count($items) === 0) { 60 | return null; 61 | } 62 | return reset($items); 63 | } 64 | 65 | /** 66 | * @throws InvalidArgumentException 67 | */ 68 | final public function get(int $index): ?Element 69 | { 70 | return $this->all()[$index] ?? null; 71 | } 72 | 73 | /** 74 | * @return Element[] 75 | * @throws InvalidArgumentException 76 | */ 77 | final public function all(): array 78 | { 79 | if (! $this->validated) { 80 | foreach ($this->items as $key => $item) { 81 | if (! 
$item instanceof Element) { 82 | $className = ($item === null) ? \gettype($item) : $item::class; 83 | throw new InvalidArgumentException( 84 | sprintf( 85 | 'Invalid object type. Expect %s given %s Check item %d', 86 | Element::class, 87 | $className, 88 | $key 89 | ) 90 | ); 91 | } 92 | } 93 | } 94 | 95 | return $this->items; 96 | } 97 | 98 | /** 99 | * @throws InvalidArgumentException 100 | */ 101 | final public function merge(ElementCollection $collection): ElementCollection 102 | { 103 | return new ElementCollection(array_merge($this->all(), $collection->all())); 104 | } 105 | 106 | /** 107 | * @throws InvalidArgumentException 108 | */ 109 | final public function add(Element $element): ElementCollection 110 | { 111 | $items = $this->all(); 112 | $items[] = $element; 113 | return new ElementCollection($items); 114 | } 115 | 116 | /** 117 | * Retrieve an external iterator 118 | * 119 | * @link http://php.net/manual/en/iteratoraggregate.getiterator.php 120 | * @return Element[]|Traversable An instance of an object implementing Iterator or Traversable 121 | * @throws InvalidArgumentException 122 | */ 123 | final public function getIterator(): Traversable 124 | { 125 | return new ArrayIterator($this->all()); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/Collection/Filters/StringFilter/RegexStringFilter.php: -------------------------------------------------------------------------------- 1 | regex, $input) === 1; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Collection/Filters/StringFilter/StringFilterInterface.php: -------------------------------------------------------------------------------- 1 | from, $this->to, $input); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/Collection/Modify/StringModify/StringModifyInterface.php: 
-------------------------------------------------------------------------------- 1 | 17 | */ 18 | class ObjectCollection implements IteratorAggregate, Countable 19 | { 20 | /** 21 | * @var bool 22 | */ 23 | private $validated = false; 24 | 25 | /** 26 | * @param ElementFinderInterface[] $items 27 | * @throws Exception 28 | */ 29 | public function __construct( 30 | private readonly array $items = [] 31 | ) { 32 | } 33 | 34 | /** 35 | * @throws InvalidArgumentException 36 | */ 37 | final public function count(): int 38 | { 39 | return \count($this->all()); 40 | } 41 | 42 | /** 43 | * @throws InvalidArgumentException 44 | */ 45 | final public function last(): ?ElementFinderInterface 46 | { 47 | $items = $this->all(); 48 | if ($items === []) { 49 | return null; 50 | } 51 | return end($items); 52 | } 53 | 54 | /** 55 | * @throws InvalidArgumentException 56 | */ 57 | final public function first(): ?ElementFinderInterface 58 | { 59 | $items = $this->all(); 60 | if (\count($items) === 0) { 61 | return null; 62 | } 63 | return reset($items); 64 | } 65 | 66 | /** 67 | * @return ElementFinderInterface[] 68 | * @throws InvalidArgumentException 69 | */ 70 | final public function all(): array 71 | { 72 | if (! $this->validated) { 73 | foreach ($this->items as $key => $item) { 74 | if (! $item instanceof ElementFinderInterface) { 75 | $className = ($item === null) ? \gettype($item) : $item::class; 76 | throw new InvalidArgumentException( 77 | sprintf( 78 | 'Invalid object type. 
Expect %s given %s Check item %d', 79 | ElementFinderInterface::class, 80 | $className, 81 | $key 82 | ) 83 | ); 84 | } 85 | } 86 | $this->validated = true; 87 | } 88 | return $this->items; 89 | } 90 | 91 | /** 92 | * @throws Exception 93 | */ 94 | final public function merge(ObjectCollection $collection): ObjectCollection 95 | { 96 | return new ObjectCollection(array_merge($this->all(), $collection->all())); 97 | } 98 | 99 | /** 100 | * @throws Exception 101 | */ 102 | final public function add(ElementFinderInterface $element): ObjectCollection 103 | { 104 | $items = $this->all(); 105 | $items[] = $element; 106 | return new ObjectCollection($items); 107 | } 108 | 109 | /** 110 | * @throws InvalidArgumentException 111 | */ 112 | final public function get(int $index): ?ElementFinderInterface 113 | { 114 | return $this->all()[$index] ?? null; 115 | } 116 | 117 | /** 118 | * @return ElementFinderInterface[]|Traversable 119 | * @throws InvalidArgumentException 120 | */ 121 | final public function getIterator(): Traversable 122 | { 123 | return new ArrayIterator($this->all()); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/Collection/StringCollection.php: -------------------------------------------------------------------------------- 1 | 18 | */ 19 | class StringCollection implements IteratorAggregate, Countable 20 | { 21 | /** 22 | * @var string[] 23 | */ 24 | private readonly array $items; 25 | 26 | /** 27 | * @var bool 28 | */ 29 | private $validated = false; 30 | 31 | /** 32 | * @param string[] $items 33 | */ 34 | public function __construct(array $items = []) 35 | { 36 | $this->items = array_values($items); 37 | } 38 | 39 | /** 40 | * @throws Exception 41 | */ 42 | final public function count(): int 43 | { 44 | return \count($this->all()); 45 | } 46 | 47 | /** 48 | * @throws Exception 49 | */ 50 | final public function last(): ?string 51 | { 52 | $items = $this->all(); 53 | if (\count($items) === 0) { 54 
| return null; 55 | } 56 | return (string) end($items); 57 | } 58 | 59 | /** 60 | * @throws Exception 61 | */ 62 | final public function first(): ?string 63 | { 64 | $items = $this->all(); 65 | if (\count($items) === 0) { 66 | return null; 67 | } 68 | return (string) reset($items); 69 | } 70 | 71 | /** 72 | * @return string[] 73 | * @throws Exception 74 | */ 75 | final public function all(): array 76 | { 77 | if (! $this->validated) { 78 | foreach ($this->items as $key => $item) { 79 | if (! \is_string($item)) { 80 | throw new InvalidArgumentException( 81 | sprintf('Expect string. Check %s item', $key) 82 | ); 83 | } 84 | } 85 | $this->validated = true; 86 | } 87 | return $this->items; 88 | } 89 | 90 | /** 91 | * @throws Exception 92 | */ 93 | final public function map(StringModifyInterface $modifier): StringCollection 94 | { 95 | $items = []; 96 | foreach ($this->all() as $item) { 97 | $items[] = $modifier->modify($item); 98 | } 99 | return new StringCollection($items); 100 | } 101 | 102 | /** 103 | * @throws Exception 104 | */ 105 | final public function filter(StringFilterInterface $filter): StringCollection 106 | { 107 | $items = []; 108 | foreach ($this->all() as $item) { 109 | if ($filter->valid($item)) { 110 | $items[] = $item; 111 | } 112 | } 113 | return new StringCollection($items); 114 | } 115 | 116 | /** 117 | * @throws Exception 118 | */ 119 | final public function replace(string $regexp, string $to): StringCollection 120 | { 121 | $result = []; 122 | foreach ($this->all() as $index => $item) { 123 | $result[] = preg_replace($regexp, $to, $item); 124 | } 125 | return new StringCollection($result); 126 | } 127 | 128 | /** 129 | * @throws Exception 130 | */ 131 | final public function match(string $regexp, int $index = 1): StringCollection 132 | { 133 | $result = []; 134 | foreach ($this->all() as $string) { 135 | preg_match_all($regexp, $string, $matchedData); 136 | if (isset($matchedData[$index])) { 137 | foreach ((array) $matchedData[$index] as 
$matchedString) { 138 | $result[] = $matchedString; 139 | } 140 | } 141 | } 142 | return new StringCollection($result); 143 | } 144 | 145 | /** 146 | * @throws Exception 147 | */ 148 | final public function split(string $regexp): StringCollection 149 | { 150 | $items = []; 151 | foreach ($this->all() as $item) { 152 | foreach (preg_split($regexp, $item) as $string) { 153 | $items[] = $string; 154 | } 155 | } 156 | return new StringCollection($items); 157 | } 158 | 159 | /** 160 | * @throws Exception 161 | */ 162 | final public function unique(): StringCollection 163 | { 164 | return new StringCollection(array_unique($this->all())); 165 | } 166 | 167 | /** 168 | * @throws Exception 169 | */ 170 | final public function merge(StringCollection $collection): StringCollection 171 | { 172 | return new StringCollection(array_merge($this->all(), $collection->all())); 173 | } 174 | 175 | /** 176 | * @throws Exception 177 | */ 178 | final public function add(string $item): StringCollection 179 | { 180 | $items = $this->all(); 181 | $items[] = $item; 182 | return new StringCollection($items); 183 | } 184 | 185 | /** 186 | * @throws Exception 187 | */ 188 | final public function get(int $index): ?string 189 | { 190 | return $this->all()[$index] ?? 
null; 191 | } 192 | 193 | /** 194 | * @link http://php.net/manual/en/iteratoraggregate.getiterator.php 195 | * @return string[]|Traversable 196 | * @throws Exception 197 | */ 198 | final public function getIterator(): Traversable 199 | { 200 | return new ArrayIterator($this->all()); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/CssExpressionTranslator/CssExpressionTranslator.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class CssExpressionTranslator extends CssSelectorConverter implements ExpressionTranslatorInterface 14 | { 15 | final public function convertToXpath(string $expression): string 16 | { 17 | $xpathExpression = []; 18 | foreach (explode(', ', $expression) as $part) { 19 | preg_match('!(.+) (@.+|.+\(\))$!', $part, $matchExpression); 20 | if (! array_key_exists(2, $matchExpression)) { 21 | $xpathExpression[] = $this->toXPath($part); 22 | } else { 23 | $xpathExpression[] = $this->toXPath($matchExpression[1]) . '/' . 
$matchExpression[2]; 24 | } 25 | } 26 | return implode(' | ', $xpathExpression); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/CssExpressionTranslator/CssOrXpathExpressionTranslator.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class CssOrXpathExpressionTranslator implements ExpressionTranslatorInterface 16 | { 17 | public function __construct( 18 | private readonly ExpressionTranslatorInterface $cssTranslator = new CssExpressionTranslator() 19 | ) { 20 | } 21 | 22 | /** 23 | * @throws InvalidArgumentException 24 | */ 25 | final public function convertToXpath(string $expression): string 26 | { 27 | $expression = trim($expression); 28 | if ($expression === '') { 29 | throw new InvalidArgumentException('Expect not empty expression'); 30 | } 31 | if ($expression === '.') { 32 | return $expression; 33 | } 34 | if (mb_strpos($expression, './') === 0) { 35 | return $expression; 36 | } 37 | $firstChar = mb_substr($expression, 0, 1); 38 | if (in_array($firstChar, ['/', '('])) { 39 | return $expression; 40 | } 41 | return $this->cssTranslator->convertToXpath($expression); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/DomNodeListAction/DomNodeListActionInterface.php: -------------------------------------------------------------------------------- 1 | ownerElement->removeAttribute($node->name); 17 | } else { 18 | $node->parentNode->removeChild($node); 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ElementFinder.php: -------------------------------------------------------------------------------- 1 | 28 | */ 29 | class ElementFinder implements ElementFinderInterface 30 | { 31 | /** 32 | * Html document type 33 | * 34 | * @var int 35 | */ 36 | final public const DOCUMENT_HTML = 0; 37 | 38 | /** 39 | * Xml document type 40 | * 41 | * 
@var int 42 | */ 43 | final public const DOCUMENT_XML = 1; 44 | 45 | private int $type; 46 | 47 | private DOMDocument $dom; 48 | 49 | private DomXPath $xpath; 50 | 51 | private ExpressionTranslatorInterface $expressionTranslator; 52 | 53 | /** 54 | * @var LibXMLError[] 55 | */ 56 | private array $loadErrors = []; 57 | 58 | /** 59 | * Example: 60 | * new ElementFinder("
test
", ElementFinder::HTML); 61 | * 62 | * @throws Exception 63 | */ 64 | public function __construct( 65 | string $data, 66 | int $documentType = null, 67 | ExpressionTranslatorInterface $translator = null 68 | ) { 69 | $this->dom = new DomDocument(); 70 | $this->expressionTranslator = $translator ?? new XpathExpression(); 71 | $this->dom->registerNodeClass(DOMElement::class, Element::class); 72 | $this->type = $documentType ?? static::DOCUMENT_HTML; 73 | $this->setData($data ?: ''); 74 | } 75 | 76 | public function __destruct() 77 | { 78 | unset($this->dom, $this->xpath); 79 | } 80 | 81 | public function __clone() 82 | { 83 | $this->dom = clone $this->dom; 84 | $this->xpath = new DomXPath($this->dom); 85 | } 86 | 87 | /** 88 | * @throws Exception 89 | */ 90 | final public function content(string $expression, bool $outerContent = false): StringCollection 91 | { 92 | $items = $this->query($expression); 93 | $result = []; 94 | foreach ($items as $node) { 95 | if ($outerContent) { 96 | $result[] = NodeHelper::getOuterContent($node, $this->type); 97 | } else { 98 | $result[] = NodeHelper::getInnerContent($node, $this->type); 99 | } 100 | } 101 | return new StringCollection($result); 102 | } 103 | 104 | /** 105 | * You can remove elements and attributes 106 | * 107 | * ```php 108 | * $html = $html->remove("//span/@class"); 109 | * $html = $html->remove("//input"); 110 | * ``` 111 | */ 112 | final public function remove(string $expression): ElementFinderInterface 113 | { 114 | return $this->modify($expression, new RemoveNodes()); 115 | } 116 | 117 | final public function modify(string $expression, DomNodeListActionInterface $action): ElementFinderInterface 118 | { 119 | $result = clone $this; 120 | $action->execute( 121 | $result->query($expression) 122 | ); 123 | return $result; 124 | } 125 | 126 | /** 127 | * Get nodeValue of node 128 | * 129 | * @throws Exception 130 | */ 131 | final public function value(string $expression): StringCollection 132 | { 133 | $items = 
$this->query($expression); 134 | $result = []; 135 | foreach ($items as $node) { 136 | $result[] = $node->nodeValue; 137 | } 138 | return new StringCollection($result); 139 | } 140 | 141 | /** 142 | * Return array of keys and values 143 | * 144 | * @throws Exception 145 | */ 146 | final public function keyValue(string $keyExpression, string $valueExpression): array 147 | { 148 | $keyNodes = $this->query($keyExpression); 149 | $valueNodes = $this->query($valueExpression); 150 | if ($keyNodes->length !== $valueNodes->length) { 151 | throw new RuntimeException('Keys and values must have equal numbers of elements'); 152 | } 153 | $result = []; 154 | foreach ($keyNodes as $index => $node) { 155 | $result[$node->nodeValue] = $valueNodes->item($index)->nodeValue; 156 | } 157 | return $result; 158 | } 159 | 160 | /** 161 | * @throws Exception 162 | * @throws InvalidArgumentException 163 | */ 164 | final public function object(string $expression, bool $outerHtml = false): ObjectCollection 165 | { 166 | $type = $this->type; 167 | $items = $this->query($expression); 168 | $result = []; 169 | foreach ($items as $node) { 170 | assert($node instanceof DOMElement); 171 | $html = $outerHtml 172 | ? NodeHelper::getOuterContent($node, $this->type) 173 | : NodeHelper::getInnerContent($node, $this->type); 174 | if (trim($html) === '') { 175 | $html = ''; 176 | } 177 | if ($this->type === static::DOCUMENT_XML and ! str_contains($html, '' . $html . 
''; 179 | } 180 | $result[] = new ElementFinder($html, $type, $this->expressionTranslator); 181 | } 182 | return new ObjectCollection($result); 183 | } 184 | 185 | /** 186 | * @throws InvalidArgumentException 187 | */ 188 | final public function element(string $expression): ElementCollection 189 | { 190 | $nodeList = $this->query($expression); 191 | $items = []; 192 | foreach ($nodeList as $item) { 193 | $items[] = clone $item; 194 | } 195 | return new ElementCollection($items); 196 | } 197 | 198 | final public function getLoadErrors(): array 199 | { 200 | return $this->loadErrors; 201 | } 202 | 203 | /** 204 | * @return $this 205 | * @throws Exception 206 | */ 207 | private function setData(string $data): self 208 | { 209 | $internalErrors = libxml_use_internal_errors(true); 210 | $disableEntities = false; 211 | if (\LIBXML_VERSION < 20900) { 212 | $disableEntities = libxml_disable_entity_loader(); 213 | } 214 | 215 | if (static::DOCUMENT_HTML === $this->type) { 216 | $data = StringHelper::safeEncodeStr($data); 217 | 218 | //Analogue of mb_convert_encoding($data, 'HTML-ENTITIES', 'UTF-8') 219 | //Usage of mb_convert_encoding with encoding to HTML_ENTITIES is deprecated since php version 8.2 220 | //When passing data to ElementFinder in an encoding other than UTF-8, any unrecognized characters will be ignored 221 | $data = mb_encode_numericentity( 222 | htmlspecialchars_decode( 223 | htmlentities($data, ENT_NOQUOTES | ENT_IGNORE, 'UTF-8', false), 224 | ENT_NOQUOTES 225 | ), 226 | [0x80, 0x10FFFF, 0, ~0], 227 | 'UTF-8' 228 | ); 229 | 230 | $this->dom->loadHTML($data, LIBXML_NOCDATA & LIBXML_NOERROR); 231 | } elseif (static::DOCUMENT_XML === $this->type) { 232 | $this->dom->loadXML($data, LIBXML_NOCDATA & LIBXML_NOERROR); 233 | } else { 234 | throw new InvalidArgumentException('Doc type not valid. 
use xml or html'); 235 | } 236 | $this->loadErrors = libxml_get_errors(); 237 | libxml_clear_errors(); 238 | libxml_use_internal_errors($internalErrors); 239 | if (\LIBXML_VERSION < 20900) { 240 | libxml_disable_entity_loader($disableEntities); 241 | } 242 | unset($this->xpath); 243 | $this->xpath = new DomXPath($this->dom); 244 | return $this; 245 | } 246 | 247 | /** 248 | * @see element 249 | * Fetch nodes from document 250 | */ 251 | private function query(string $expression): DOMNodeList 252 | { 253 | return $this->xpath->query( 254 | $this->expressionTranslator->convertToXpath($expression) 255 | ); 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /src/ElementFinder/Element.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class Element extends DOMElement 13 | { 14 | /** 15 | * @return array Array 16 | */ 17 | final public function getAttributes(): array 18 | { 19 | $attributes = []; 20 | foreach ($this->attributes as $attr) { 21 | $attributes[$attr->name] = $attr->value; 22 | } 23 | return $attributes; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/ElementFinderInterface.php: -------------------------------------------------------------------------------- 1 | 16 | */ 17 | interface ElementFinderInterface 18 | { 19 | /** 20 | * @throws Exception 21 | */ 22 | public function content(string $expression, bool $outerContent = false): StringCollection; 23 | 24 | /** 25 | * You can remove elements and attributes 26 | * 27 | * ```php 28 | * $html = $html->remove("//span/@class"); 29 | * $html = $html->remove("//input"); 30 | * ``` 31 | */ 32 | public function remove(string $expression): ElementFinderInterface; 33 | 34 | public function modify(string $expression, DomNodeListActionInterface $action): ElementFinderInterface; 35 | 36 | /** 37 | * Get nodeValue of the node 38 | * 39 | * @throws Exception 40 | 
*/ 41 | public function value(string $expression): StringCollection; 42 | 43 | /** 44 | * Return array of keys and values 45 | * 46 | * @throws Exception 47 | */ 48 | public function keyValue(string $keyExpression, string $valueExpression): array; 49 | 50 | /** 51 | * @throws Exception 52 | * @throws InvalidArgumentException 53 | */ 54 | public function object(string $expression, bool $outerHtml = false): ObjectCollection; 55 | 56 | /** 57 | * @throws InvalidArgumentException 58 | */ 59 | public function element(string $expression): ElementCollection; 60 | 61 | /** 62 | * @return string[] 63 | */ 64 | public function getLoadErrors(): array; 65 | } 66 | -------------------------------------------------------------------------------- /src/ExpressionTranslator/ExpressionTranslatorInterface.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | interface ExpressionTranslatorInterface 11 | { 12 | /** 13 | * Translate expression to xpath 14 | * For example you can use css 15 | */ 16 | public function convertToXpath(string $expression): string; 17 | } 18 | -------------------------------------------------------------------------------- /src/ExpressionTranslator/XpathExpression.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | class XpathExpression implements ExpressionTranslatorInterface 11 | { 12 | final public function convertToXpath(string $expression): string 13 | { 14 | return $expression; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/Helper/FormHelper.php: -------------------------------------------------------------------------------- 1 | 13 | */ 14 | class FormHelper 15 | { 16 | public function __construct( 17 | private readonly ElementFinderInterface $page 18 | ) { 19 | } 20 | 21 | /** 22 | * Get data from
element 23 | * 24 | * Form is get by $formExpression 25 | * Return key->value array where key is name of field 26 | * 27 | * @param string $formExpression css or xpath expression to form element 28 | * @throws Exception 29 | */ 30 | final public function getFormData(string $formExpression): array 31 | { 32 | $form = $this->page->object($formExpression, true)->first(); 33 | if ($form === null) { 34 | throw new InvalidArgumentException('Cant find form. Possible invalid expression '); 35 | } 36 | 37 | $formData = []; 38 | # textarea 39 | foreach ($form->element('//textarea') as $textArea) { 40 | $formData[$textArea->getAttribute('name')] = $textArea->nodeValue; 41 | } 42 | 43 | # radio and checkboxes 44 | foreach ($form->element('//input[@checked]') as $textArea) { 45 | $formData[$textArea->getAttribute('name')] = $textArea->getAttribute('value'); 46 | } 47 | 48 | # hidden, text, submit 49 | $hiddenAndTextElements = $form->element('//input[@type="hidden" or @type="text" or @type="submit" or not(@type)]'); 50 | foreach ($hiddenAndTextElements as $element) { 51 | $formData[$element->getAttribute('name')] = $element->getAttribute('value'); 52 | } 53 | 54 | # selects 55 | foreach ($form->object('//select[not(@multiple)]', true) as $select) { 56 | $name = $select->value('//select/@name')->first(); 57 | if ($name === null) { 58 | continue; 59 | } 60 | $formData[$name] = $select->value('//option[@selected]/@value')->first(); 61 | } 62 | 63 | # multiple selects 64 | foreach ($form->object('//select[@multiple]', true) as $multipleSelect) { 65 | $name = $multipleSelect->value('//select/@name')->first(); 66 | if ($name === null) { 67 | continue; 68 | } 69 | $options = $multipleSelect->value('//option[@selected]/@value'); 70 | if (preg_match('!\[]$!', $name)) { 71 | $name = rtrim($name, '[]'); 72 | $formData[$name] = $options->all(); 73 | } else { 74 | $formData[$name] = $options->last(); 75 | } 76 | } 77 | return $formData; 78 | } 79 | } 80 | 
// --------------------------------------------------------------------------------
// /src/Helper/NodeHelper.php:
// --------------------------------------------------------------------------------

/**
 * Serialises DOM nodes to markup strings.
 *
 * Both methods pick the serialiser from the document type: XML documents are
 * written with saveXML(), everything else with saveHTML(). The produced markup
 * is then passed through StringHelper::safeEncodeStr().
 */
class NodeHelper
{
    /**
     * Return the markup of the node itself, including its own tag.
     *
     * The node is deep-cloned and imported into a fresh DOMDocument('1.0'),
     * and that whole temporary document is serialised.
     *
     * @param DOMNode $node         node to serialise
     * @param int     $documentType compared against ElementFinder::DOCUMENT_XML
     */
    final public static function getOuterContent(DOMNode $node, int $documentType): string
    {
        $domDocument = new DOMDocument('1.0');
        $imported = $domDocument->importNode($node->cloneNode(true), true);
        /** @noinspection UnusedFunctionResultInspection */
        $domDocument->appendChild($imported);

        $content = $documentType === ElementFinder::DOCUMENT_XML
            ? $domDocument->saveXML()
            : $domDocument->saveHTML();

        return StringHelper::safeEncodeStr($content);
    }

    /**
     * Return the concatenated markup of the node's children (without the node's own tag).
     *
     * Each child is serialised through its own owner document.
     *
     * @param DOMNode $itemObj      node whose children are serialised
     * @param int     $documentType compared against ElementFinder::DOCUMENT_XML
     */
    final public static function getInnerContent(DOMNode $itemObj, int $documentType): string
    {
        $content = '';
        foreach ($itemObj->childNodes as $child) {
            $content .= $documentType === ElementFinder::DOCUMENT_XML
                ? $child->ownerDocument->saveXML($child)
                : $child->ownerDocument->saveHTML($child);
        }

        return StringHelper::safeEncodeStr($content);
    }
}

// --------------------------------------------------------------------------------
// /src/Helper/StringHelper.php:
// --------------------------------------------------------------------------------

/**
 * String utilities used when serialising DOM content.
 */
class StringHelper
{
    /**
     * Replace numeric character references (`&#1090;`, `&#x442;`) with the
     * UTF-8 characters they denote.
     *
     * Only references matched by `&#([a-z\d]+);` are inspected; named entities
     * such as `&amp;` are left as they are. A matched reference that is not a
     * valid decimal/hexadecimal number, or that denotes a code point which
     * cannot be encoded in UTF-8, is returned unchanged.
     *
     * The decoding is done with mb_chr() instead of the 'HTML-ENTITIES'
     * mbstring pseudo-encoding, which is deprecated since PHP 8.2.
     *
     * @param string $str markup that may contain numeric character references
     * @return string the input with decodable numeric references replaced
     */
    final public static function safeEncodeStr(string $str): string
    {
        $decoded = preg_replace_callback(
            '/&#([a-z\d]+);/i',
            static fn (array $m): string => self::decodeNumericReference((string) $m[1], (string) $m[0]),
            $str
        );

        // preg_replace_callback() yields null on a PCRE failure; keep the
        // declared string contract by handing back the untouched input.
        return $decoded ?? $str;
    }

    /**
     * Decode the payload of a single `&#...;` reference.
     *
     * @param string $reference text between `&#` and `;` (never empty)
     * @param string $fallback  the complete original reference, returned when decoding is impossible
     */
    private static function decodeNumericReference(string $reference, string $fallback): string
    {
        $isHex = $reference[0] === 'x' || $reference[0] === 'X';
        $digits = $isHex ? substr($reference, 1) : $reference;

        $isNumber = $isHex ? ctype_xdigit($digits) : ctype_digit($digits);
        if ($digits === '' || ! $isNumber) {
            return $fallback;
        }

        // More than 7 significant digits is above U+10FFFF in either base;
        // rejecting early also keeps hexdec() within integer range.
        $digits = ltrim($digits, '0');
        if (\strlen($digits) > 7) {
            return $fallback;
        }

        $codePoint = $isHex ? (int) hexdec($digits) : (int) $digits;
        $char = mb_chr($codePoint, 'UTF-8');

        return $char === false ? $fallback : $char;
    }
}

// --------------------------------------------------------------------------------