4 |
5 | > Permission is hereby granted, free of charge, to any person obtaining a copy
6 | > of this software and associated documentation files (the "Software"), to deal
7 | > in the Software without restriction, including without limitation the rights
8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | > copies of the Software, and to permit persons to whom the Software is
10 | > furnished to do so, subject to the following conditions:
11 | >
12 | > The above copyright notice and this permission notice shall be included in
13 | > all copies or substantial portions of the Software.
14 | >
15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | > THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElementFinder
2 |
3 | [](https://packagist.org/packages/xparse/element-finder)
4 | [](LICENSE.md)
5 | [](https://github.com/xparse/ElementFinder/actions/workflows/test.yaml)
6 | [](https://codecov.io/github/xparse/ElementFinder)
7 | [](https://packagist.org/packages/xparse/element-finder)
8 |
9 | Extract data from HTML with elegant XPath/CSS expressions and prepare it with regular expressions in a single line.
10 |
11 | ## Install
12 |
13 | Via Composer
14 |
15 | ``` bash
16 | $ composer require xparse/element-finder
17 | ```
18 |
19 | ## Usage
20 |
21 | ``` php
22 | $page = new ElementFinder($html);
23 | $title = $page->value('//title')->first();
24 | echo $title;
25 | ```
26 |
27 | ## Advanced usage with regexp
28 |
29 |
30 | ``` php
31 | $page = new \Xparse\ElementFinder\ElementFinder('
32 |     <html>
33 |         <div class="tels">
34 |             044-12-12,
35 |             258-16-16
36 |         </div>
37 |         <div class="tels">
38 |
39 |             (148) 04-55-16
40 |         </div>
41 |     </html>
42 | ');
43 |
44 | $tels = $page->value('//*[@class="tels"]')->split('!,!')->replace('![^0-9]!', '');
45 | print_r($tels);
46 |
47 | /*
48 | [0] => 0441212
49 | [1] => 2581616
50 | [2] => 148045516
51 | */
52 |
53 |
54 | ```
55 |
56 | ## Css selectors
57 | Read this document. [Using css selectors](doc/using_css_selectors.md).
58 |
59 | ## Testing
60 |
61 | ``` bash
62 | ./vendor/bin/phpunit
63 | ```
64 |
65 | ## Contributing
66 |
67 | Please see [CONTRIBUTING](https://github.com/xparse/ElementFinder/blob/master/CONTRIBUTING.md) for details.
68 |
69 | ## Credits
70 |
71 | - [funivan](https://github.com/funivan)
72 | - [All Contributors](https://github.com/xparse/ElementFinder/contributors)
73 |
74 | ## Xpath info
75 | - [XPath/CSS Equivalents](https://en.wikibooks.org/wiki/XPath/CSS_Equivalents)
76 | - [Choose between XPath and jQuery with an XPath-jQuery phrase book](http://www.ibm.com/developerworks/library/x-xpathjquery/)
77 | - [XPath and CSS Selectors](http://ejohn.org/blog/xpath-css-selectors/)
78 |
79 | ## License
80 |
81 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information.
82 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "xparse/element-finder",
3 | "description": "Elegant data scraping",
4 | "keywords": [
5 | "xpath",
6 | "parser",
7 | "dom"
8 | ],
9 | "homepage": "https://github.com/xparse/ElementFinder",
10 | "license": "MIT",
11 | "authors": [
12 | {
13 | "name": "Ivan Shcherbak",
14 | "email": "alotofall@gmail.com",
15 | "homepage": "http://funivan.com/",
16 | "role": "Developer"
17 | }
18 | ],
19 | "require": {
20 | "php": "^8.2",
21 | "ext-dom": "*",
22 | "ext-libxml": "*",
23 | "symfony/css-selector": "^7.1"
24 | },
25 | "require-dev": {
26 | "friendsofphp/php-cs-fixer": "^3.16",
27 | "phpunit/phpunit": "^11.2.9",
28 | "rector/rector": "^2.0.8",
29 | "symplify/easy-coding-standard": "^12.1"
30 | },
31 | "autoload": {
32 | "psr-4": {
33 | "Xparse\\ElementFinder\\": "src"
34 | }
35 | },
36 | "autoload-dev": {
37 | "psr-4": {
38 | "Test\\Xparse\\ElementFinder\\": "./tests"
39 | }
40 | },
41 | "config": {
42 | "optimize-autoloader": true,
43 | "sort-packages": true
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/doc/using_css_selectors.md:
--------------------------------------------------------------------------------
1 | # Using CSS selectors
2 | XPath is a very powerful query language, but sometimes you do not need all of that power. Sometimes you just need to grab data from a page in a simple way - using CSS selectors.
3 | CSS selectors are widely used and simple.
4 |
5 |
6 | You need additional library called `xparse/css-expression-translator`
7 |
8 | Install it via composer:
9 | ```sh
10 | composer require xparse/css-expression-translator
11 | ```
12 |
13 | Configure element finder
14 | ```php
15 | $finder = new ElementFinder($html, ElementFinder::DOCUMENT_HTML, new CssExpressionTranslator());
16 | ```
17 |
18 | ## Example
19 | Here is full working example:
20 | ```php
21 |
22 | require 'vendor/autoload.php';
23 |
24 | use Xparse\ElementFinder\CssExpressionTranslator\CssExpressionTranslator;
25 | use Xparse\ElementFinder\ElementFinder;
26 |
27 |
28 | $finder = new ElementFinder('', ElementFinder::DOCUMENT_HTML, new CssExpressionTranslator());
32 |
33 |
34 | # 321ad
35 | echo $finder->content('a.test')->first();
36 | ```
37 |
38 | ## How it works?
39 | This library is built on top of `symfony/css-selector`: [https://github.com/symfony/css-selector](https://github.com/symfony/css-selector)
40 |
41 | ## How to select attributes with css?
42 | Add a space before the attribute name.
43 | ```php
44 | $finder->attributes('a @href');
45 | $finder->attributes('a.test @class');
46 |
47 | // select node text
48 | $finder->value('a.test node()');
49 | ```
50 |
51 | ## Limits
52 | CSS selectors have some limits:
53 | - XPath is more powerful than CSS.
54 | - You can't select attributes with the `or` operator.
55 | - You can't fetch a function result, e.g. `a concat('text:', text())`.
56 |
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/ecs.php:
--------------------------------------------------------------------------------
1 | paths([
13 | __DIR__ . '/src',
14 | __DIR__ . '/tests',
15 | __FILE__,
16 | ]);
17 |
18 | $ecsConfig->rules([
19 | NoUnusedImportsFixer::class,
20 | VoidReturnFixer::class,
21 | DeclareStrictTypesFixer::class,
22 | ]);
23 |
24 | // this way you can add sets - group of rules
25 | $ecsConfig->sets([SetList::SPACES, SetList::ARRAY, SetList::DOCBLOCK, SetList::NAMESPACES, SetList::COMMENTS, SetList::PSR_12]);
26 | };
27 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
2 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | ./tests
19 |
20 |
21 |
22 |
23 | ./src
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/rector.php:
--------------------------------------------------------------------------------
1 | paths([__DIR__ . '/src', __DIR__ . '/tests', __FILE__]);
14 |
15 | // register a single rule
16 | $rectorConfig->rule(InlineConstructorDefaultToPropertyRector::class);
17 | $rectorConfig->rule(TypedPropertyFromStrictConstructorRector::class);
18 | $rectorConfig->rule(SeparateMultiUseImportsRector::class);
19 | $rectorConfig->importNames();
20 | $rectorConfig->sets([LevelSetList::UP_TO_PHP_81, PHPUnitSetList::PHPUNIT_90]);
21 | };
22 |
--------------------------------------------------------------------------------
/src/Collection/ElementCollection.php:
--------------------------------------------------------------------------------
1 |
16 | */
class ElementCollection implements IteratorAggregate, Countable
{
    /**
     * True once every item has been confirmed to be an Element,
     * so the type check in all() runs at most once per collection.
     */
    private bool $validated = false;

    /**
     * @param Element[] $items Items to wrap. They are validated lazily, on first read.
     */
    public function __construct(
        private readonly array $items = []
    ) {
    }

    /**
     * Number of elements in the collection.
     *
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function count(): int
    {
        return \count($this->all());
    }

    /**
     * Last element, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function last(): ?Element
    {
        $items = $this->all();
        return $items === [] ? null : end($items);
    }

    /**
     * First element, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function first(): ?Element
    {
        $items = $this->all();
        return $items === [] ? null : reset($items);
    }

    /**
     * Element stored under the given key, or null when there is none.
     *
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function get(int $index): ?Element
    {
        return $this->all()[$index] ?? null;
    }

    /**
     * All items, exactly as passed to the constructor (keys included).
     *
     * @return Element[]
     * @throws InvalidArgumentException When any item is not an Element
     */
    final public function all(): array
    {
        if ($this->validated) {
            return $this->items;
        }
        foreach ($this->items as $key => $item) {
            if ($item instanceof Element) {
                continue;
            }
            // `$item::class` is only legal on objects; scalars, arrays and null fall back to gettype().
            $type = \is_object($item) ? $item::class : \gettype($item);
            throw new InvalidArgumentException(
                sprintf(
                    'Invalid object type. Expect %s given %s Check item %d',
                    Element::class,
                    $type,
                    $key
                )
            );
        }
        // Remember the successful check so later reads skip the loop.
        $this->validated = true;

        return $this->items;
    }

    /**
     * New collection holding this collection's items followed by the other's.
     *
     * @throws InvalidArgumentException When either collection holds a non-Element item
     */
    final public function merge(ElementCollection $collection): ElementCollection
    {
        return new ElementCollection(array_merge($this->all(), $collection->all()));
    }

    /**
     * New collection with the given element appended; this instance is left untouched.
     *
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function add(Element $element): ElementCollection
    {
        $items = $this->all();
        $items[] = $element;
        return new ElementCollection($items);
    }

    /**
     * Retrieve an external iterator over the validated items.
     *
     * @link http://php.net/manual/en/iteratoraggregate.getiterator.php
     * @return Element[]|Traversable
     * @throws InvalidArgumentException When the collection holds a non-Element item
     */
    final public function getIterator(): Traversable
    {
        return new ArrayIterator($this->all());
    }
}
128 |
--------------------------------------------------------------------------------
/src/Collection/Filters/StringFilter/RegexStringFilter.php:
--------------------------------------------------------------------------------
1 | regex, $input) === 1;
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/Collection/Filters/StringFilter/StringFilterInterface.php:
--------------------------------------------------------------------------------
1 | from, $this->to, $input);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/Collection/Modify/StringModify/StringModifyInterface.php:
--------------------------------------------------------------------------------
1 |
17 | */
class ObjectCollection implements IteratorAggregate, Countable
{
    /**
     * True once every item has been confirmed to be an ElementFinderInterface,
     * so the type check in all() runs at most once per collection.
     */
    private bool $validated = false;

    /**
     * @param ElementFinderInterface[] $items Items to wrap. They are validated lazily, on first read.
     */
    public function __construct(
        private readonly array $items = []
    ) {
    }

    /**
     * Number of finders in the collection.
     *
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function count(): int
    {
        return \count($this->all());
    }

    /**
     * Last finder, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function last(): ?ElementFinderInterface
    {
        $items = $this->all();
        return $items === [] ? null : end($items);
    }

    /**
     * First finder, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function first(): ?ElementFinderInterface
    {
        $items = $this->all();
        return $items === [] ? null : reset($items);
    }

    /**
     * All items, exactly as passed to the constructor (keys included).
     *
     * @return ElementFinderInterface[]
     * @throws InvalidArgumentException When any item is not an ElementFinderInterface
     */
    final public function all(): array
    {
        if ($this->validated) {
            return $this->items;
        }
        foreach ($this->items as $key => $item) {
            if ($item instanceof ElementFinderInterface) {
                continue;
            }
            // `$item::class` is only legal on objects; scalars, arrays and null fall back to gettype().
            $type = \is_object($item) ? $item::class : \gettype($item);
            throw new InvalidArgumentException(
                sprintf(
                    'Invalid object type. Expect %s given %s Check item %d',
                    ElementFinderInterface::class,
                    $type,
                    $key
                )
            );
        }
        $this->validated = true;

        return $this->items;
    }

    /**
     * New collection holding this collection's items followed by the other's.
     *
     * @throws InvalidArgumentException When either collection holds an invalid item
     */
    final public function merge(ObjectCollection $collection): ObjectCollection
    {
        return new ObjectCollection(array_merge($this->all(), $collection->all()));
    }

    /**
     * New collection with the given finder appended; this instance is left untouched.
     *
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function add(ElementFinderInterface $element): ObjectCollection
    {
        $items = $this->all();
        $items[] = $element;
        return new ObjectCollection($items);
    }

    /**
     * Finder stored under the given key, or null when there is none.
     *
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function get(int $index): ?ElementFinderInterface
    {
        return $this->all()[$index] ?? null;
    }

    /**
     * Retrieve an external iterator over the validated items.
     *
     * @return ElementFinderInterface[]|Traversable
     * @throws InvalidArgumentException When the collection holds an invalid item
     */
    final public function getIterator(): Traversable
    {
        return new ArrayIterator($this->all());
    }
}
126 |
--------------------------------------------------------------------------------
/src/Collection/StringCollection.php:
--------------------------------------------------------------------------------
1 |
18 | */
class StringCollection implements IteratorAggregate, Countable
{
    /**
     * Items re-indexed from zero by the constructor.
     *
     * @var string[]
     */
    private readonly array $items;

    /**
     * True once every item has been confirmed to be a string,
     * so the check in all() runs at most once per collection.
     */
    private bool $validated = false;

    /**
     * @param string[] $items Items to wrap; keys are discarded. Validated lazily, on first read.
     */
    public function __construct(array $items = [])
    {
        $this->items = array_values($items);
    }

    /**
     * Number of strings in the collection.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function count(): int
    {
        return \count($this->all());
    }

    /**
     * Last string, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function last(): ?string
    {
        $items = $this->all();
        return $items === [] ? null : (string) end($items);
    }

    /**
     * First string, or null when the collection is empty.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function first(): ?string
    {
        $items = $this->all();
        return $items === [] ? null : (string) reset($items);
    }

    /**
     * All items as a zero-based list.
     *
     * @return string[]
     * @throws InvalidArgumentException When any item is not a string
     */
    final public function all(): array
    {
        if ($this->validated) {
            return $this->items;
        }
        foreach ($this->items as $key => $item) {
            if (! \is_string($item)) {
                throw new InvalidArgumentException(
                    sprintf('Expect string. Check %s item', $key)
                );
            }
        }
        $this->validated = true;

        return $this->items;
    }

    /**
     * New collection with every item passed through the modifier.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function map(StringModifyInterface $modifier): StringCollection
    {
        $items = [];
        foreach ($this->all() as $item) {
            $items[] = $modifier->modify($item);
        }
        return new StringCollection($items);
    }

    /**
     * New collection keeping only the items the filter accepts.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function filter(StringFilterInterface $filter): StringCollection
    {
        $items = [];
        foreach ($this->all() as $item) {
            if ($filter->valid($item)) {
                $items[] = $item;
            }
        }
        return new StringCollection($items);
    }

    /**
     * New collection with preg_replace($regexp, $to, ...) applied to every item.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     *                                  or the regular expression cannot be applied
     */
    final public function replace(string $regexp, string $to): StringCollection
    {
        $result = [];
        foreach ($this->all() as $item) {
            $replaced = preg_replace($regexp, $to, $item);
            if ($replaced === null) {
                // preg_replace() signals failure with null; fail here rather than store a non-string item.
                throw new InvalidArgumentException(
                    sprintf('Unable to apply regexp "%s": %s', $regexp, preg_last_error_msg())
                );
            }
            $result[] = $replaced;
        }
        return new StringCollection($result);
    }

    /**
     * New collection with every match of the given capture group, across all items.
     *
     * @param int $index Capture group to collect (0 is the whole match)
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function match(string $regexp, int $index = 1): StringCollection
    {
        $result = [];
        foreach ($this->all() as $string) {
            preg_match_all($regexp, $string, $matchedData);
            if (isset($matchedData[$index])) {
                foreach ((array) $matchedData[$index] as $matchedString) {
                    $result[] = $matchedString;
                }
            }
        }
        return new StringCollection($result);
    }

    /**
     * New collection with every item split by the regular expression and the pieces flattened.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     *                                  or the regular expression cannot be applied
     */
    final public function split(string $regexp): StringCollection
    {
        $items = [];
        foreach ($this->all() as $item) {
            $parts = preg_split($regexp, $item);
            if ($parts === false) {
                // preg_split() signals failure with false; iterating it would silently drop the item.
                throw new InvalidArgumentException(
                    sprintf('Unable to apply regexp "%s": %s', $regexp, preg_last_error_msg())
                );
            }
            foreach ($parts as $string) {
                $items[] = $string;
            }
        }
        return new StringCollection($items);
    }

    /**
     * New collection without duplicate strings; the first occurrence of each is kept.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function unique(): StringCollection
    {
        return new StringCollection(array_unique($this->all()));
    }

    /**
     * New collection holding this collection's items followed by the other's.
     *
     * @throws InvalidArgumentException When either collection holds a non-string item
     */
    final public function merge(StringCollection $collection): StringCollection
    {
        return new StringCollection(array_merge($this->all(), $collection->all()));
    }

    /**
     * New collection with the given string appended; this instance is left untouched.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function add(string $item): StringCollection
    {
        $items = $this->all();
        $items[] = $item;
        return new StringCollection($items);
    }

    /**
     * String at the given zero-based position, or null when there is none.
     *
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function get(int $index): ?string
    {
        return $this->all()[$index] ?? null;
    }

    /**
     * Retrieve an external iterator over the validated items.
     *
     * @link http://php.net/manual/en/iteratoraggregate.getiterator.php
     * @return string[]|Traversable
     * @throws InvalidArgumentException When the collection holds a non-string item
     */
    final public function getIterator(): Traversable
    {
        return new ArrayIterator($this->all());
    }
}
203 |
--------------------------------------------------------------------------------
/src/CssExpressionTranslator/CssExpressionTranslator.php:
--------------------------------------------------------------------------------
1 |
12 | */
class CssExpressionTranslator extends CssSelectorConverter implements ExpressionTranslatorInterface
{
    /**
     * Translate a CSS expression into XPath.
     *
     * The expression is split on ", " and each piece is converted on its own.
     * A piece ending in " @name" or " something()" has that suffix appended to
     * the converted selector as an XPath step. The pieces are joined with " | ".
     */
    final public function convertToXpath(string $expression): string
    {
        $converted = array_map(
            function (string $selector): string {
                $found = [];
                preg_match('!(.+) (@.+|.+\(\))$!', $selector, $found);
                if (array_key_exists(2, $found)) {
                    // $found[1] is the CSS selector, $found[2] the attribute or function suffix.
                    return $this->toXPath($found[1]) . '/' . $found[2];
                }
                return $this->toXPath($selector);
            },
            explode(', ', $expression)
        );

        return implode(' | ', $converted);
    }
}
29 |
--------------------------------------------------------------------------------
/src/CssExpressionTranslator/CssOrXpathExpressionTranslator.php:
--------------------------------------------------------------------------------
1 |
14 | */
class CssOrXpathExpressionTranslator implements ExpressionTranslatorInterface
{
    /**
     * @param ExpressionTranslatorInterface $cssTranslator Translator used for expressions not recognised as XPath
     */
    public function __construct(
        private readonly ExpressionTranslatorInterface $cssTranslator = new CssExpressionTranslator()
    ) {
    }

    /**
     * Return the trimmed expression unchanged when it looks like XPath, otherwise translate it as CSS.
     *
     * An expression is treated as XPath when it is exactly "." or starts with "./", "/" or "(".
     *
     * @throws InvalidArgumentException When the expression is empty after trimming
     */
    final public function convertToXpath(string $expression): string
    {
        $trimmed = trim($expression);
        if ($trimmed === '') {
            throw new InvalidArgumentException('Expect not empty expression');
        }

        $looksLikeXpath = $trimmed === '.'
            || str_starts_with($trimmed, './')
            || str_starts_with($trimmed, '/')
            || str_starts_with($trimmed, '(');

        return $looksLikeXpath
            ? $trimmed
            : $this->cssTranslator->convertToXpath($trimmed);
    }
}
44 |
--------------------------------------------------------------------------------
/src/DomNodeListAction/DomNodeListActionInterface.php:
--------------------------------------------------------------------------------
1 | ownerElement->removeAttribute($node->name);
17 | } else {
18 | $node->parentNode->removeChild($node);
19 | }
20 | }
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/ElementFinder.php:
--------------------------------------------------------------------------------
1 |
28 | */
29 | class ElementFinder implements ElementFinderInterface
30 | {
31 | /**
32 | * Html document type
33 | *
34 | * @var int
35 | */
36 | final public const DOCUMENT_HTML = 0;
37 |
38 | /**
39 | * Xml document type
40 | *
41 | * @var int
42 | */
43 | final public const DOCUMENT_XML = 1;
44 |
45 | private int $type;
46 |
47 | private DOMDocument $dom;
48 |
49 | private DomXPath $xpath;
50 |
51 | private ExpressionTranslatorInterface $expressionTranslator;
52 |
/**
 * Parse the given markup and prepare it for querying.
 *
 * Example:
 * new ElementFinder('<div>test</div>', ElementFinder::DOCUMENT_HTML);
 *
 * @param string $data Markup to load
 * @param int|null $documentType DOCUMENT_HTML or DOCUMENT_XML; null means DOCUMENT_HTML
 * @param ExpressionTranslatorInterface|null $translator Expression translator; null means plain XPath
 * @throws Exception
 */
public function __construct(
    string $data,
    ?int $documentType = null,
    ?ExpressionTranslatorInterface $translator = null
) {
    $this->dom = new DomDocument();
    $this->expressionTranslator = $translator ?? new XpathExpression();
    // Make the DOM hand out Element instances instead of plain DOMElement.
    $this->dom->registerNodeClass(DOMElement::class, Element::class);
    $this->type = $documentType ?? static::DOCUMENT_HTML;
    // $data is already a string; the former `?: ''` only turned the valid input "0" into an empty document.
    $this->setData($data);
}
75 |
/**
 * Drop the references to the DOM document and its XPath helper.
 */
public function __destruct()
{
    unset($this->dom);
    unset($this->xpath);
}
80 |
/**
 * Give the copy its own DOM document, and an XPath helper bound to that copy.
 */
public function __clone()
{
    $copy = clone $this->dom;
    $this->dom = $copy;
    $this->xpath = new DomXPath($copy);
}
86 |
/**
 * Content of every node matched by the expression.
 *
 * @param bool $outerContent When true the outer content of each node is returned, otherwise the inner content
 * @throws Exception
 */
final public function content(string $expression, bool $outerContent = false): StringCollection
{
    $contents = [];
    foreach ($this->query($expression) as $node) {
        $contents[] = $outerContent
            ? NodeHelper::getOuterContent($node, $this->type)
            : NodeHelper::getInnerContent($node, $this->type);
    }

    return new StringCollection($contents);
}
103 |
/**
 * Return a copy of this finder with the matched elements or attributes removed.
 *
 * ```php
 * $html = $html->remove("//span/@class");
 * $html = $html->remove("//input");
 * ```
 */
final public function remove(string $expression): ElementFinderInterface
{
    $removal = new RemoveNodes();

    return $this->modify($expression, $removal);
}
116 |
/**
 * Clone this finder, run the action on the nodes the expression matches in the clone, and return the clone.
 */
final public function modify(string $expression, DomNodeListActionInterface $action): ElementFinderInterface
{
    $copy = clone $this;
    $matched = $copy->query($expression);
    $action->execute($matched);

    return $copy;
}
125 |
/**
 * Collect the nodeValue of every node matched by the expression.
 *
 * @throws Exception
 */
final public function value(string $expression): StringCollection
{
    $values = [];
    foreach ($this->query($expression) as $matched) {
        $values[] = $matched->nodeValue;
    }

    return new StringCollection($values);
}
140 |
/**
 * Pair the nodes matched by two expressions into a key => value map.
 *
 * The n-th key node's nodeValue becomes the key for the n-th value node's nodeValue.
 *
 * @throws RuntimeException When the two expressions match a different number of nodes
 * @throws Exception
 */
final public function keyValue(string $keyExpression, string $valueExpression): array
{
    $keys = $this->query($keyExpression);
    $values = $this->query($valueExpression);
    if ($keys->length !== $values->length) {
        throw new RuntimeException('Keys and values must have equal numbers of elements');
    }

    $pairs = [];
    for ($position = 0; $position < $keys->length; $position++) {
        $pairs[$keys->item($position)->nodeValue] = $values->item($position)->nodeValue;
    }

    return $pairs;
}
159 |
160 | /**
161 | * @throws Exception
162 | * @throws InvalidArgumentException
163 | */
164 | final public function object(string $expression, bool $outerHtml = false): ObjectCollection
165 | {
166 | $type = $this->type;
167 | $items = $this->query($expression);
168 | $result = [];
169 | foreach ($items as $node) {
170 | assert($node instanceof DOMElement);
171 | $html = $outerHtml
172 | ? NodeHelper::getOuterContent($node, $this->type)
173 | : NodeHelper::getInnerContent($node, $this->type);
174 | if (trim($html) === '') {
175 | $html = '';
176 | }
177 | if ($this->type === static::DOCUMENT_XML and ! str_contains($html, '' . $html . '';
179 | }
180 | $result[] = new ElementFinder($html, $type, $this->expressionTranslator);
181 | }
182 | return new ObjectCollection($result);
183 | }
184 |
/**
 * Clone every node matched by the expression into an ElementCollection.
 *
 * @throws InvalidArgumentException
 */
final public function element(string $expression): ElementCollection
{
    $clones = [];
    foreach ($this->query($expression) as $matched) {
        $copy = clone $matched;
        $clones[] = $copy;
    }

    return new ElementCollection($clones);
}
197 |
/**
 * Errors libxml reported while the document was being loaded.
 *
 * @return \LibXMLError[]
 */
final public function getLoadErrors(): array
{
    return $this->loadErrors;
}
202 |
/**
 * Load the markup into the DOM document, record libxml's load errors and rebuild the XPath helper.
 *
 * The libxml error-handling mode is switched to internal for the duration of the load
 * and is restored afterwards, including when loading throws.
 *
 * @return $this
 * @throws InvalidArgumentException When the document type is neither DOCUMENT_HTML nor DOCUMENT_XML
 * @throws Exception
 */
private function setData(string $data): self
{
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = false;
    if (\LIBXML_VERSION < 20900) {
        $disableEntities = libxml_disable_entity_loader();
    }

    // NOTE(review): a bitwise AND of two different flags evaluates to 0, so no option is
    // actually passed. `|` was probably intended, but LIBXML_NOERROR would presumably leave
    // getLoadErrors() empty - confirm before changing.
    $options = LIBXML_NOCDATA & LIBXML_NOERROR;

    try {
        if (static::DOCUMENT_HTML === $this->type) {
            $data = StringHelper::safeEncodeStr($data);

            //Analogue of mb_convert_encoding($data, 'HTML-ENTITIES', 'UTF-8')
            //Usage of mb_convert_encoding with encoding to HTML_ENTITIES is deprecated since php version 8.2
            //When passing data to ElementFinder in an encoding other than UTF-8, any unrecognized characters will be ignored
            $data = mb_encode_numericentity(
                htmlspecialchars_decode(
                    htmlentities($data, ENT_NOQUOTES | ENT_IGNORE, 'UTF-8', false),
                    ENT_NOQUOTES
                ),
                [0x80, 0x10FFFF, 0, ~0],
                'UTF-8'
            );

            $this->dom->loadHTML($data, $options);
        } elseif (static::DOCUMENT_XML === $this->type) {
            $this->dom->loadXML($data, $options);
        } else {
            throw new InvalidArgumentException('Doc type not valid. use xml or html');
        }
        $this->loadErrors = libxml_get_errors();
    } finally {
        // Always put libxml's process-wide settings back, even when loading failed.
        libxml_clear_errors();
        libxml_use_internal_errors($internalErrors);
        if (\LIBXML_VERSION < 20900) {
            libxml_disable_entity_loader($disableEntities);
        }
    }

    unset($this->xpath);
    $this->xpath = new DomXPath($this->dom);
    return $this;
}
246 |
/**
 * Translate the expression to XPath and fetch the matching nodes from the document.
 *
 * @see element
 */
private function query(string $expression): DOMNodeList
{
    $xpathExpression = $this->expressionTranslator->convertToXpath($expression);

    return $this->xpath->query($xpathExpression);
}
257 | }
258 |
--------------------------------------------------------------------------------
/src/ElementFinder/Element.php:
--------------------------------------------------------------------------------
1 |
11 | */
class Element extends DOMElement
{
    /**
     * Collect this element's attributes into a plain map.
     *
     * @return array<string, string> Attribute values keyed by attribute name
     */
    final public function getAttributes(): array
    {
        $map = [];
        $nodes = $this->attributes;
        for ($position = 0; $position < $nodes->length; $position++) {
            $attribute = $nodes->item($position);
            $map[$attribute->name] = $attribute->value;
        }

        return $map;
    }
}
26 |
--------------------------------------------------------------------------------
/src/ElementFinderInterface.php:
--------------------------------------------------------------------------------
1 |
16 | */
interface ElementFinderInterface
{
    /**
     * Content of every node matched by the expression.
     *
     * @param bool $outerContent When true the outer content of each node is returned, otherwise the inner content
     * @throws Exception
     */
    public function content(string $expression, bool $outerContent = false): StringCollection;

    /**
     * You can remove elements and attributes
     *
     * ```php
     * $html = $html->remove("//span/@class");
     * $html = $html->remove("//input");
     * ```
     */
    public function remove(string $expression): ElementFinderInterface;

    /**
     * Apply the action to the nodes matched by the expression and return the resulting finder.
     */
    public function modify(string $expression, DomNodeListActionInterface $action): ElementFinderInterface;

    /**
     * Get nodeValue of the node
     *
     * @throws Exception
     */
    public function value(string $expression): StringCollection;

    /**
     * Return array of keys and values
     *
     * @throws Exception
     */
    public function keyValue(string $keyExpression, string $valueExpression): array;

    /**
     * Wrap every node matched by the expression in its own finder.
     *
     * @param bool $outerHtml When true each finder is built from the node's outer content, otherwise its inner content
     * @throws Exception
     * @throws InvalidArgumentException
     */
    public function object(string $expression, bool $outerHtml = false): ObjectCollection;

    /**
     * Nodes matched by the expression, as a collection of elements.
     *
     * @throws InvalidArgumentException
     */
    public function element(string $expression): ElementCollection;

    /**
     * Errors libxml reported while the document was being loaded.
     *
     * @return \LibXMLError[]
     */
    public function getLoadErrors(): array;
}
66 |
--------------------------------------------------------------------------------
/src/ExpressionTranslator/ExpressionTranslatorInterface.php:
--------------------------------------------------------------------------------
1 |
9 | */
interface ExpressionTranslatorInterface
{
    /**
     * Convert an expression written in some other syntax (CSS, for example)
     * into the XPath expression that will be run against the document.
     */
    public function convertToXpath(string $expression): string;
}
18 |
--------------------------------------------------------------------------------
/src/ExpressionTranslator/XpathExpression.php:
--------------------------------------------------------------------------------
1 |
9 | */
class XpathExpression implements ExpressionTranslatorInterface
{
    /**
     * The input is already XPath, so it is handed back untouched.
     */
    final public function convertToXpath(string $expression): string
    {
        $xpath = $expression;

        return $xpath;
    }
}
17 |
--------------------------------------------------------------------------------
/src/Helper/FormHelper.php:
--------------------------------------------------------------------------------
1 |
13 | */
14 | class FormHelper
15 | {
16 | public function __construct(
17 | private readonly ElementFinderInterface $page
18 | ) {
19 | }
20 |
21 | /**
22 | * Get data from