├── AbstractUriElement.php ├── CHANGELOG.md ├── Crawler.php ├── Field ├── ChoiceFormField.php ├── FileFormField.php ├── FormField.php ├── InputFormField.php └── TextareaFormField.php ├── Form.php ├── FormFieldRegistry.php ├── Image.php ├── LICENSE ├── Link.php ├── README.md ├── Test └── Constraint │ ├── CrawlerAnySelectorTextContains.php │ ├── CrawlerAnySelectorTextSame.php │ ├── CrawlerSelectorAttributeValueSame.php │ ├── CrawlerSelectorCount.php │ ├── CrawlerSelectorExists.php │ ├── CrawlerSelectorTextContains.php │ └── CrawlerSelectorTextSame.php ├── UriResolver.php └── composer.json /AbstractUriElement.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler; 13 | 14 | /** 15 | * Any HTML element that can link to an URI. 16 | * 17 | * @author Fabien Potencier 18 | */ 19 | abstract class AbstractUriElement 20 | { 21 | protected \DOMElement $node; 22 | protected ?string $method; 23 | 24 | /** 25 | * @param \DOMElement $node A \DOMElement instance 26 | * @param string|null $currentUri The URI of the page where the link is embedded (or the base href) 27 | * @param string|null $method The method to use for the link (GET by default) 28 | * 29 | * @throws \InvalidArgumentException if the node is not a link 30 | */ 31 | public function __construct( 32 | \DOMElement $node, 33 | protected ?string $currentUri = null, 34 | ?string $method = 'GET', 35 | ) { 36 | $this->setNode($node); 37 | $this->method = $method ? 
strtoupper($method) : null;

        // A URI without a scheme is relative and can only be resolved against an absolute base.
        $elementUriIsRelative = !parse_url(trim($this->getRawUri()), \PHP_URL_SCHEME);
        // The base counts as absolute when its first four characters are "http" or "file" (case-insensitive).
        $baseUriIsAbsolute = null !== $this->currentUri && \in_array(strtolower(substr($this->currentUri, 0, 4)), ['http', 'file'], true);
        if ($elementUriIsRelative && !$baseUriIsAbsolute) {
            throw new \InvalidArgumentException(\sprintf('The URL of the element is relative, so you must define its base URI passing an absolute URL to the constructor of the "%s" class ("%s" was passed).', __CLASS__, $this->currentUri));
        }
    }

    /**
     * Gets the node associated with this link.
     */
    public function getNode(): \DOMElement
    {
        return $this->node;
    }

    /**
     * Gets the method associated with this link.
     *
     * Falls back to "GET" when no method was set.
     */
    public function getMethod(): string
    {
        return $this->method ?? 'GET';
    }

    /**
     * Gets the URI associated with this link, resolved against the current URI.
     */
    public function getUri(): string
    {
        return UriResolver::resolve($this->getRawUri(), $this->currentUri);
    }

    /**
     * Returns raw URI data.
     */
    abstract protected function getRawUri(): string;

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * "." segments are dropped and ".." segments remove the preceding segment.
     * A ".." that would climb above the root of an absolute path is ignored, so
     * the leading "/" is preserved ("/../a" becomes "/a", not "a").
     *
     * @param string $path URI path
     */
    protected function canonicalizePath(string $path): string
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        // Make a trailing "." or ".." behave like a directory segment
        if (str_ends_with($path, '.')) {
            $path .= '/';
        }

        $output = [];

        foreach (explode('/', $path) as $segment) {
            if ('..' === $segment) {
                // A lone empty segment is the root of an absolute path: never pop it
                if ([''] !== $output) {
                    array_pop($output);
                }
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Sets current \DOMElement instance.
105 | * 106 | * @param \DOMElement $node A \DOMElement instance 107 | * 108 | * @throws \LogicException If given node is not an anchor 109 | */ 110 | abstract protected function setNode(\DOMElement $node): void; 111 | } 112 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | ========= 3 | 4 | 7.0 5 | --- 6 | 7 | * Add argument `$normalizeWhitespace` to `Crawler::innerText()` 8 | * Add argument `$default` to `Crawler::attr()` 9 | 10 | 6.4 11 | --- 12 | 13 | * Add `CrawlerAnySelectorTextContains` test constraint 14 | * Add `CrawlerAnySelectorTextSame` test constraint 15 | * Add argument `$default` to `Crawler::attr()` 16 | 17 | 6.3 18 | --- 19 | 20 | * Add `$useHtml5Parser` argument to `Crawler` 21 | * Add `CrawlerSelectorCount` test constraint 22 | * Add argument `$normalizeWhitespace` to `Crawler::innerText()` 23 | * Make `Crawler::innerText()` return the first non-empty text 24 | 25 | 6.0 26 | --- 27 | 28 | * Remove `Crawler::parents()` method, use `ancestors()` instead 29 | 30 | 5.4 31 | --- 32 | 33 | * Add `Crawler::innerText` method. 34 | 35 | 5.3 36 | --- 37 | 38 | * The `parents()` method is deprecated. Use `ancestors()` instead. 39 | * Marked the `containsOption()`, `availableOptionValues()`, and `disableValidation()` methods of the 40 | `ChoiceFormField` class as internal 41 | 42 | 5.1.0 43 | ----- 44 | 45 | * Added an internal cache layer on top of the CssSelectorConverter 46 | * Added `UriResolver` to resolve an URI according to a base URI 47 | 48 | 5.0.0 49 | ----- 50 | 51 | * Added argument `$selector` to `Crawler::children()` 52 | * Added argument `$default` to `Crawler::text()` and `html()` 53 | 54 | 4.4.0 55 | ----- 56 | 57 | * Added `Form::getName()` method. 58 | * Added `Crawler::matches()` method. 59 | * Added `Crawler::closest()` method. 60 | * Added `Crawler::outerHtml()` method. 
61 | * Added an argument to the `Crawler::text()` method to opt-in normalizing whitespaces. 62 | 63 | 4.3.0 64 | ----- 65 | 66 | * Added PHPUnit constraints: `CrawlerSelectorAttributeValueSame`, `CrawlerSelectorExists`, `CrawlerSelectorTextContains` 67 | and `CrawlerSelectorTextSame` 68 | * Added return of element name (`_name`) in `extract()` method. 69 | * Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty. 70 | * When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to 71 | parse HTML added to a Crawler for better support of HTML5 tags. 72 | 73 | 4.2.0 74 | ----- 75 | 76 | * The `$currentUri` constructor argument of the `AbstractUriElement`, `Link` and 77 | `Image` classes is now optional. 78 | * The `Crawler::children()` method will have a new `$selector` argument in version 5.0, 79 | not defining it is deprecated. 80 | 81 | 3.1.0 82 | ----- 83 | 84 | * All the URI parsing logic has been abstracted in the `AbstractUriElement` class. 85 | The `Link` class is now a child of `AbstractUriElement`. 86 | * Added an `Image` class to crawl images and parse their `src` attribute, 87 | and `selectImage`, `image`, `images` methods in the `Crawler` (the image version of the equivalent `link` methods). 88 | 89 | 2.5.0 90 | ----- 91 | 92 | * [BC BREAK] The default value for checkbox and radio inputs without a value attribute has changed 93 | from '1' to 'on' to match the HTML specification. 94 | * [BC BREAK] The typehints on the `Link`, `Form` and `FormField` classes have been changed from 95 | `\DOMNode` to `\DOMElement`. Using any other type of `DOMNode` was triggering fatal errors in previous 96 | versions. Code extending these classes will need to update the typehints when overwriting these methods. 97 | 98 | 2.4.0 99 | ----- 100 | 101 | * `Crawler::addXmlContent()` removes the default document namespace again if it's the only namespace. 
102 | * added support for automatic discovery and explicit registration of document 103 | namespaces for `Crawler::filterXPath()` and `Crawler::filter()` 104 | * improved content type guessing in `Crawler::addContent()` 105 | * [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document 106 | namespace 107 | 108 | 2.3.0 109 | ----- 110 | 111 | * added Crawler::html() 112 | * [BC BREAK] Crawler::each() and Crawler::reduce() now return Crawler instances instead of DomElement instances 113 | * added schema relative URL support to links 114 | * added support for HTML5 'form' attribute 115 | 116 | 2.2.0 117 | ----- 118 | 119 | * added a way to set raw path to the file in FileFormField - necessary for 120 | simulating HTTP requests 121 | 122 | 2.1.0 123 | ----- 124 | 125 | * added support for the HTTP PATCH method 126 | * refactored the Form class internals to support multi-dimensional fields 127 | (the public API is backward compatible) 128 | * added a way to get parsing errors for Crawler::addHtmlContent() and 129 | Crawler::addXmlContent() via libxml functions 130 | * added support for submitting a form without a submit button 131 | -------------------------------------------------------------------------------- /Crawler.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler; 13 | 14 | use Masterminds\HTML5; 15 | use Symfony\Component\CssSelector\CssSelectorConverter; 16 | 17 | /** 18 | * Crawler eases navigation of a list of \DOMNode objects. 19 | * 20 | * @author Fabien Potencier 21 | * 22 | * @implements \IteratorAggregate 23 | */ 24 | class Crawler implements \Countable, \IteratorAggregate 25 | { 26 | /** 27 | * The default namespace prefix to be used with XPath and CSS expressions. 
*/
    private string $defaultNamespacePrefix = 'default';

    /**
     * Namespaces registered by hand, keyed by prefix.
     *
     * @var array<string, string>
     */
    private array $namespaces = [];

    /**
     * A map of cached namespaces.
     */
    private \ArrayObject $cachedNamespaces;

    private ?string $baseHref;
    private ?\DOMDocument $document = null;

    /**
     * @var list<\DOMNode>
     */
    private array $nodes = [];

    /**
     * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
     */
    private bool $isHtml = true;

    private ?HTML5 $html5Parser = null;

    /**
     * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
     */
    public function __construct(
        \DOMNodeList|\DOMNode|array|string|null $node = null,
        protected ?string $uri = null,
        ?string $baseHref = null,
        bool $useHtml5Parser = true,
    ) {
        $this->cachedNamespaces = new \ArrayObject();

        // The property already defaults to null, so only the opt-in case needs an assignment
        if ($useHtml5Parser) {
            $this->html5Parser = new HTML5(['disable_html_ns' => true]);
        }

        // An empty base href falls back to the document URI
        $this->baseHref = $baseHref ? $baseHref : $uri;

        // Must run last: adding content relies on the state initialized above
        $this->add($node);
    }

    /**
     * Returns the URI this crawler was created with.
     */
    public function getUri(): ?string
    {
        return $this->uri;
    }

    /**
     * Returns the base href used to resolve links.
     */
    public function getBaseHref(): ?string
    {
        return $this->baseHref;
    }

    /**
     * Drops every node together with the document and the namespace cache.
     */
    public function clear(): void
    {
        $this->cachedNamespaces = new \ArrayObject();
        $this->document = null;
        $this->nodes = [];
    }

    /**
     * Adds a node to the current list of nodes.
     *
     * This method uses the appropriate specialized add*() method based
     * on the type of the argument.
105 | * 106 | * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node 107 | * 108 | * @throws \InvalidArgumentException when node is not the expected type 109 | */ 110 | public function add(\DOMNodeList|\DOMNode|array|string|null $node): void 111 | { 112 | if ($node instanceof \DOMNodeList) { 113 | $this->addNodeList($node); 114 | } elseif ($node instanceof \DOMNode) { 115 | $this->addNode($node); 116 | } elseif (\is_array($node)) { 117 | $this->addNodes($node); 118 | } elseif (\is_string($node)) { 119 | $this->addContent($node); 120 | } elseif (null !== $node) { 121 | throw new \InvalidArgumentException(\sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', get_debug_type($node))); 122 | } 123 | } 124 | 125 | /** 126 | * Adds HTML/XML content. 127 | * 128 | * If the charset is not set via the content type, it is assumed to be UTF-8, 129 | * or ISO-8859-1 as a fallback, which is the default charset defined by the 130 | * HTTP 1.1 specification. 131 | */ 132 | public function addContent(string $content, ?string $type = null): void 133 | { 134 | if (!$type) { 135 | $type = str_starts_with($content, 'convertToHtmlEntities('charset=', $m[2])) { 149 | $charset = $m[2]; 150 | } 151 | 152 | return $m[1].$charset; 153 | }, $content, 1); 154 | 155 | if ('x' === $xmlMatches[1]) { 156 | $this->addXmlContent($content, $charset); 157 | } else { 158 | $this->addHtmlContent($content, $charset); 159 | } 160 | } 161 | 162 | /** 163 | * Adds an HTML content to the list of nodes. 164 | * 165 | * The libxml errors are disabled when the content is parsed. 166 | * 167 | * If you want to get parsing errors, be sure to enable 168 | * internal errors via libxml_use_internal_errors(true) 169 | * and then, get the errors via libxml_get_errors(). Be 170 | * sure to clear errors with libxml_clear_errors() afterward. 
*/
    public function addHtmlContent(string $content, string $charset = 'UTF-8'): void
    {
        $dom = $this->parseHtmlString($content, $charset);
        $this->addDocument($dom);

        // Look for a <base href="..."> in the nodes held by the crawler
        $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
        $firstHref = current($hrefs);

        if (!\count($hrefs) || !$firstHref) {
            return;
        }

        if (!$this->baseHref) {
            $this->baseHref = $firstHref;

            return;
        }

        // Resolve the <base> href against the base already known, reusing Link's URI resolution
        $anchor = $dom->createElement('a');
        $anchor->setAttribute('href', $firstHref);
        $this->baseHref = (new Link($anchor, $this->baseHref))->getUri();
    }

    /**
     * Adds an XML content to the list of nodes.
     *
     * The libxml errors are disabled when the content is parsed.
     *
     * If you want to get parsing errors, be sure to enable
     * internal errors via libxml_use_internal_errors(true)
     * and then, get the errors via libxml_get_errors(). Be
     * sure to clear errors with libxml_clear_errors() afterward.
*
     * @param int $options Bitwise OR of the libxml option constants
     *                     LIBXML_PARSEHUGE is dangerous, see
     *                     http://symfony.com/blog/security-release-symfony-2-0-17-released
     */
    public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET): void
    {
        // remove the default namespace if it's the only namespace to make XPath expressions simpler
        $hasPrefixedNamespace = str_contains($content, 'xmlns:');
        if (!$hasPrefixedNamespace) {
            $content = str_replace('xmlns', 'ns', $content);
        }

        $document = new \DOMDocument('1.0', $charset);
        $document->validateOnParse = true;

        // Collect libxml errors internally while parsing, then restore the previous setting
        $previousSetting = libxml_use_internal_errors(true);

        if ('' !== trim($content)) {
            @$document->loadXML($content, $options);
        }

        libxml_use_internal_errors($previousSetting);

        $this->addDocument($document);

        $this->isHtml = false;
    }

    /**
     * Adds the root element of a \DOMDocument to the list of nodes.
     *
     * A document without a root element adds nothing.
     *
     * @param \DOMDocument $dom A \DOMDocument instance
     */
    public function addDocument(\DOMDocument $dom): void
    {
        $root = $dom->documentElement;

        if (null !== $root) {
            $this->addNode($root);
        }
    }

    /**
     * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
     *
     * @param \DOMNodeList $nodes A \DOMNodeList instance
     */
    public function addNodeList(\DOMNodeList $nodes): void
    {
        foreach ($nodes as $item) {
            if (!$item instanceof \DOMNode) {
                continue;
            }

            $this->addNode($item);
        }
    }

    /**
     * Adds an array of \DOMNode instances to the list of nodes.
     *
     * Each entry goes through add(), so it is dispatched on its own type.
     *
     * @param \DOMNode[] $nodes An array of \DOMNode instances
     */
    public function addNodes(array $nodes): void
    {
        foreach ($nodes as $entry) {
            $this->add($entry);
        }
    }

    /**
     * Adds a \DOMNode instance to the list of nodes.
*
     * A \DOMDocument is replaced by its root element; a document without a
     * root element adds nothing. Nodes already in the list are not added twice.
     *
     * @param \DOMNode $node A \DOMNode instance
     *
     * @throws \InvalidArgumentException When the node belongs to another document than the nodes already added
     */
    public function addNode(\DOMNode $node): void
    {
        if ($node instanceof \DOMDocument) {
            $node = $node->documentElement;

            // An empty document has no root element: bail out instead of
            // dereferencing null and storing it in the node list
            if (null === $node) {
                return;
            }
        }

        if (null !== $this->document && $this->document !== $node->ownerDocument) {
            throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
        }

        // The first node added pins the document of this crawler
        $this->document ??= $node->ownerDocument;

        // Don't add duplicate nodes in the Crawler
        if (\in_array($node, $this->nodes, true)) {
            return;
        }

        $this->nodes[] = $node;
    }

    /**
     * Returns a node given its position in the node list.
     *
     * An unknown position yields an empty crawler.
     */
    public function eq(int $position): static
    {
        if (isset($this->nodes[$position])) {
            return $this->createSubCrawler($this->nodes[$position]);
        }

        return $this->createSubCrawler(null);
    }

    /**
     * Calls an anonymous function on each node of the list.
     *
     * The anonymous function receives the node wrapped in a Crawler
     * instance and its position as arguments.
     *
     * Example:
     *
     *     $crawler->filter('h1')->each(function ($node, $i) {
     *         return $node->text();
     *     });
     *
     * @param \Closure $closure An anonymous function
     *
     * @return array An array of values returned by the anonymous function
     */
    public function each(\Closure $closure): array
    {
        $data = [];
        foreach ($this->nodes as $i => $node) {
            $data[] = $closure($this->createSubCrawler($node), $i);
        }

        return $data;
    }

    /**
     * Slices the list of nodes by $offset and $length.
*/
    public function slice(int $offset = 0, ?int $length = null): static
    {
        $selection = \array_slice($this->nodes, $offset, $length);

        return $this->createSubCrawler($selection);
    }

    /**
     * Reduces the list of nodes by calling an anonymous function.
     *
     * A node is dropped only when the anonymous function returns exactly false.
     *
     * @param \Closure $closure An anonymous function
     */
    public function reduce(\Closure $closure): static
    {
        $kept = [];
        foreach ($this->nodes as $position => $candidate) {
            if (false === $closure($this->createSubCrawler($candidate), $position)) {
                continue;
            }

            $kept[] = $candidate;
        }

        return $this->createSubCrawler($kept);
    }

    /**
     * Returns the first node of the current selection.
     */
    public function first(): static
    {
        return $this->eq(0);
    }

    /**
     * Returns the last node of the current selection.
     */
    public function last(): static
    {
        $lastPosition = \count($this->nodes) - 1;

        return $this->eq($lastPosition);
    }

    /**
     * Returns the siblings nodes of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function siblings(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        // Walk from the parent's first child
        $firstSibling = $this->getNode(0)->parentNode->firstChild;

        return $this->createSubCrawler($this->sibling($firstSibling));
    }

    /**
     * Tells whether the current selection matches the given CSS selector.
     *
     * An empty node list never matches.
     */
    public function matches(string $selector): bool
    {
        if (!$this->nodes) {
            return false;
        }

        $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
        $matched = $this->filterRelativeXPath($xpath);

        return 0 !== $matched->count();
    }

    /**
     * Return first parents (heading toward the document root) of the Element that matches the provided selector.
*
     * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function closest(string $selector): ?self
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        // Start at the first node itself and climb while the current node is an element
        for (
            $current = $this->getNode(0);
            null !== $current && \XML_ELEMENT_NODE === $current->nodeType;
            $current = $current->parentNode
        ) {
            $candidate = $this->createSubCrawler($current);

            if ($candidate->matches($selector)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Returns the next siblings nodes of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function nextAll(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $following = $this->sibling($this->getNode(0));

        return $this->createSubCrawler($following);
    }

    /**
     * Returns the previous sibling nodes of the current selection.
     *
     * @throws \InvalidArgumentException
     */
    public function previousAll(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $preceding = $this->sibling($this->getNode(0), 'previousSibling');

        return $this->createSubCrawler($preceding);
    }

    /**
     * Returns the ancestors of the current selection.
*
     * @throws \InvalidArgumentException When the current node is empty
     */
    public function ancestors(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $elements = [];

        // Climb parent by parent, keeping element nodes only
        for ($parent = $this->getNode(0)->parentNode; null !== $parent; $parent = $parent->parentNode) {
            if (\XML_ELEMENT_NODE === $parent->nodeType) {
                $elements[] = $parent;
            }
        }

        return $this->createSubCrawler($elements);
    }

    /**
     * Returns the children nodes of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
     */
    public function children(?string $selector = null): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        if (null === $selector) {
            $firstChild = $this->getNode(0)->firstChild;
            $children = null !== $firstChild ? $this->sibling($firstChild) : [];

            return $this->createSubCrawler($children);
        }

        $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Returns the attribute value of the first node of the list.
     *
     * @param string|null $default When not null: the value to return when the node or attribute is empty
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function attr(string $attribute, ?string $default = null): ?string
    {
        if (!$this->nodes) {
            return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
        }

        $first = $this->getNode(0);

        if (!$first->hasAttribute($attribute)) {
            return $default;
        }

        return $first->getAttribute($attribute);
    }

    /**
     * Returns the node name of the first node of the list.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function nodeName(): string
    {
        if ($this->nodes) {
            return $this->getNode(0)->nodeName;
        }

        throw new \InvalidArgumentException('The current node list is empty.');
    }

    /**
     * Returns the text of the first node of the list.
     *
     * Whitespace is normalized unless false is passed as the second argument.
     *
     * @param string|null $default             When not null: the value to return when the current node is empty
     * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function text(?string $default = null, bool $normalizeWhitespace = true): string
    {
        if (!$this->nodes) {
            return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
        }

        $value = $this->getNode(0)->nodeValue;

        return $normalizeWhitespace ? $this->normalizeWhitespace($value) : $value;
    }

    /**
     * Returns only the inner text that is the direct descendent of the current node, excluding any child nodes.
*
     * @param bool $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
     */
    public function innerText(bool $normalizeWhitespace = true): string
    {
        foreach ($this->getNode(0)->childNodes as $child) {
            // Only text and CDATA children carry direct text
            if (!\in_array($child->nodeType, [\XML_TEXT_NODE, \XML_CDATA_SECTION_NODE], true)) {
                continue;
            }

            $value = $child->nodeValue;

            // Without normalization the very first text child wins, even when blank
            if (!$normalizeWhitespace) {
                return $value;
            }

            // With normalization, blank text children are skipped
            if ('' === trim($value)) {
                continue;
            }

            return $this->normalizeWhitespace($value);
        }

        return '';
    }

    /**
     * Returns the first node of the list as HTML.
     *
     * @param string|null $default When not null: the value to return when the current node is empty
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function html(?string $default = null): string
    {
        if (!$this->nodes) {
            return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
        }

        $first = $this->getNode(0);
        $serializer = $first->ownerDocument;

        // Serialize with the HTML5 parser when one is set and the document's first child dumps to an empty string
        if ($this->html5Parser && '' === $serializer->saveXML($serializer->childNodes[0])) {
            $serializer = $this->html5Parser;
        }

        $markup = '';
        foreach ($first->childNodes as $childNode) {
            $markup .= $serializer->saveHTML($childNode);
        }

        return $markup;
    }

    /**
     * Returns the first node of the list as HTML, including the node itself.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function outerHtml(): string
    {
        if (!\count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $first = $this->getNode(0);
        $serializer = $first->ownerDocument;

        // Same serializer choice as html()
        if ($this->html5Parser && '' === $serializer->saveXML($serializer->childNodes[0])) {
            $serializer = $this->html5Parser;
        }

        return $serializer->saveHTML($first);
    }

    /**
     * Evaluates an XPath expression.
*
     * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
     * this method will return either an array of simple types or a new Crawler instance.
     *
     * @throws \LogicException When the crawler holds no document yet
     */
    public function evaluate(string $xpath): array|self
    {
        if (null === $this->document) {
            throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
        }

        $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

        $results = [];
        foreach ($this->nodes as $contextNode) {
            $results[] = $domxpath->evaluate($xpath, $contextNode);
        }

        // The type of the first result decides between a crawler and a plain array
        if (($results[0] ?? null) instanceof \DOMNodeList) {
            return $this->createSubCrawler($results);
        }

        return $results;
    }

    /**
     * Extracts information from the list of nodes.
     *
     * You can extract attributes or/and the node value (_text) or the node name (_name).
     * With a single requested item each entry is the bare value, otherwise
     * each entry is an array of values.
     *
     * Example:
     *
     *     $crawler->filter('h1 a')->extract(['_text', 'href']);
     */
    public function extract(array $attributes): array
    {
        $single = 1 === \count($attributes);

        $rows = [];
        foreach ($this->nodes as $node) {
            $row = [];
            foreach ($attributes as $attribute) {
                $row[] = match ($attribute) {
                    '_text' => $node->nodeValue,
                    '_name' => $node->nodeName,
                    default => $node->getAttribute($attribute),
                };
            }

            $rows[] = $single ? $row[0] : $row;
        }

        return $rows;
    }

    /**
     * Filters the list of nodes with an XPath expression.
     *
     * The XPath expression is evaluated in the context of the crawler, which
     * is considered as a fake parent of the elements inside it.
     * This means that a child selector "div" or "./div" will match only
     * the div elements of the current crawler, not their children.
*/
    public function filterXPath(string $xpath): static
    {
        $relative = $this->relativize($xpath);

        // If we dropped all expressions in the XPath while preparing it, there would be no match
        return '' === $relative
            ? $this->createSubCrawler(null)
            : $this->filterRelativeXPath($relative);
    }

    /**
     * Filters the list of nodes with a CSS selector.
     *
     * This method only works if you have installed the CssSelector Symfony Component.
     *
     * @throws \LogicException if the CssSelector Component is not available
     */
    public function filter(string $selector): static
    {
        // The CssSelector already prefixes the selector with descendant-or-self::
        $xpath = $this->createCssSelectorConverter()->toXPath($selector);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Selects links by name or alt value for clickable images.
     */
    public function selectLink(string $value): static
    {
        // Padding both sides with a space makes contains() match whole words only
        $needle = static::xpathLiteral(' '.$value.' ');
        $xpath = \sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $needle);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Selects images by alt value.
     */
    public function selectImage(string $value): static
    {
        $needle = static::xpathLiteral($value);

        return $this->filterRelativeXPath(
            \sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $needle)
        );
    }

    /**
     * Selects a button by name or alt value for images.
*/
    public function selectButton(string $value): static
    {
        // Lower-cases @type so the comparisons below are case-insensitive
        $loweredType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
        // Space-padded literal for whole-word matches, bare literal for @id / @name equality
        $paddedLiteral = static::xpathLiteral(' '.$value.' ');
        $exactLiteral = static::xpathLiteral($value);

        $xpath = \sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $loweredType, $paddedLiteral, $exactLiteral);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Returns a Link object for the first node in the list.
     *
     * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
     */
    public function link(string $method = 'get'): Link
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $first = $this->getNode(0);

        if ($first instanceof \DOMElement) {
            return new Link($first, $this->baseHref, $method);
        }

        throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($first)));
    }

    /**
     * Returns an array of Link objects for the nodes in the list.
781 | * 782 | * @return Link[] 783 | * 784 | * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances 785 | */ 786 | public function links(): array 787 | { 788 | $links = []; 789 | foreach ($this->nodes as $node) { 790 | if (!$node instanceof \DOMElement) { 791 | throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($node))); 792 | } 793 | 794 | $links[] = new Link($node, $this->baseHref, 'get'); 795 | } 796 | 797 | return $links; 798 | } 799 | 800 | /** 801 | * Returns an Image object for the first node in the list. 802 | * 803 | * @throws \InvalidArgumentException If the current node list is empty 804 | */ 805 | public function image(): Image 806 | { 807 | if (!\count($this)) { 808 | throw new \InvalidArgumentException('The current node list is empty.'); 809 | } 810 | 811 | $node = $this->getNode(0); 812 | 813 | if (!$node instanceof \DOMElement) { 814 | throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($node))); 815 | } 816 | 817 | return new Image($node, $this->baseHref); 818 | } 819 | 820 | /** 821 | * Returns an array of Image objects for the nodes in the list. 822 | * 823 | * @return Image[] 824 | */ 825 | public function images(): array 826 | { 827 | $images = []; 828 | foreach ($this as $node) { 829 | if (!$node instanceof \DOMElement) { 830 | throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($node))); 831 | } 832 | 833 | $images[] = new Image($node, $this->baseHref); 834 | } 835 | 836 | return $images; 837 | } 838 | 839 | /** 840 | * Returns a Form object for the first node in the list. 
841 | * 842 | * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement 843 | */ 844 | public function form(?array $values = null, ?string $method = null): Form 845 | { 846 | if (!$this->nodes) { 847 | throw new \InvalidArgumentException('The current node list is empty.'); 848 | } 849 | 850 | $node = $this->getNode(0); 851 | 852 | if (!$node instanceof \DOMElement) { 853 | throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($node))); 854 | } 855 | 856 | $form = new Form($node, $this->uri, $method, $this->baseHref); 857 | 858 | if (null !== $values) { 859 | $form->setValues($values); 860 | } 861 | 862 | return $form; 863 | } 864 | 865 | /** 866 | * Overloads a default namespace prefix to be used with XPath and CSS expressions. 867 | */ 868 | public function setDefaultNamespacePrefix(string $prefix): void 869 | { 870 | $this->defaultNamespacePrefix = $prefix; 871 | } 872 | 873 | public function registerNamespace(string $prefix, string $namespace): void 874 | { 875 | $this->namespaces[$prefix] = $namespace; 876 | } 877 | 878 | /** 879 | * Converts string for XPath expressions. 880 | * 881 | * Escaped characters are: quotes (") and apostrophe ('). 
882 | * 883 | * Examples: 884 | * 885 | * echo Crawler::xpathLiteral('foo " bar'); 886 | * //prints 'foo " bar' 887 | * 888 | * echo Crawler::xpathLiteral("foo ' bar"); 889 | * //prints "foo ' bar" 890 | * 891 | * echo Crawler::xpathLiteral('a\'b"c'); 892 | * //prints concat('a', "'", 'b"c') 893 | */ 894 | public static function xpathLiteral(string $s): string 895 | { 896 | if (!str_contains($s, "'")) { 897 | return \sprintf("'%s'", $s); 898 | } 899 | 900 | if (!str_contains($s, '"')) { 901 | return \sprintf('"%s"', $s); 902 | } 903 | 904 | $string = $s; 905 | $parts = []; 906 | while (true) { 907 | if (false !== $pos = strpos($string, "'")) { 908 | $parts[] = \sprintf("'%s'", substr($string, 0, $pos)); 909 | $parts[] = "\"'\""; 910 | $string = substr($string, $pos + 1); 911 | } else { 912 | $parts[] = "'$string'"; 913 | break; 914 | } 915 | } 916 | 917 | return \sprintf('concat(%s)', implode(', ', $parts)); 918 | } 919 | 920 | /** 921 | * Filters the list of nodes with an XPath expression. 922 | * 923 | * The XPath expression should already be processed to apply it in the context of each node. 924 | */ 925 | private function filterRelativeXPath(string $xpath): static 926 | { 927 | $crawler = $this->createSubCrawler(null); 928 | if (null === $this->document) { 929 | return $crawler; 930 | } 931 | 932 | $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath)); 933 | 934 | foreach ($this->nodes as $node) { 935 | $crawler->add($domxpath->query($xpath, $node)); 936 | } 937 | 938 | return $crawler; 939 | } 940 | 941 | /** 942 | * Make the XPath relative to the current context. 943 | * 944 | * The returned XPath will match elements matching the XPath inside the current crawler 945 | * when running in the context of a node of the crawler. 
946 | */ 947 | private function relativize(string $xpath): string 948 | { 949 | $expressions = []; 950 | 951 | // An expression which will never match to replace expressions which cannot match in the crawler 952 | // We cannot drop 953 | $nonMatchingExpression = 'a[name() = "b"]'; 954 | 955 | $xpathLen = \strlen($xpath); 956 | $openedBrackets = 0; 957 | $startPosition = strspn($xpath, " \t\n\r\0\x0B"); 958 | 959 | for ($i = $startPosition; $i <= $xpathLen; ++$i) { 960 | $i += strcspn($xpath, '"\'[]|', $i); 961 | 962 | if ($i < $xpathLen) { 963 | switch ($xpath[$i]) { 964 | case '"': 965 | case "'": 966 | if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) { 967 | return $xpath; // The XPath expression is invalid 968 | } 969 | continue 2; 970 | case '[': 971 | ++$openedBrackets; 972 | continue 2; 973 | case ']': 974 | --$openedBrackets; 975 | continue 2; 976 | } 977 | } 978 | if ($openedBrackets) { 979 | continue; 980 | } 981 | 982 | if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) { 983 | // If the union is inside some braces, we need to preserve the opening braces and apply 984 | // the change only inside it. 
985 | $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1); 986 | $parenthesis = substr($xpath, $startPosition, $j); 987 | $startPosition += $j; 988 | } else { 989 | $parenthesis = ''; 990 | } 991 | $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition)); 992 | 993 | if (str_starts_with($expression, 'self::*/')) { 994 | $expression = './'.substr($expression, 8); 995 | } 996 | 997 | // add prefix before absolute element selector 998 | if ('' === $expression) { 999 | $expression = $nonMatchingExpression; 1000 | } elseif (str_starts_with($expression, '//')) { 1001 | $expression = 'descendant-or-self::'.substr($expression, 2); 1002 | } elseif (str_starts_with($expression, './/')) { 1003 | $expression = 'descendant-or-self::'.substr($expression, 3); 1004 | } elseif (str_starts_with($expression, './')) { 1005 | $expression = 'self::'.substr($expression, 2); 1006 | } elseif (str_starts_with($expression, 'child::')) { 1007 | $expression = 'self::'.substr($expression, 7); 1008 | } elseif ('/' === $expression[0] || '.' 
=== $expression[0] || str_starts_with($expression, 'self::')) { 1009 | $expression = $nonMatchingExpression; 1010 | } elseif (str_starts_with($expression, 'descendant::')) { 1011 | $expression = 'descendant-or-self::'.substr($expression, 12); 1012 | } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) { 1013 | // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes) 1014 | $expression = $nonMatchingExpression; 1015 | } elseif (!str_starts_with($expression, 'descendant-or-self::')) { 1016 | $expression = 'self::'.$expression; 1017 | } 1018 | $expressions[] = $parenthesis.$expression; 1019 | 1020 | if ($i === $xpathLen) { 1021 | return implode(' | ', $expressions); 1022 | } 1023 | 1024 | $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1); 1025 | $startPosition = $i + 1; 1026 | } 1027 | 1028 | return $xpath; // The XPath expression is invalid 1029 | } 1030 | 1031 | public function getNode(int $position): ?\DOMNode 1032 | { 1033 | return $this->nodes[$position] ?? 
null; 1034 | } 1035 | 1036 | public function count(): int 1037 | { 1038 | return \count($this->nodes); 1039 | } 1040 | 1041 | /** 1042 | * @return \ArrayIterator 1043 | */ 1044 | public function getIterator(): \ArrayIterator 1045 | { 1046 | return new \ArrayIterator($this->nodes); 1047 | } 1048 | 1049 | protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'): array 1050 | { 1051 | $nodes = []; 1052 | 1053 | $currentNode = $this->getNode(0); 1054 | do { 1055 | if ($node !== $currentNode && \XML_ELEMENT_NODE === $node->nodeType) { 1056 | $nodes[] = $node; 1057 | } 1058 | } while ($node = $node->$siblingDir); 1059 | 1060 | return $nodes; 1061 | } 1062 | 1063 | private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument 1064 | { 1065 | if (!$this->supportsEncoding($charset)) { 1066 | $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); 1067 | $charset = 'UTF-8'; 1068 | } 1069 | 1070 | return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]); 1071 | } 1072 | 1073 | private function supportsEncoding(string $encoding): bool 1074 | { 1075 | try { 1076 | return '' === @mb_convert_encoding('', $encoding, 'UTF-8'); 1077 | } catch (\Throwable $e) { 1078 | return false; 1079 | } 1080 | } 1081 | 1082 | private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument 1083 | { 1084 | if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) { 1085 | $htmlContent = ''.$htmlContent; 1086 | } else { 1087 | $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); 1088 | } 1089 | 1090 | $internalErrors = libxml_use_internal_errors(true); 1091 | 1092 | $dom = new \DOMDocument('1.0', $charset); 1093 | $dom->validateOnParse = true; 1094 | 1095 | if ('' !== trim($htmlContent)) { 1096 | @$dom->loadHTML($htmlContent); 1097 | } 1098 | 1099 | libxml_use_internal_errors($internalErrors); 1100 | 1101 | return $dom; 1102 | } 1103 | 1104 | /** 1105 | * Converts charset to 
HTML-entities to ensure valid parsing. 1106 | */ 1107 | private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string 1108 | { 1109 | set_error_handler(static fn () => throw new \Exception()); 1110 | 1111 | try { 1112 | return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset); 1113 | } catch (\Exception|\ValueError) { 1114 | try { 1115 | $htmlContent = iconv($charset, 'UTF-8', $htmlContent); 1116 | $htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'); 1117 | } catch (\Exception|\ValueError) { 1118 | } 1119 | 1120 | return $htmlContent; 1121 | } finally { 1122 | restore_error_handler(); 1123 | } 1124 | } 1125 | 1126 | /** 1127 | * @throws \InvalidArgumentException 1128 | */ 1129 | private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath 1130 | { 1131 | $domxpath = new \DOMXPath($document); 1132 | 1133 | foreach ($prefixes as $prefix) { 1134 | $namespace = $this->discoverNamespace($domxpath, $prefix); 1135 | if (null !== $namespace) { 1136 | $domxpath->registerNamespace($prefix, $namespace); 1137 | } 1138 | } 1139 | 1140 | return $domxpath; 1141 | } 1142 | 1143 | /** 1144 | * @throws \InvalidArgumentException 1145 | */ 1146 | private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string 1147 | { 1148 | if (\array_key_exists($prefix, $this->namespaces)) { 1149 | return $this->namespaces[$prefix]; 1150 | } 1151 | 1152 | if ($this->cachedNamespaces->offsetExists($prefix)) { 1153 | return $this->cachedNamespaces[$prefix]; 1154 | } 1155 | 1156 | // ask for one namespace, otherwise we'd get a collection with an item for each node 1157 | $namespaces = $domxpath->query(\sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix)); 1158 | 1159 | return $this->cachedNamespaces[$prefix] = ($node = $namespaces->item(0)) ? 
$node->nodeValue : null; 1160 | } 1161 | 1162 | private function findNamespacePrefixes(string $xpath): array 1163 | { 1164 | if (preg_match_all('/(?P[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) { 1165 | return array_unique($matches['prefix']); 1166 | } 1167 | 1168 | return []; 1169 | } 1170 | 1171 | /** 1172 | * Creates a crawler for some subnodes. 1173 | * 1174 | * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes 1175 | */ 1176 | private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes): static 1177 | { 1178 | $crawler = new static($nodes, $this->uri, $this->baseHref); 1179 | $crawler->isHtml = $this->isHtml; 1180 | $crawler->document = $this->document; 1181 | $crawler->namespaces = $this->namespaces; 1182 | $crawler->cachedNamespaces = $this->cachedNamespaces; 1183 | $crawler->html5Parser = $this->html5Parser; 1184 | 1185 | return $crawler; 1186 | } 1187 | 1188 | /** 1189 | * @throws \LogicException If the CssSelector Component is not available 1190 | */ 1191 | private function createCssSelectorConverter(): CssSelectorConverter 1192 | { 1193 | if (!class_exists(CssSelectorConverter::class)) { 1194 | throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.'); 1195 | } 1196 | 1197 | return new CssSelectorConverter($this->isHtml); 1198 | } 1199 | 1200 | /** 1201 | * Parse string into DOMDocument object using HTML5 parser if the content is HTML5 and the library is available. 1202 | * Use libxml parser otherwise. 
1203 | */ 1204 | private function parseHtmlString(string $content, string $charset): \DOMDocument 1205 | { 1206 | if ($this->canParseHtml5String($content)) { 1207 | return $this->parseHtml5($content, $charset); 1208 | } 1209 | 1210 | return $this->parseXhtml($content, $charset); 1211 | } 1212 | 1213 | private function canParseHtml5String(string $content): bool 1214 | { 1215 | if (!$this->html5Parser) { 1216 | return false; 1217 | } 1218 | 1219 | if (false === ($pos = stripos($content, ''))) { 1220 | return false; 1221 | } 1222 | 1223 | $header = substr($content, 0, $pos); 1224 | 1225 | return '' === $header || $this->isValidHtml5Heading($header); 1226 | } 1227 | 1228 | private function isValidHtml5Heading(string $heading): bool 1229 | { 1230 | return 1 === preg_match('/^\x{FEFF}?\s*(\s*)*$/u', $heading); 1231 | } 1232 | 1233 | private function normalizeWhitespace(string $string): string 1234 | { 1235 | return trim(preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $string), " \n\r\t\x0C"); 1236 | } 1237 | } 1238 | -------------------------------------------------------------------------------- /Field/ChoiceFormField.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Field; 13 | 14 | /** 15 | * ChoiceFormField represents a choice form field. 16 | * 17 | * It is constructed from an HTML select tag, or an HTML checkbox, or radio inputs. 18 | * 19 | * @author Fabien Potencier 20 | */ 21 | class ChoiceFormField extends FormField 22 | { 23 | private string $type; 24 | private bool $multiple; 25 | private array $options; 26 | private bool $validationDisabled = false; 27 | 28 | /** 29 | * Returns true if the field should be included in the submitted values. 
30 | * 31 | * @return bool true if the field should be included in the submitted values, false otherwise 32 | */ 33 | public function hasValue(): bool 34 | { 35 | // don't send a value for unchecked checkboxes 36 | if (\in_array($this->type, ['checkbox', 'radio']) && null === $this->value) { 37 | return false; 38 | } 39 | 40 | return true; 41 | } 42 | 43 | /** 44 | * Check if the current selected option is disabled. 45 | */ 46 | public function isDisabled(): bool 47 | { 48 | if ('checkbox' === $this->type) { 49 | return parent::isDisabled(); 50 | } 51 | 52 | if (parent::isDisabled() && 'select' === $this->type) { 53 | return true; 54 | } 55 | 56 | foreach ($this->options as $option) { 57 | if ($option['value'] == $this->value && $option['disabled']) { 58 | return true; 59 | } 60 | } 61 | 62 | return false; 63 | } 64 | 65 | /** 66 | * Sets the value of the field. 67 | */ 68 | public function select(string|array|bool $value): void 69 | { 70 | $this->setValue($value); 71 | } 72 | 73 | /** 74 | * Ticks a checkbox. 75 | * 76 | * @throws \LogicException When the type provided is not correct 77 | */ 78 | public function tick(): void 79 | { 80 | if ('checkbox' !== $this->type) { 81 | throw new \LogicException(\sprintf('You cannot tick "%s" as it is not a checkbox (%s).', $this->name, $this->type)); 82 | } 83 | 84 | $this->setValue(true); 85 | } 86 | 87 | /** 88 | * Unticks a checkbox. 89 | * 90 | * @throws \LogicException When the type provided is not correct 91 | */ 92 | public function untick(): void 93 | { 94 | if ('checkbox' !== $this->type) { 95 | throw new \LogicException(\sprintf('You cannot untick "%s" as it is not a checkbox (%s).', $this->name, $this->type)); 96 | } 97 | 98 | $this->setValue(false); 99 | } 100 | 101 | /** 102 | * Sets the value of the field. 
103 | * 104 | * @throws \InvalidArgumentException When value type provided is not correct 105 | */ 106 | public function setValue(string|array|bool|null $value): void 107 | { 108 | if ('checkbox' === $this->type && false === $value) { 109 | // uncheck 110 | $this->value = null; 111 | } elseif ('checkbox' === $this->type && true === $value) { 112 | // check 113 | $this->value = $this->options[0]['value']; 114 | } else { 115 | if (\is_array($value)) { 116 | if (!$this->multiple) { 117 | throw new \InvalidArgumentException(\sprintf('The value for "%s" cannot be an array.', $this->name)); 118 | } 119 | 120 | foreach ($value as $v) { 121 | if (!$this->containsOption($v, $this->options)) { 122 | throw new \InvalidArgumentException(\sprintf('Input "%s" cannot take "%s" as a value (possible values: "%s").', $this->name, $v, implode('", "', $this->availableOptionValues()))); 123 | } 124 | } 125 | } elseif (!$this->containsOption($value, $this->options)) { 126 | throw new \InvalidArgumentException(\sprintf('Input "%s" cannot take "%s" as a value (possible values: "%s").', $this->name, $value, implode('", "', $this->availableOptionValues()))); 127 | } 128 | 129 | if ($this->multiple) { 130 | $value = (array) $value; 131 | } 132 | 133 | if (\is_array($value)) { 134 | $this->value = $value; 135 | } else { 136 | parent::setValue($value); 137 | } 138 | } 139 | } 140 | 141 | /** 142 | * Adds a choice to the current ones. 
143 | * 144 | * @throws \LogicException When choice provided is not multiple nor radio 145 | * 146 | * @internal 147 | */ 148 | public function addChoice(\DOMElement $node): void 149 | { 150 | if (!$this->multiple && 'radio' !== $this->type) { 151 | throw new \LogicException(\sprintf('Unable to add a choice for "%s" as it is not multiple or is not a radio button.', $this->name)); 152 | } 153 | 154 | $option = $this->buildOptionValue($node); 155 | $this->options[] = $option; 156 | 157 | if ($node->hasAttribute('checked')) { 158 | $this->value = $option['value']; 159 | } 160 | } 161 | 162 | /** 163 | * Returns the type of the choice field (radio, select, or checkbox). 164 | */ 165 | public function getType(): string 166 | { 167 | return $this->type; 168 | } 169 | 170 | /** 171 | * Returns true if the field accepts multiple values. 172 | */ 173 | public function isMultiple(): bool 174 | { 175 | return $this->multiple; 176 | } 177 | 178 | /** 179 | * Initializes the form field. 180 | * 181 | * @throws \LogicException When node type is incorrect 182 | */ 183 | protected function initialize(): void 184 | { 185 | if ('input' !== $this->node->nodeName && 'select' !== $this->node->nodeName) { 186 | throw new \LogicException(\sprintf('A ChoiceFormField can only be created from an input or select tag (%s given).', $this->node->nodeName)); 187 | } 188 | 189 | if ('input' === $this->node->nodeName && 'checkbox' !== strtolower($this->node->getAttribute('type')) && 'radio' !== strtolower($this->node->getAttribute('type'))) { 190 | throw new \LogicException(\sprintf('A ChoiceFormField can only be created from an input tag with a type of checkbox or radio (given type is "%s").', $this->node->getAttribute('type'))); 191 | } 192 | 193 | $this->value = null; 194 | $this->options = []; 195 | $this->multiple = false; 196 | 197 | if ('input' == $this->node->nodeName) { 198 | $this->type = strtolower($this->node->getAttribute('type')); 199 | $optionValue = 
$this->buildOptionValue($this->node); 200 | $this->options[] = $optionValue; 201 | 202 | if ($this->node->hasAttribute('checked')) { 203 | $this->value = $optionValue['value']; 204 | } 205 | } else { 206 | $this->type = 'select'; 207 | if ($this->node->hasAttribute('multiple')) { 208 | $this->multiple = true; 209 | $this->value = []; 210 | $this->name = str_replace('[]', '', $this->name); 211 | } 212 | 213 | $found = false; 214 | foreach ($this->xpath->query('descendant::option', $this->node) as $option) { 215 | $optionValue = $this->buildOptionValue($option); 216 | $this->options[] = $optionValue; 217 | 218 | if ($option->hasAttribute('selected')) { 219 | $found = true; 220 | if ($this->multiple) { 221 | $this->value[] = $optionValue['value']; 222 | } else { 223 | $this->value = $optionValue['value']; 224 | } 225 | } 226 | } 227 | 228 | // if no option is selected and if it is a simple select box, take the first option as the value 229 | if (!$found && !$this->multiple && $this->options) { 230 | $this->value = $this->options[0]['value']; 231 | } 232 | } 233 | } 234 | 235 | /** 236 | * Returns option value with associated disabled flag. 237 | */ 238 | private function buildOptionValue(\DOMElement $node): array 239 | { 240 | $option = []; 241 | 242 | $defaultDefaultValue = 'select' === $this->node->nodeName ? '' : 'on'; 243 | $defaultValue = (isset($node->nodeValue) && $node->nodeValue) ? $node->nodeValue : $defaultDefaultValue; 244 | $option['value'] = $node->hasAttribute('value') ? $node->getAttribute('value') : $defaultValue; 245 | $option['disabled'] = $node->hasAttribute('disabled'); 246 | 247 | return $option; 248 | } 249 | 250 | /** 251 | * Checks whether given value is in the existing options. 
252 | * 253 | * @internal 254 | */ 255 | public function containsOption(string $optionValue, array $options): bool 256 | { 257 | if ($this->validationDisabled) { 258 | return true; 259 | } 260 | 261 | foreach ($options as $option) { 262 | if ($option['value'] == $optionValue) { 263 | return true; 264 | } 265 | } 266 | 267 | return false; 268 | } 269 | 270 | /** 271 | * Returns list of available field options. 272 | * 273 | * @internal 274 | */ 275 | public function availableOptionValues(): array 276 | { 277 | $values = []; 278 | 279 | foreach ($this->options as $option) { 280 | $values[] = $option['value']; 281 | } 282 | 283 | return $values; 284 | } 285 | 286 | /** 287 | * Disables the internal validation of the field. 288 | * 289 | * @internal 290 | * 291 | * @return $this 292 | */ 293 | public function disableValidation(): static 294 | { 295 | $this->validationDisabled = true; 296 | 297 | return $this; 298 | } 299 | } 300 | -------------------------------------------------------------------------------- /Field/FileFormField.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Field; 13 | 14 | /** 15 | * FileFormField represents a file form field (an HTML file input tag). 16 | * 17 | * @author Fabien Potencier 18 | */ 19 | class FileFormField extends FormField 20 | { 21 | /** 22 | * Sets the PHP error code associated with the field. 
23 | * 24 | * @param int $error The error code (one of UPLOAD_ERR_INI_SIZE, UPLOAD_ERR_FORM_SIZE, UPLOAD_ERR_PARTIAL, UPLOAD_ERR_NO_FILE, UPLOAD_ERR_NO_TMP_DIR, UPLOAD_ERR_CANT_WRITE, or UPLOAD_ERR_EXTENSION) 25 | * 26 | * @throws \InvalidArgumentException When error code doesn't exist 27 | */ 28 | public function setErrorCode(int $error): void 29 | { 30 | $codes = [\UPLOAD_ERR_INI_SIZE, \UPLOAD_ERR_FORM_SIZE, \UPLOAD_ERR_PARTIAL, \UPLOAD_ERR_NO_FILE, \UPLOAD_ERR_NO_TMP_DIR, \UPLOAD_ERR_CANT_WRITE, \UPLOAD_ERR_EXTENSION]; 31 | if (!\in_array($error, $codes)) { 32 | throw new \InvalidArgumentException(\sprintf('The error code "%s" is not valid.', $error)); 33 | } 34 | 35 | $this->value = ['name' => '', 'type' => '', 'tmp_name' => '', 'error' => $error, 'size' => 0]; 36 | } 37 | 38 | /** 39 | * Sets the value of the field. 40 | */ 41 | public function upload(?string $value): void 42 | { 43 | $this->setValue($value); 44 | } 45 | 46 | /** 47 | * Sets the value of the field. 48 | */ 49 | public function setValue(?string $value): void 50 | { 51 | if (null !== $value && is_readable($value)) { 52 | $error = \UPLOAD_ERR_OK; 53 | $size = filesize($value); 54 | $info = pathinfo($value); 55 | $name = $info['basename']; 56 | 57 | // copy to a tmp location 58 | $tmp = tempnam(sys_get_temp_dir(), $name); 59 | if (\array_key_exists('extension', $info)) { 60 | unlink($tmp); 61 | $tmp .= '.'.$info['extension']; 62 | } 63 | if (is_file($tmp)) { 64 | unlink($tmp); 65 | } 66 | copy($value, $tmp); 67 | $value = $tmp; 68 | } else { 69 | $error = \UPLOAD_ERR_NO_FILE; 70 | $size = 0; 71 | $name = ''; 72 | $value = ''; 73 | } 74 | 75 | $this->value = ['name' => $name, 'type' => '', 'tmp_name' => $value, 'error' => $error, 'size' => $size]; 76 | } 77 | 78 | /** 79 | * Sets path to the file as string for simulating HTTP request. 80 | */ 81 | public function setFilePath(string $path): void 82 | { 83 | parent::setValue($path); 84 | } 85 | 86 | /** 87 | * Initializes the form field. 
88 | * 89 | * @throws \LogicException When node type is incorrect 90 | */ 91 | protected function initialize(): void 92 | { 93 | if ('input' !== $this->node->nodeName) { 94 | throw new \LogicException(\sprintf('A FileFormField can only be created from an input tag (%s given).', $this->node->nodeName)); 95 | } 96 | 97 | if ('file' !== strtolower($this->node->getAttribute('type'))) { 98 | throw new \LogicException(\sprintf('A FileFormField can only be created from an input tag with a type of file (given type is "%s").', $this->node->getAttribute('type'))); 99 | } 100 | 101 | $this->setValue(null); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /Field/FormField.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Field; 13 | 14 | /** 15 | * FormField is the abstract class for all form fields. 16 | * 17 | * @author Fabien Potencier 18 | */ 19 | abstract class FormField 20 | { 21 | protected string $name; 22 | protected string|array|null $value = null; 23 | protected \DOMDocument $document; 24 | protected \DOMXPath $xpath; 25 | protected bool $disabled = false; 26 | 27 | /** 28 | * @param \DOMElement $node The node associated with this field 29 | */ 30 | public function __construct( 31 | protected \DOMElement $node, 32 | ) { 33 | $this->name = $node->getAttribute('name'); 34 | $this->xpath = new \DOMXPath($node->ownerDocument); 35 | 36 | $this->initialize(); 37 | } 38 | 39 | /** 40 | * Returns the label tag associated to the field or null if none. 
41 | */ 42 | public function getLabel(): ?\DOMElement 43 | { 44 | $xpath = new \DOMXPath($this->node->ownerDocument); 45 | 46 | if ($this->node->hasAttribute('id')) { 47 | $labels = $xpath->query(\sprintf('descendant::label[@for="%s"]', $this->node->getAttribute('id'))); 48 | if ($labels->length > 0) { 49 | return $labels->item(0); 50 | } 51 | } 52 | 53 | $labels = $xpath->query('ancestor::label[1]', $this->node); 54 | 55 | return $labels->length > 0 ? $labels->item(0) : null; 56 | } 57 | 58 | /** 59 | * Returns the name of the field. 60 | */ 61 | public function getName(): string 62 | { 63 | return $this->name; 64 | } 65 | 66 | /** 67 | * Gets the value of the field. 68 | */ 69 | public function getValue(): string|array|null 70 | { 71 | return $this->value; 72 | } 73 | 74 | /** 75 | * Sets the value of the field. 76 | */ 77 | public function setValue(?string $value): void 78 | { 79 | $this->value = $value ?? ''; 80 | } 81 | 82 | /** 83 | * Returns true if the field should be included in the submitted values. 84 | */ 85 | public function hasValue(): bool 86 | { 87 | return true; 88 | } 89 | 90 | /** 91 | * Check if the current field is disabled. 92 | */ 93 | public function isDisabled(): bool 94 | { 95 | return $this->node->hasAttribute('disabled'); 96 | } 97 | 98 | /** 99 | * Initializes the form field. 100 | */ 101 | abstract protected function initialize(): void; 102 | } 103 | -------------------------------------------------------------------------------- /Field/InputFormField.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Field; 13 | 14 | /** 15 | * InputFormField represents an input form field (an HTML input tag). 
16 | * 17 | * For inputs with type of file, checkbox, or radio, there are other more 18 | * specialized classes (cf. FileFormField and ChoiceFormField). 19 | * 20 | * @author Fabien Potencier 21 | */ 22 | class InputFormField extends FormField 23 | { 24 | /** 25 | * Initializes the form field. 26 | * 27 | * @throws \LogicException When node type is incorrect 28 | */ 29 | protected function initialize(): void 30 | { 31 | if ('input' !== $this->node->nodeName && 'button' !== $this->node->nodeName) { 32 | throw new \LogicException(\sprintf('An InputFormField can only be created from an input or button tag (%s given).', $this->node->nodeName)); 33 | } 34 | 35 | $type = strtolower($this->node->getAttribute('type')); 36 | if ('checkbox' === $type) { 37 | throw new \LogicException('Checkboxes should be instances of ChoiceFormField.'); 38 | } 39 | 40 | if ('file' === $type) { 41 | throw new \LogicException('File inputs should be instances of FileFormField.'); 42 | } 43 | 44 | $this->value = $this->node->getAttribute('value'); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /Field/TextareaFormField.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Field; 13 | 14 | /** 15 | * TextareaFormField represents a textarea form field (an HTML textarea tag). 16 | * 17 | * @author Fabien Potencier 18 | */ 19 | class TextareaFormField extends FormField 20 | { 21 | /** 22 | * Initializes the form field. 
23 | * 24 | * @throws \LogicException When node type is incorrect 25 | */ 26 | protected function initialize(): void 27 | { 28 | if ('textarea' !== $this->node->nodeName) { 29 | throw new \LogicException(\sprintf('A TextareaFormField can only be created from a textarea tag (%s given).', $this->node->nodeName)); 30 | } 31 | 32 | $this->value = ''; 33 | foreach ($this->node->childNodes as $node) { 34 | $this->value .= $node->wholeText; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /Form.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler; 13 | 14 | use Symfony\Component\DomCrawler\Field\ChoiceFormField; 15 | use Symfony\Component\DomCrawler\Field\FormField; 16 | 17 | /** 18 | * Form represents an HTML form. 19 | * 20 | * @author Fabien Potencier 21 | */ 22 | class Form extends Link implements \ArrayAccess 23 | { 24 | private \DOMElement $button; 25 | private FormFieldRegistry $fields; 26 | 27 | /** 28 | * @param \DOMElement $node A \DOMElement instance 29 | * @param string|null $currentUri The URI of the page where the form is embedded 30 | * @param string|null $method The method to use for the link (if null, it defaults to the method defined by the form) 31 | * @param string|null $baseHref The URI of the used for relative links, but not for empty action 32 | * 33 | * @throws \LogicException if the node is not a button inside a form tag 34 | */ 35 | public function __construct( 36 | \DOMElement $node, 37 | ?string $currentUri = null, 38 | ?string $method = null, 39 | private ?string $baseHref = null, 40 | ) { 41 | parent::__construct($node, $currentUri, $method); 42 | 43 | $this->initialize(); 44 | } 45 | 46 | /** 47 | * Gets the form node associated with this form. 
48 | */ 49 | public function getFormNode(): \DOMElement 50 | { 51 | return $this->node; 52 | } 53 | 54 | /** 55 | * Sets the value of the fields. 56 | * 57 | * @param array $values An array of field values 58 | * 59 | * @return $this 60 | */ 61 | public function setValues(array $values): static 62 | { 63 | foreach ($values as $name => $value) { 64 | $this->fields->set($name, $value); 65 | } 66 | 67 | return $this; 68 | } 69 | 70 | /** 71 | * Gets the field values. 72 | * 73 | * The returned array does not include file fields (@see getFiles). 74 | */ 75 | public function getValues(): array 76 | { 77 | $values = []; 78 | foreach ($this->fields->all() as $name => $field) { 79 | if ($field->isDisabled()) { 80 | continue; 81 | } 82 | 83 | if (!$field instanceof Field\FileFormField && $field->hasValue()) { 84 | $values[$name] = $field->getValue(); 85 | } 86 | } 87 | 88 | return $values; 89 | } 90 | 91 | /** 92 | * Gets the file field values. 93 | */ 94 | public function getFiles(): array 95 | { 96 | if (!\in_array($this->getMethod(), ['POST', 'PUT', 'DELETE', 'PATCH'])) { 97 | return []; 98 | } 99 | 100 | $files = []; 101 | 102 | foreach ($this->fields->all() as $name => $field) { 103 | if ($field->isDisabled()) { 104 | continue; 105 | } 106 | 107 | if ($field instanceof Field\FileFormField) { 108 | $files[$name] = $field->getValue(); 109 | } 110 | } 111 | 112 | return $files; 113 | } 114 | 115 | /** 116 | * Gets the field values as PHP. 117 | * 118 | * This method converts fields with the array notation 119 | * (like foo[bar] to arrays) like PHP does. 
120 | */ 121 | public function getPhpValues(): array 122 | { 123 | $values = []; 124 | foreach ($this->getValues() as $name => $value) { 125 | $qs = http_build_query([$name => $value], '', '&'); 126 | if ($qs) { 127 | parse_str($qs, $expandedValue); 128 | $varName = substr($name, 0, \strlen(key($expandedValue))); 129 | $values[] = [$varName => current($expandedValue)]; 130 | } 131 | } 132 | 133 | return array_replace_recursive([], ...$values); 134 | } 135 | 136 | /** 137 | * Gets the file field values as PHP. 138 | * 139 | * This method converts fields with the array notation 140 | * (like foo[bar] to arrays) like PHP does. 141 | * The returned array is consistent with the array for field values 142 | * (@see getPhpValues), rather than uploaded files found in $_FILES. 143 | * For a compound file field foo[bar] it will create foo[bar][name], 144 | * instead of foo[name][bar] which would be found in $_FILES. 145 | */ 146 | public function getPhpFiles(): array 147 | { 148 | $values = []; 149 | foreach ($this->getFiles() as $name => $value) { 150 | $qs = http_build_query([$name => $value], '', '&'); 151 | if ($qs) { 152 | parse_str($qs, $expandedValue); 153 | $varName = substr($name, 0, \strlen(key($expandedValue))); 154 | 155 | array_walk_recursive( 156 | $expandedValue, 157 | function (&$value, $key) { 158 | if (ctype_digit($value) && ('size' === $key || 'error' === $key)) { 159 | $value = (int) $value; 160 | } 161 | } 162 | ); 163 | 164 | reset($expandedValue); 165 | 166 | $values[] = [$varName => current($expandedValue)]; 167 | } 168 | } 169 | 170 | return array_replace_recursive([], ...$values); 171 | } 172 | 173 | /** 174 | * Gets the URI of the form. 175 | * 176 | * The returned URI is not the same as the form "action" attribute. 177 | * This method merges the value if the method is GET to mimics 178 | * browser behavior. 
179 | */ 180 | public function getUri(): string 181 | { 182 | $uri = parent::getUri(); 183 | 184 | if (!\in_array($this->getMethod(), ['POST', 'PUT', 'DELETE', 'PATCH'])) { 185 | $currentParameters = []; 186 | if ($query = parse_url($uri, \PHP_URL_QUERY)) { 187 | parse_str($query, $currentParameters); 188 | } 189 | 190 | $queryString = http_build_query(array_merge($currentParameters, $this->getValues()), '', '&'); 191 | 192 | $pos = strpos($uri, '?'); 193 | $base = false === $pos ? $uri : substr($uri, 0, $pos); 194 | $uri = rtrim($base.'?'.$queryString, '?'); 195 | } 196 | 197 | return $uri; 198 | } 199 | 200 | protected function getRawUri(): string 201 | { 202 | // If the form was created from a button rather than the form node, check for HTML5 action overrides 203 | if ($this->button !== $this->node && $this->button->getAttribute('formaction')) { 204 | return $this->button->getAttribute('formaction'); 205 | } 206 | 207 | return $this->node->getAttribute('action'); 208 | } 209 | 210 | /** 211 | * Gets the form method. 212 | * 213 | * If no method is defined in the form, GET is returned. 214 | */ 215 | public function getMethod(): string 216 | { 217 | if (null !== $this->method) { 218 | return $this->method; 219 | } 220 | 221 | // If the form was created from a button rather than the form node, check for HTML5 method override 222 | if ($this->button !== $this->node && $this->button->getAttribute('formmethod')) { 223 | return strtoupper($this->button->getAttribute('formmethod')); 224 | } 225 | 226 | return $this->node->getAttribute('method') ? strtoupper($this->node->getAttribute('method')) : 'GET'; 227 | } 228 | 229 | /** 230 | * Gets the form name. 231 | * 232 | * If no name is defined on the form, an empty string is returned. 233 | */ 234 | public function getName(): string 235 | { 236 | return $this->node->getAttribute('name'); 237 | } 238 | 239 | /** 240 | * Returns true if the named field exists. 
     */
    public function has(string $name): bool
    {
        return $this->fields->has($name);
    }

    /**
     * Removes a field from the form.
     *
     * @param string $name The fully qualified field name
     */
    public function remove(string $name): void
    {
        $this->fields->remove($name);
    }

    /**
     * Gets a named field.
     *
     * @param string $name The fully qualified field name
     *
     * @return FormField|FormField[]|FormField[][]
     *
     * @throws \InvalidArgumentException When field is not present in this form
     */
    public function get(string $name): FormField|array
    {
        return $this->fields->get($name);
    }

    /**
     * Adds a field to the form by handing it to the field registry.
     */
    public function set(FormField $field): void
    {
        $this->fields->add($field);
    }

    /**
     * Gets all fields.
     *
     * @return FormField[]
     */
    public function all(): array
    {
        return $this->fields->all();
    }

    /**
     * Returns true if the named field exists (array-access form of has()).
     *
     * @param string $name The field name
     */
    public function offsetExists(mixed $name): bool
    {
        return $this->has($name);
    }

    /**
     * Gets a named field (array-access read); the field object is returned, not its value.
     *
     * @param string $name The field name
     *
     * @return FormField|FormField[]|FormField[][]
     *
     * @throws \InvalidArgumentException if the field does not exist
     */
    public function offsetGet(mixed $name): FormField|array
    {
        return $this->fields->get($name);
    }

    /**
     * Sets the value of a field (array-access write).
     *
     * @param string       $name  The field name
     * @param string|array $value The value of the field
     *
     * @throws \InvalidArgumentException if the field does not exist
     */
    public function offsetSet(mixed $name, mixed $value): void
    {
        $this->fields->set($name, $value);
    }

    /**
     * Removes a field from the form.
324 | * 325 | * @param string $name The field name 326 | */ 327 | public function offsetUnset(mixed $name): void 328 | { 329 | $this->fields->remove($name); 330 | } 331 | 332 | /** 333 | * Disables validation. 334 | * 335 | * @return $this 336 | */ 337 | public function disableValidation(): static 338 | { 339 | foreach ($this->fields->all() as $field) { 340 | if ($field instanceof ChoiceFormField) { 341 | $field->disableValidation(); 342 | } 343 | } 344 | 345 | return $this; 346 | } 347 | 348 | /** 349 | * Sets the node for the form. 350 | * 351 | * Expects a 'submit' button \DOMElement and finds the corresponding form element, or the form element itself. 352 | * 353 | * @throws \LogicException If given node is not a button or input or does not have a form ancestor 354 | */ 355 | protected function setNode(\DOMElement $node): void 356 | { 357 | $this->button = $node; 358 | if ('button' === $node->nodeName || ('input' === $node->nodeName && \in_array(strtolower($node->getAttribute('type')), ['submit', 'button', 'image']))) { 359 | if ($node->hasAttribute('form')) { 360 | // if the node has the HTML5-compliant 'form' attribute, use it 361 | $formId = $node->getAttribute('form'); 362 | $form = $node->ownerDocument->getElementById($formId); 363 | if (null === $form) { 364 | throw new \LogicException(\sprintf('The selected node has an invalid form attribute (%s).', $formId)); 365 | } 366 | $this->node = $form; 367 | 368 | return; 369 | } 370 | // we loop until we find a form ancestor 371 | do { 372 | if (null === $node = $node->parentNode) { 373 | throw new \LogicException('The selected node does not have a form ancestor.'); 374 | } 375 | } while ('form' !== $node->nodeName); 376 | } elseif ('form' !== $node->nodeName) { 377 | throw new \LogicException(\sprintf('Unable to submit on a "%s" tag.', $node->nodeName)); 378 | } 379 | 380 | $this->node = $node; 381 | } 382 | 383 | /** 384 | * Adds form elements related to this form. 
385 | * 386 | * Creates an internal copy of the submitted 'button' element and 387 | * the form node or the entire document depending on whether we need 388 | * to find non-descendant elements through HTML5 'form' attribute. 389 | */ 390 | private function initialize(): void 391 | { 392 | $this->fields = new FormFieldRegistry(); 393 | 394 | $xpath = new \DOMXPath($this->node->ownerDocument); 395 | 396 | // add submitted button if it has a valid name 397 | if ('form' !== $this->button->nodeName && $this->button->hasAttribute('name') && $this->button->getAttribute('name')) { 398 | if ('input' == $this->button->nodeName && 'image' == strtolower($this->button->getAttribute('type'))) { 399 | $name = $this->button->getAttribute('name'); 400 | $this->button->setAttribute('value', '0'); 401 | 402 | // temporarily change the name of the input node for the x coordinate 403 | $this->button->setAttribute('name', $name.'.x'); 404 | $this->set(new Field\InputFormField($this->button)); 405 | 406 | // temporarily change the name of the input node for the y coordinate 407 | $this->button->setAttribute('name', $name.'.y'); 408 | $this->set(new Field\InputFormField($this->button)); 409 | 410 | // restore the original name of the input node 411 | $this->button->setAttribute('name', $name); 412 | } else { 413 | $this->set(new Field\InputFormField($this->button)); 414 | } 415 | } 416 | 417 | // find form elements corresponding to the current form 418 | if ($this->node->hasAttribute('id')) { 419 | // corresponding elements are either descendants or have a matching HTML5 form attribute 420 | $formId = Crawler::xpathLiteral($this->node->getAttribute('id')); 421 | 422 | $fieldNodes = $xpath->query(\sprintf('( descendant::input[@form=%s] | descendant::button[@form=%1$s] | descendant::textarea[@form=%1$s] | descendant::select[@form=%1$s] | //form[@id=%1$s]//input[not(@form)] | //form[@id=%1$s]//button[not(@form)] | //form[@id=%1$s]//textarea[not(@form)] | //form[@id=%1$s]//select[not(@form)] 
)[( not(ancestor::template) or ancestor::turbo-stream )]', $formId)); 423 | } else { 424 | // do the xpath query with $this->node as the context node, to only find descendant elements 425 | // however, descendant elements with form attribute are not part of this form 426 | $fieldNodes = $xpath->query('( descendant::input[not(@form)] | descendant::button[not(@form)] | descendant::textarea[not(@form)] | descendant::select[not(@form)] )[( not(ancestor::template) or ancestor::turbo-stream )]', $this->node); 427 | } 428 | 429 | foreach ($fieldNodes as $node) { 430 | $this->addField($node); 431 | } 432 | 433 | if ($this->baseHref && '' !== $this->node->getAttribute('action')) { 434 | $this->currentUri = $this->baseHref; 435 | } 436 | } 437 | 438 | private function addField(\DOMElement $node): void 439 | { 440 | if (!$node->hasAttribute('name') || !$node->getAttribute('name')) { 441 | return; 442 | } 443 | 444 | $nodeName = $node->nodeName; 445 | if ('select' == $nodeName || 'input' == $nodeName && 'checkbox' == strtolower($node->getAttribute('type'))) { 446 | $this->set(new ChoiceFormField($node)); 447 | } elseif ('input' == $nodeName && 'radio' == strtolower($node->getAttribute('type'))) { 448 | // there may be other fields with the same name that are no choice 449 | // fields already registered (see https://github.com/symfony/symfony/issues/11689) 450 | if ($this->has($node->getAttribute('name')) && $this->get($node->getAttribute('name')) instanceof ChoiceFormField) { 451 | $this->get($node->getAttribute('name'))->addChoice($node); 452 | } else { 453 | $this->set(new ChoiceFormField($node)); 454 | } 455 | } elseif ('input' == $nodeName && 'file' == strtolower($node->getAttribute('type'))) { 456 | $this->set(new Field\FileFormField($node)); 457 | } elseif ('input' == $nodeName && !\in_array(strtolower($node->getAttribute('type')), ['submit', 'button', 'image'])) { 458 | $this->set(new Field\InputFormField($node)); 459 | } elseif ('textarea' == $nodeName) { 460 | 
$this->set(new Field\TextareaFormField($node)); 461 | } 462 | } 463 | } 464 | -------------------------------------------------------------------------------- /FormFieldRegistry.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler; 13 | 14 | use Symfony\Component\DomCrawler\Field\FormField; 15 | 16 | /** 17 | * This is an internal class that must not be used directly. 18 | * 19 | * @internal 20 | */ 21 | class FormFieldRegistry 22 | { 23 | private array $fields = []; 24 | private string $base = ''; 25 | 26 | /** 27 | * Adds a field to the registry. 28 | */ 29 | public function add(FormField $field): void 30 | { 31 | $segments = $this->getSegments($field->getName()); 32 | 33 | $target = &$this->fields; 34 | while ($segments) { 35 | if (!\is_array($target)) { 36 | $target = []; 37 | } 38 | $path = array_shift($segments); 39 | if ('' === $path) { 40 | $target = &$target[]; 41 | } else { 42 | $target = &$target[$path]; 43 | } 44 | } 45 | $target = $field; 46 | } 47 | 48 | /** 49 | * Removes a field based on the fully qualified name and its children from the registry. 50 | */ 51 | public function remove(string $name): void 52 | { 53 | $segments = $this->getSegments($name); 54 | $target = &$this->fields; 55 | while (\count($segments) > 1) { 56 | $path = array_shift($segments); 57 | if (!\is_array($target) || !\array_key_exists($path, $target)) { 58 | return; 59 | } 60 | $target = &$target[$path]; 61 | } 62 | unset($target[array_shift($segments)]); 63 | } 64 | 65 | /** 66 | * Returns the value of the field based on the fully qualified name and its children. 
67 | * 68 | * @return FormField|FormField[]|FormField[][] 69 | * 70 | * @throws \InvalidArgumentException if the field does not exist 71 | */ 72 | public function &get(string $name): FormField|array 73 | { 74 | $segments = $this->getSegments($name); 75 | $target = &$this->fields; 76 | while ($segments) { 77 | $path = array_shift($segments); 78 | if (!\is_array($target) || !\array_key_exists($path, $target)) { 79 | throw new \InvalidArgumentException(\sprintf('Unreachable field "%s".', $path)); 80 | } 81 | $target = &$target[$path]; 82 | } 83 | 84 | return $target; 85 | } 86 | 87 | /** 88 | * Tests whether the form has the given field based on the fully qualified name. 89 | */ 90 | public function has(string $name): bool 91 | { 92 | try { 93 | $this->get($name); 94 | 95 | return true; 96 | } catch (\InvalidArgumentException) { 97 | return false; 98 | } 99 | } 100 | 101 | /** 102 | * Set the value of a field based on the fully qualified name and its children. 103 | * 104 | * @throws \InvalidArgumentException if the field does not exist 105 | */ 106 | public function set(string $name, mixed $value): void 107 | { 108 | $target = &$this->get($name); 109 | if ((!\is_array($value) && $target instanceof FormField) || $target instanceof Field\ChoiceFormField) { 110 | $target->setValue($value); 111 | } elseif (\is_array($value)) { 112 | $registry = new static(); 113 | $registry->base = $name; 114 | $registry->fields = $value; 115 | foreach ($registry->all() as $k => $v) { 116 | $this->set($k, $v); 117 | } 118 | } else { 119 | throw new \InvalidArgumentException(\sprintf('Cannot set value on a compound field "%s".', $name)); 120 | } 121 | } 122 | 123 | /** 124 | * Returns the list of field with their value. 
125 | * 126 | * @return FormField[] The list of fields as [string] Fully qualified name => (mixed) value) 127 | */ 128 | public function all(): array 129 | { 130 | return $this->walk($this->fields, $this->base); 131 | } 132 | 133 | /** 134 | * Transforms a PHP array in a list of fully qualified name / value. 135 | */ 136 | private function walk(array $array, ?string $base = '', array &$output = []): array 137 | { 138 | foreach ($array as $k => $v) { 139 | $path = $base ? \sprintf('%s[%s]', $base, $k) : $k; 140 | if (\is_array($v)) { 141 | $this->walk($v, $path, $output); 142 | } else { 143 | $output[$path] = $v; 144 | } 145 | } 146 | 147 | return $output; 148 | } 149 | 150 | /** 151 | * Splits a field name into segments as a web browser would do. 152 | * 153 | * getSegments('base[foo][3][]') = ['base', 'foo, '3', '']; 154 | * 155 | * @return string[] 156 | */ 157 | private function getSegments(string $name): array 158 | { 159 | if (preg_match('/^(?P[^[]+)(?P(\[.*)|$)/', $name, $m)) { 160 | $segments = [$m['base']]; 161 | while (!empty($m['extra'])) { 162 | $extra = $m['extra']; 163 | if (preg_match('/^\[(?P.*?)\](?P.*)$/', $extra, $m)) { 164 | $segments[] = $m['segment']; 165 | } else { 166 | $segments[] = $extra; 167 | } 168 | } 169 | 170 | return $segments; 171 | } 172 | 173 | return [$name]; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /Image.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler; 13 | 14 | /** 15 | * Image represents an HTML image (an HTML img tag). 
16 | */ 17 | class Image extends AbstractUriElement 18 | { 19 | public function __construct(\DOMElement $node, ?string $currentUri = null) 20 | { 21 | parent::__construct($node, $currentUri, 'GET'); 22 | } 23 | 24 | protected function getRawUri(): string 25 | { 26 | return $this->node->getAttribute('src'); 27 | } 28 | 29 | protected function setNode(\DOMElement $node): void 30 | { 31 | if ('img' !== $node->nodeName) { 32 | throw new \LogicException(\sprintf('Unable to visualize a "%s" tag.', $node->nodeName)); 33 | } 34 | 35 | $this->node = $node; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2004-present Fabien Potencier 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\DomCrawler;

/**
 * Link represents an HTML link (an HTML a, area or link tag).
 *
 * @author Fabien Potencier
 */
class Link extends AbstractUriElement
{
    /**
     * Returns the raw "href" attribute of the link.
     */
    protected function getRawUri(): string
    {
        return $this->node->getAttribute('href');
    }

    /**
     * Sets the node, which must be an a, area or link element.
     *
     * @throws \LogicException When the node is none of these tags
     */
    protected function setNode(\DOMElement $node): void
    {
        $tag = $node->nodeName;
        if (!\in_array($tag, ['a', 'area', 'link'], true)) {
            throw new \LogicException(\sprintf('Unable to navigate from a "%s" tag.', $tag));
        }

        $this->node = $node;
    }
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
DomCrawler Component
====================

The DomCrawler component eases DOM navigation for HTML and XML documents.

Resources
---------

 * [Documentation](https://symfony.com/doc/current/components/dom_crawler.html)
 * [Contributing](https://symfony.com/doc/current/contributing/index.html)
 * [Report issues](https://github.com/symfony/symfony/issues) and
   [send Pull Requests](https://github.com/symfony/symfony/pulls)
   in the [main Symfony repository](https://github.com/symfony/symfony)

--------------------------------------------------------------------------------
/Test/Constraint/CrawlerAnySelectorTextContains.php:
--------------------------------------------------------------------------------
<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\DomCrawler\Test\Constraint;

use PHPUnit\Framework\Constraint\Constraint;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Passes when the text of at least one node matching the selector contains the expected text.
 */
final class CrawlerAnySelectorTextContains extends Constraint
{
    // Whether the last matches() call found at least one node for the selector.
    private bool $hasNode = false;

    public function __construct(
        private string $selector,
        private string $expectedText,
    ) {
    }

    public function toString(): string
    {
        if (!$this->hasNode) {
            return \sprintf('the Crawler has a node matching selector "%s"', $this->selector);
        }

        return \sprintf('the text of any node matching selector "%s" contains "%s"', $this->selector, $this->expectedText);
    }

    /**
     * @throws \InvalidArgumentException When $other is not a Crawler
     */
    protected function matches($other): bool
    {
        if (!$other instanceof Crawler) {
            throw new \InvalidArgumentException(\sprintf('"%s" constraint expected an argument of type "%s", got "%s".', self::class, Crawler::class, get_debug_type($other)));
        }

        $matched = $other->filter($this->selector);
        $this->hasNode = 0 !== \count($matched);
        if (!$this->hasNode) {
            return false;
        }

        $texts = $matched->each(fn (Crawler $node) => $node->text(null, true));
        foreach ($texts as $text) {
            if (str_contains($text, $this->expectedText)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @throws \InvalidArgumentException When $other is not a Crawler
     */
    protected function failureDescription($other): string
    {
        if (!$other instanceof Crawler) {
            throw new \InvalidArgumentException(\sprintf('"%s" constraint expected an argument of type "%s", got "%s".', self::class, Crawler::class, get_debug_type($other)));
        }

        return $this->toString();
    }
}

--------------------------------------------------------------------------------
/Test/Constraint/CrawlerAnySelectorTextSame.php:
--------------------------------------------------------------------------------
<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\DomCrawler\Test\Constraint;

use PHPUnit\Framework\Constraint\Constraint;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Passes when the trimmed text of at least one node matching the selector equals the expected text.
 */
final class CrawlerAnySelectorTextSame extends Constraint
{
    public function __construct(
        private string $selector,
        private string $expectedText,
    ) {
    }

    public function toString(): string
    {
        return \sprintf('has at least a node matching selector "%s" with content "%s"', $this->selector, $this->expectedText);
    }

    /**
     * @throws \InvalidArgumentException When $other is not a Crawler
     */
    protected function matches($other): bool
    {
        if (!$other instanceof Crawler) {
            throw new \InvalidArgumentException(\sprintf('"%s" constraint expected an argument of type "%s", got "%s".', self::class, Crawler::class, get_debug_type($other)));
        }

        $matched = $other->filter($this->selector);
        if (0 === \count($matched)) {
            return false;
        }

        $texts = $matched->each(fn (Crawler $node) => trim($node->text(null, true)));

        return \in_array($this->expectedText, $texts, true);
    }

protected function failureDescription($other): string 47 | { 48 | if (!$other instanceof Crawler) { 49 | throw new \InvalidArgumentException(\sprintf('"%s" constraint expected an argument of type "%s", got "%s".', self::class, Crawler::class, get_debug_type($other))); 50 | } 51 | 52 | return 'the Crawler '.$this->toString(); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Test/Constraint/CrawlerSelectorAttributeValueSame.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Test\Constraint; 13 | 14 | use PHPUnit\Framework\Constraint\Constraint; 15 | use Symfony\Component\DomCrawler\Crawler; 16 | 17 | final class CrawlerSelectorAttributeValueSame extends Constraint 18 | { 19 | public function __construct( 20 | private string $selector, 21 | private string $attribute, 22 | private string $expectedText, 23 | ) { 24 | } 25 | 26 | public function toString(): string 27 | { 28 | return \sprintf('has a node matching selector "%s" with attribute "%s" of value "%s"', $this->selector, $this->attribute, $this->expectedText); 29 | } 30 | 31 | /** 32 | * @param Crawler $crawler 33 | */ 34 | protected function matches($crawler): bool 35 | { 36 | $crawler = $crawler->filter($this->selector); 37 | if (!\count($crawler)) { 38 | return false; 39 | } 40 | 41 | return $this->expectedText === trim($crawler->attr($this->attribute) ?? 
''); 42 | } 43 | 44 | /** 45 | * @param Crawler $crawler 46 | */ 47 | protected function failureDescription($crawler): string 48 | { 49 | return 'the Crawler '.$this->toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Test/Constraint/CrawlerSelectorCount.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Test\Constraint; 13 | 14 | use PHPUnit\Framework\Constraint\Constraint; 15 | use Symfony\Component\DomCrawler\Crawler; 16 | 17 | final class CrawlerSelectorCount extends Constraint 18 | { 19 | public function __construct( 20 | private readonly int $count, 21 | private readonly string $selector, 22 | ) { 23 | } 24 | 25 | public function toString(): string 26 | { 27 | return \sprintf('selector "%s" count is "%d"', $this->selector, $this->count); 28 | } 29 | 30 | /** 31 | * @param Crawler $crawler 32 | */ 33 | protected function matches($crawler): bool 34 | { 35 | return $this->count === \count($crawler->filter($this->selector)); 36 | } 37 | 38 | /** 39 | * @param Crawler $crawler 40 | */ 41 | protected function failureDescription($crawler): string 42 | { 43 | return \sprintf('the Crawler selector "%s" was expected to be found %d time(s) but was found %d time(s)', $this->selector, $this->count, \count($crawler->filter($this->selector))); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /Test/Constraint/CrawlerSelectorExists.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Test\Constraint; 13 | 14 | use PHPUnit\Framework\Constraint\Constraint; 15 | use Symfony\Component\DomCrawler\Crawler; 16 | 17 | final class CrawlerSelectorExists extends Constraint 18 | { 19 | public function __construct( 20 | private string $selector, 21 | ) { 22 | } 23 | 24 | public function toString(): string 25 | { 26 | return \sprintf('matches selector "%s"', $this->selector); 27 | } 28 | 29 | /** 30 | * @param Crawler $crawler 31 | */ 32 | protected function matches($crawler): bool 33 | { 34 | return 0 < \count($crawler->filter($this->selector)); 35 | } 36 | 37 | /** 38 | * @param Crawler $crawler 39 | */ 40 | protected function failureDescription($crawler): string 41 | { 42 | return 'the Crawler '.$this->toString(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Test/Constraint/CrawlerSelectorTextContains.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
10 | */ 11 | 12 | namespace Symfony\Component\DomCrawler\Test\Constraint; 13 | 14 | use PHPUnit\Framework\Constraint\Constraint; 15 | use Symfony\Component\DomCrawler\Crawler; 16 | 17 | final class CrawlerSelectorTextContains extends Constraint 18 | { 19 | private bool $hasNode = false; 20 | private string $nodeText; 21 | 22 | public function __construct( 23 | private string $selector, 24 | private string $expectedText, 25 | ) { 26 | } 27 | 28 | public function toString(): string 29 | { 30 | if ($this->hasNode) { 31 | return \sprintf('the text "%s" of the node matching selector "%s" contains "%s"', $this->nodeText, $this->selector, $this->expectedText); 32 | } 33 | 34 | return \sprintf('the Crawler has a node matching selector "%s"', $this->selector); 35 | } 36 | 37 | /** 38 | * @param Crawler $crawler 39 | */ 40 | protected function matches($crawler): bool 41 | { 42 | $crawler = $crawler->filter($this->selector); 43 | if (!\count($crawler)) { 44 | $this->hasNode = false; 45 | 46 | return false; 47 | } 48 | 49 | $this->hasNode = true; 50 | $this->nodeText = $crawler->text(null, true); 51 | 52 | return str_contains($this->nodeText, $this->expectedText); 53 | } 54 | 55 | /** 56 | * @param Crawler $crawler 57 | */ 58 | protected function failureDescription($crawler): string 59 | { 60 | return $this->toString(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Test/Constraint/CrawlerSelectorTextSame.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 
namespace Symfony\Component\DomCrawler\Test\Constraint;

use PHPUnit\Framework\Constraint\Constraint;
use Symfony\Component\DomCrawler\Crawler;

/**
 * PHPUnit constraint that is satisfied when the trimmed text read from the
 * selection obtained with a selector is identical to an expected string.
 */
final class CrawlerSelectorTextSame extends Constraint
{
    /**
     * @param string $selector     Selector expression passed to the crawler's filter() method
     * @param string $expectedText Exact text the selection is expected to have
     */
    public function __construct(
        private string $selector,
        private string $expectedText,
    ) {
    }

    /**
     * Returns the textual description of the constraint.
     */
    public function toString(): string
    {
        return \sprintf('has a node matching selector "%s" with content "%s"', $this->selector, $this->expectedText);
    }

    /**
     * Fails when nothing is selected; otherwise compares the trimmed text
     * strictly (===) with the expected text.
     *
     * @param Crawler $crawler
     */
    protected function matches($crawler): bool
    {
        $crawler = $crawler->filter($this->selector);
        if (!\count($crawler)) {
            return false;
        }

        return $this->expectedText === trim($crawler->text(null, true));
    }

    /**
     * Builds the failure message: the constraint description prefixed with "the Crawler ".
     *
     * @param Crawler $crawler
     */
    protected function failureDescription($crawler): string
    {
        return 'the Crawler '.$this->toString();
    }
}

namespace Symfony\Component\DomCrawler;

/**
 * The UriResolver class takes an URI (relative, absolute, fragment, etc.)
 * and turns it into an absolute URI against another given base URI.
 *
 * @author Fabien Potencier
 * @author Grégoire Pineau
 */
class UriResolver
{
    /**
     * Resolves a URI according to a base URI.
     *
     * For example if $uri=/foo/bar and $baseUri=https://symfony.com it will
     * return https://symfony.com/foo/bar
     *
     * If the $uri is not absolute you must pass an absolute $baseUri
     *
     * @param string      $uri     The URI to resolve (surrounding whitespace is trimmed)
     * @param string|null $baseUri The URI to resolve against; may only be null when $uri is absolute
     *
     * @return string The resolved URI; an absolute $uri is returned trimmed but otherwise untouched
     *
     * @throws \InvalidArgumentException when $uri has no scheme and $baseUri is null
     */
    public static function resolve(string $uri, ?string $baseUri): string
    {
        $uri = trim($uri);

        // absolute URL?
        // A "#" is appended when the URI has neither a query nor a fragment
        // before parse_url() is asked for the scheme.
        // NOTE(review): presumably this helps parse_url() recognise scheme-only
        // forms - confirm against the parse_url() documentation.
        if (null !== parse_url(\strlen($uri) !== strcspn($uri, '?#') ? $uri : $uri.'#', \PHP_URL_SCHEME)) {
            return $uri;
        }

        if (null === $baseUri) {
            throw new \InvalidArgumentException('The URI is relative, so you must define its base URI passing an absolute URL.');
        }

        // empty URI
        // Compared strictly: a loose truthiness check would also treat the
        // valid relative reference "0" as empty and return the base URI.
        if ('' === $uri) {
            return $baseUri;
        }

        // an anchor
        if ('#' === $uri[0]) {
            return self::cleanupAnchor($baseUri).$uri;
        }

        $baseUriCleaned = self::cleanupUri($baseUri);

        if ('?' === $uri[0]) {
            return $baseUriCleaned.$uri;
        }

        // absolute URL with relative schema
        if (str_starts_with($uri, '//')) {
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUriCleaned).$uri;
        }

        // keep only "scheme://authority" of the base URI
        $baseUriCleaned = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUriCleaned);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUriCleaned.$uri;
        }

        // relative path
        $path = parse_url(substr($baseUri, \strlen($baseUriCleaned)), \PHP_URL_PATH) ?? '';

        // Directory part of the base path: everything before its last "/".
        // A base path without any "/" has an empty directory part.
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);

        $path = self::canonicalizePath($directory.'/'.$uri);

        return $baseUriCleaned.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * "." segments are dropped and each ".." segment removes the segment
     * collected before it. A path whose last segment is "." or ".." keeps a
     * trailing slash.
     */
    private static function canonicalizePath(string $path): string
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        $segments = explode('/', $path);

        // Only a final "." or ".." segment denotes a directory and needs a
        // trailing slash; names that merely end with a dot ("foo.") do not.
        $lastSegment = $segments[array_key_last($segments)];
        if ('.' === $lastSegment || '..' === $lastSegment) {
            $segments[] = '';
        }

        $output = [];

        foreach ($segments as $segment) {
            if ('..' === $segment) {
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Removes the query string and the anchor from the given uri.
     */
    private static function cleanupUri(string $uri): string
    {
        return self::cleanupQuery(self::cleanupAnchor($uri));
    }

    /**
     * Removes the query string from the uri.
     *
     * Everything from the first "?" on is cut off.
     */
    private static function cleanupQuery(string $uri): string
    {
        if (false !== $pos = strpos($uri, '?')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }

    /**
     * Removes the anchor from the uri.
     *
     * Everything from the first "#" on is cut off.
     */
    private static function cleanupAnchor(string $uri): string
    {
        if (false !== $pos = strpos($uri, '#')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }
}
--------------------------------------------------------------------------------