├── .gitignore ├── README.md ├── composer.json ├── phpunit.xml └── src ├── Exception ├── NodeNotFoundException.php └── XPathException.php ├── Node.php ├── NodeInterface.php ├── NodeList.php ├── NodeListInterface.php ├── Selector.php └── Tests ├── NodeListTest.php ├── NodeTest.php ├── Resources ├── test.html └── test.xml ├── SelectorTest.php ├── TestCase.php └── bootstrap.php /.gitignore: -------------------------------------------------------------------------------- 1 | ############ 2 | ## Windows 3 | ############ 4 | 5 | # Windows image file caches 6 | Thumbs.db 7 | 8 | # Folder config file 9 | Desktop.ini 10 | 11 | vendor/ 12 | composer.lock 13 | phpunit.phar 14 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #XPathSelector 2 | ##Description 3 | XPathSelector is a library created for HTML web scraping. It was inspired by Python's Scrapy. 4 | It uses the PHP DOM extension, so make sure you have it installed. PHP 5.4 is the minimum supported version. 5 | 6 | ##Installation 7 | The recommended way to install XPathSelector is through [Composer](http://getcomposer.org/). Run the following command: 8 | ```bash 9 | composer require stil/xpath-selector 10 | ``` 11 | 12 | ###Introduction 13 | The starting point of all searches is the `XPathSelector\Selector` class. It lets you load HTML or XML so that you can then query it. There are several ways to do this: 14 | ```php 15 | use XPathSelector\Selector; 16 | $xs = Selector::load($pathToXml); 17 | $xs = Selector::loadHTMLFile($pathToHtml); 18 | $xs = Selector::loadXML($xmlString); 19 | $xs = Selector::loadHTML($htmlString); 20 | ``` 21 | 22 | The next step is to decide whether you are searching for a single DOM element or for multiple elements. 23 | For a single result, use the `find($query)` method. 
24 | ```php 25 | use XPathSelector\Exception\NodeNotFoundException; 26 | 27 | try { 28 | $element = $xs->find('//head'); // returns first element found 29 | echo $element->innerHTML(); // print innerHTML of the head element 30 | } catch (NodeNotFoundException $e) { 31 | echo $e->getMessage(); // nothing was found 32 | } 33 | ``` 34 | And if you need multiple results, use `findAll($query)` instead. This method returns an instance of `XPathSelector\NodeListInterface`. Check it out in the API. 35 | ```php 36 | use XPathSelector\Selector; 37 | 38 | $urls = $xs->findAll('//a/@href'); 39 | foreach ($urls as $url) { 40 | echo $url; 41 | } 42 | ``` 43 | 44 | Do you need to check whether an XPath path exists? Use the `findOneOrNull($query)` method. It returns a `Node` object, or null when no results were found. It works just like `find($query)`, except that it returns null instead of throwing an exception. 45 | ```php 46 | use XPathSelector\Selector; 47 | 48 | $doesExist = $xs->findOneOrNull('//a/@href') !== null; 49 | ``` 50 | 51 | ###sample.xml 52 | ```xml 53 | 54 | 55 | 56 | Everyday Italian 57 | Giada De Laurentiis 58 | 2005 59 | 30.00 60 | 61 | 62 | Harry Potter 63 | J K. Rowling 64 | 2005 65 | 29.99 66 | 67 | 68 | XQuery Kick Start 69 | James McGovern 70 | Per Bothner 71 | Kurt Cagle 72 | James Linn 73 | Vaidyanathan Nagarajan 74 | 2003 75 | 49.99 76 | 77 | 78 | Learning XML 79 | Erik T. 
Ray 80 | 2003 81 | 39.95 82 | 83 | 84 | ``` 85 | ###Search for single result 86 | ```php 87 | find('/bookstore/book[1]/title'); 92 | ``` 93 | Result: 94 | ``` 95 | Everyday Italian 96 | ``` 97 | ###Search for multiple results 98 | ```php 99 | findAll('/bookstore/book') as $book) { 104 | printf( 105 | "[Title: %s][Price: %s]\n", 106 | $book->find('title')->extract(), 107 | $book->find('price')->extract() 108 | ); 109 | } 110 | ``` 111 | Result: 112 | ``` 113 | [Title: Everyday Italian][Price: 30.00] 114 | [Title: Harry Potter][Price: 29.99] 115 | [Title: XQuery Kick Start][Price: 49.99] 116 | [Title: Learning XML][Price: 39.95] 117 | ``` 118 | ###Map result set to array 119 | ```php 120 | findAll('/bookstore/book')->map(function ($node, $index) { 125 | return [ 126 | 'index' => $index, 127 | 'title' => $node->find('title')->extract(), 128 | 'price' => (float)$node->find('price')->extract() 129 | ]; 130 | }); 131 | 132 | var_dump($array); 133 | ``` 134 | Result: 135 | ``` 136 | array(4) { 137 | [0] => 138 | array(3) { 139 | 'index' => 140 | int(0) 141 | 'title' => 142 | string(16) "Everyday Italian" 143 | 'price' => 144 | double(30) 145 | } 146 | [1] => 147 | array(3) { 148 | 'index' => 149 | int(1) 150 | 'title' => 151 | string(12) "Harry Potter" 152 | 'price' => 153 | double(29.99) 154 | } 155 | [2] => 156 | array(3) { 157 | 'index' => 158 | int(2) 159 | 'title' => 160 | string(17) "XQuery Kick Start" 161 | 'price' => 162 | double(49.99) 163 | } 164 | [3] => 165 | array(3) { 166 | 'index' => 167 | int(3) 168 | 'title' => 169 | string(12) "Learning XML" 170 | 'price' => 171 | double(39.95) 172 | } 173 | } 174 | ``` 175 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "stil/xpath-selector", 3 | "description": "A library which makes it easy to web scrape HTML or XML pages. 
Uses XPath queries.", 4 | "license": "MIT", 5 | "require": { 6 | "php": ">=5.4", 7 | "ext-dom": "*" 8 | }, 9 | "autoload": { 10 | "psr-4": { 11 | "XPathSelector\\": "src/" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ./src/Tests/ 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/Exception/NodeNotFoundException.php: -------------------------------------------------------------------------------- 1 | node = $node; 20 | 21 | if ($node instanceof DOMDocument) { 22 | $this->xpath = new DOMXPath($node); 23 | } else { 24 | $this->xpath = $xpath; 25 | } 26 | } 27 | 28 | public function getDOMNode() 29 | { 30 | return $this->node; 31 | } 32 | 33 | public function getDOMXPath() 34 | { 35 | return $this->xpath; 36 | } 37 | 38 | protected function internalQuery($query) 39 | { 40 | $nodeList = @$this->xpath->query($query, $this->node); 41 | if ($nodeList == false) { 42 | throw new XPathException("Invalid expression $query"); 43 | } else { 44 | return $nodeList; 45 | } 46 | } 47 | 48 | public function find($query) 49 | { 50 | $nodeList = $this->internalQuery($query); 51 | if ($nodeList->length == 0) { 52 | throw new NodeNotFoundException("Query $query returned no results"); 53 | } 54 | return new Node($nodeList->item(0), $this->xpath); 55 | } 56 | 57 | public function findOneOrNull($query) 58 | { 59 | try { 60 | return $this->find($query); 61 | } catch (NodeNotFoundException $e) { 62 | return null; 63 | } 64 | } 65 | 66 | public function findAll($query) 67 | { 68 | return new NodeList( 69 | $query, 70 | $this->xpath, 71 | $this->internalQuery($query) 72 | ); 73 | } 74 | 75 | public function extract() 76 | { 77 | return $this->node->nodeValue; 78 | } 79 | 80 | public function __toString() 81 | { 82 | return $this->extract(); 83 | } 84 | 85 | public 
function innerHTML() 86 | { 87 | if ($this->node instanceof DOMDocument) { 88 | $doc = $this->node; 89 | } else { 90 | $doc = $this->node->ownerDocument; 91 | } 92 | 93 | $innerHTML = ''; 94 | foreach ($this->node->childNodes as $child) { 95 | $innerHTML .= $doc->saveHTML($child); 96 | } 97 | 98 | return $innerHTML; 99 | } 100 | 101 | public function outerHTML() 102 | { 103 | if ($this->node instanceof DOMDocument) { 104 | return $this->node->saveHTML(); 105 | } else { 106 | return $this->node->ownerDocument->saveHTML($this->node); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/NodeInterface.php: -------------------------------------------------------------------------------- 1 | xpath = $xpath; 13 | foreach ($domNodeList as $domNode) { 14 | $this->childNodes[] = new Node($domNode, $xpath); 15 | } 16 | $this->length = count($this->childNodes); 17 | } 18 | 19 | public function count() 20 | { 21 | return $this->length; 22 | } 23 | 24 | public function item($index) 25 | { 26 | if (isset($this->childNodes[$index])) { 27 | return $this->childNodes[$index]; 28 | } else { 29 | throw new \OutOfBoundsException("Node with index $index does not exist in query results."); 30 | } 31 | } 32 | 33 | public function getIterator() 34 | { 35 | return new \ArrayIterator($this->childNodes); 36 | } 37 | 38 | public function each(callable $callback) 39 | { 40 | foreach ($this->childNodes as $index => $node) { 41 | $callback($node, $index); 42 | } 43 | } 44 | 45 | public function map(callable $callback) 46 | { 47 | $result = []; 48 | foreach ($this->childNodes as $index => $node) { 49 | $result[] = $callback($node, $index); 50 | } 51 | return $result; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/NodeListInterface.php: -------------------------------------------------------------------------------- 1 | load($path); 12 | return new self($dom); 13 | } 14 | 15 | 
public static function loadXML($xml) 16 | { 17 | $dom = new DOMDocument(); 18 | @$dom->loadXML($xml); 19 | return new self($dom); 20 | } 21 | 22 | public static function loadHTMLFile($html) 23 | { 24 | $dom = new DOMDocument(); 25 | @$dom->loadHTMLFile($html); 26 | return new self($dom); 27 | } 28 | 29 | public static function loadHTML($path) 30 | { 31 | $dom = new DOMDocument(); 32 | @$dom->loadHTML($path); 33 | return new self($dom); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/Tests/NodeListTest.php: -------------------------------------------------------------------------------- 1 | xs = Selector::loadHTMLFile(__DIR__.'/Resources/test.html'); 18 | } 19 | 20 | public function testCount() 21 | { 22 | $divs = $this->xs->findAll('//div'); 23 | $this->assertCount(160, $divs); 24 | $this->assertEquals(160, $divs->count()); 25 | } 26 | 27 | public function testItem() 28 | { 29 | $divs = $this->xs->findAll('//div'); 30 | 31 | $this->assertInstanceOf("XPathSelector\\Node", $divs->item(150)); 32 | 33 | $ex = false; 34 | try { 35 | $divs->item(99999); 36 | } catch (\OutOfBoundsException $e) { 37 | $ex = true; 38 | } 39 | $this->assertTrue($ex); 40 | } 41 | 42 | public function testMap() 43 | { 44 | $langs = $this->xs->findAll('//select[@id="changelang-langs"]/option'); 45 | $str = implode(', ', $langs->map(function (Node $node) { 46 | return $node->extract(); 47 | })); 48 | 49 | $this->assertEquals( 50 | 'English, Brazilian Portuguese, Chinese (Simplified), French, German, '. 51 | 'Italian, Japanese, Romanian, Russian, Spanish, Turkish, Other', 52 | $str 53 | ); 54 | } 55 | 56 | public function testEach() 57 | { 58 | $langs = $this->xs->findAll('//select[@id="changelang-langs"]/option'); 59 | 60 | $str = []; 61 | $langs->each(function (Node $node) use (&$str) { 62 | $str[] = $node->extract(); 63 | }); 64 | 65 | $this->assertEquals( 66 | 'English, Brazilian Portuguese, Chinese (Simplified), French, German, '. 
67 | 'Italian, Japanese, Romanian, Russian, Spanish, Turkish, Other', 68 | implode(', ', $str) 69 | ); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/Tests/NodeTest.php: -------------------------------------------------------------------------------- 1 | htmlSelector = Selector::loadHTMLFile(__DIR__.'/Resources/test.html'); 22 | } 23 | 24 | public function testGetDOMNode() 25 | { 26 | $xs = $this->htmlSelector; 27 | $this->assertInstanceOf('DOMNode', $xs->getDOMNode()); 28 | } 29 | 30 | public function testGetDOMXPath() 31 | { 32 | $xs = $this->htmlSelector; 33 | $this->assertInstanceOf('DOMXPath', $xs->getDOMXPath()); 34 | } 35 | 36 | public function testInnerHtml() 37 | { 38 | $xmlPath = __DIR__.'/Resources/test.xml'; 39 | $xs = Selector::load($xmlPath); 40 | $this->assertEquals($xs->innerHTML(), ' 41 | 42 | Everyday Italian 43 | Giada De Laurentiis 44 | 2005 45 | 30.00 46 | 47 | 48 | Harry Potter 49 | J K. Rowling 50 | 2005 51 | 29.99 52 | 53 | 54 | XQuery Kick Start 55 | James McGovern 56 | Per Bothner 57 | Kurt Cagle 58 | James Linn 59 | Vaidyanathan Nagarajan 60 | 2003 61 | 49.99 62 | 63 | 64 | Learning XML 65 | Erik T. 
Ray 66 | 2003 67 | 39.95 68 | 69 | '); 70 | } 71 | 72 | public function testFind() 73 | { 74 | $xs = $this->htmlSelector; 75 | 76 | $exception = false; 77 | try { 78 | $xs->find('//titl')->extract(); 79 | } catch (Exception\NodeNotFoundException $e) { 80 | $exception = true; 81 | } 82 | $this->assertTrue($exception); 83 | 84 | $exception = false; 85 | try { 86 | $xs->find('//////')->extract(); 87 | } catch (Exception\XPathException $e) { 88 | $exception = true; 89 | } 90 | $this->assertTrue($exception); 91 | 92 | $this->assertEquals( 93 | 'PHP: DOMNode - Manual ', 94 | $xs->find('//title')->extract() 95 | ); 96 | 97 | $this->assertEquals( 98 | 'PHP: DOMNode - Manual ', 99 | (string)$xs->find('//title') 100 | ); 101 | 102 | $this->assertEquals( 103 | 'php', 104 | $xs->find('//a[@class="brand"][1]')->innerHTML() 105 | ); 106 | 107 | $this->assertEquals( 108 | 'php', 109 | $xs->find('//a[@class="brand"][1]')->outerHTML() 110 | ); 111 | } 112 | 113 | public function testFindOneOrNull() 114 | { 115 | $xs = $this->htmlSelector; 116 | $this->assertNull($xs->findOneOrNull('//title[15]')); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/Tests/Resources/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | PHP: DOMNode - Manual 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 44 | 45 | 50 | 51 | 54 | 55 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 82 | 185 |
186 | 190 |
191 | 192 | 205 | 206 | 207 | 208 | 209 |
210 |
211 |
212 |
213 |
214 |
215 | 216 | 230 |
231 |
232 |
233 |
234 | Edit 235 | Report a Bug 236 |
237 |
238 |

The DOMNode class

239 | 240 | 241 |

(PHP 5)

242 | 243 | 244 | 245 |
246 |

Class synopsis

247 | 248 | 249 |
250 |
251 | 252 | 253 |
254 | 255 | DOMNode 256 | 257 | 258 | {
259 | 260 | 261 |
/* Properties */
262 | 263 |
264 | public 265 | readonly 266 | string 267 | $nodeName 268 | ;
269 | 270 |
271 | public 272 | string 273 | $nodeValue 274 | ;
275 | 276 |
277 | public 278 | readonly 279 | int 280 | $nodeType 281 | ;
282 | 283 |
284 | public 285 | readonly 286 | DOMNode 287 | $parentNode 288 | ;
289 | 290 |
291 | public 292 | readonly 293 | DOMNodeList 294 | $childNodes 295 | ;
296 | 297 |
298 | public 299 | readonly 300 | DOMNode 301 | $firstChild 302 | ;
303 | 304 |
305 | public 306 | readonly 307 | DOMNode 308 | $lastChild 309 | ;
310 | 311 |
312 | public 313 | readonly 314 | DOMNode 315 | $previousSibling 316 | ;
317 | 318 |
319 | public 320 | readonly 321 | DOMNode 322 | $nextSibling 323 | ;
324 | 325 |
326 | public 327 | readonly 328 | DOMNamedNodeMap 329 | $attributes 330 | ;
331 | 332 |
333 | public 334 | readonly 335 | DOMDocument 336 | $ownerDocument 337 | ;
338 | 339 |
340 | public 341 | readonly 342 | string 343 | $namespaceURI 344 | ;
345 | 346 |
347 | public 348 | string 349 | $prefix 350 | ;
351 | 352 |
353 | public 354 | readonly 355 | string 356 | $localName 357 | ;
358 | 359 |
360 | public 361 | readonly 362 | string 363 | $baseURI 364 | ;
365 | 366 |
367 | public 368 | readonly 369 | string 370 | $textContent 371 | ;
372 | 373 |
/* Methods */
374 |
375 | public DOMNode appendChild 376 | ( DOMNode $newnode 377 | )
378 |
379 | public string C14N 380 | ([ bool $exclusive 381 | [, bool $with_comments 382 | [, array $xpath 383 | [, array $ns_prefixes 384 | ]]]] )
385 |
386 | public int C14NFile 387 | ( string $uri 388 | [, bool $exclusive 389 | [, bool $with_comments 390 | [, array $xpath 391 | [, array $ns_prefixes 392 | ]]]] )
393 |
394 | public DOMNode cloneNode 395 | ([ bool $deep 396 | ] )
397 |
398 | public int getLineNo 399 | ( void 400 | )
401 |
402 | public string getNodePath 403 | ( void 404 | )
405 |
406 | public bool hasAttributes 407 | ( void 408 | )
409 |
410 | public bool hasChildNodes 411 | ( void 412 | )
413 |
414 | public DOMNode insertBefore 415 | ( DOMNode $newnode 416 | [, DOMNode $refnode 417 | ] )
418 |
419 | public bool isDefaultNamespace 420 | ( string $namespaceURI 421 | )
422 |
423 | public bool isSameNode 424 | ( DOMNode $node 425 | )
426 |
427 | public bool isSupported 428 | ( string $feature 429 | , string $version 430 | )
431 |
432 | public string lookupNamespaceURI 433 | ( string $prefix 434 | )
435 |
436 | public string lookupPrefix 437 | ( string $namespaceURI 438 | )
439 |
440 | public void normalize 441 | ( void 442 | )
443 |
444 | public DOMNode removeChild 445 | ( DOMNode $oldnode 446 | )
447 |
448 | public DOMNode replaceChild 449 | ( DOMNode $newnode 450 | , DOMNode $oldnode 451 | )
452 | 453 | }
454 | 455 | 456 |
457 | 458 | 459 |
460 |

Properties

461 |
462 | 463 | 464 |
nodeName
465 | 466 |
467 | 468 |

Returns the most accurate name for the current node type

469 |
470 | 471 | 472 | 473 |
nodeValue
474 | 475 |
476 | 477 |

The value of this node, depending on its type

478 |
479 | 480 | 481 | 482 |
nodeType
483 | 484 |
485 | 486 |

Gets the type of the node. One of the predefined XML_xxx_NODE constants

487 |
488 | 489 | 490 | 491 |
parentNode
492 | 493 |
494 | 495 |

The parent of this node

496 |
497 | 498 | 499 | 500 |
childNodes
501 | 502 |
503 | 504 |

505 | A DOMNodeList that contains all 506 | children of this node. If there are no children, this is an empty 507 | DOMNodeList. 508 |

509 |
510 | 511 | 512 | 513 |
firstChild
514 | 515 |
516 | 517 |

518 | The first child of this node. If there is no such node, this 519 | returns NULL. 520 |

521 |
522 | 523 | 524 | 525 |
lastChild
526 | 527 |
528 | 529 |

The last child of this node. If there is no such node, this returns NULL.

530 |
531 | 532 | 533 | 534 |
previousSibling
535 | 536 |
537 | 538 |

539 | The node immediately preceding this node. If there is no such 540 | node, this returns NULL. 541 |

542 |
543 | 544 | 545 | 546 |
nextSibling
547 | 548 |
549 | 550 |

551 | The node immediately following this node. If there is no such 552 | node, this returns NULL. 553 |

554 |
555 | 556 | 557 | 558 |
attributes
559 | 560 |
561 | 562 |

563 | A DOMNamedNodeMap containing the 564 | attributes of this node (if it is a DOMElement) 565 | or NULL otherwise. 566 |

567 |
568 | 569 | 570 | 571 |
ownerDocument
572 | 573 |
574 | 575 |

The DOMDocument object associated with this node.

576 |
577 | 578 | 579 | 580 |
namespaceURI
581 | 582 |
583 | 584 |

The namespace URI of this node, or NULL if it is unspecified.

585 |
586 | 587 | 588 | 589 |
prefix
590 | 591 |
592 | 593 |

The namespace prefix of this node, or NULL if it is unspecified.

594 |
595 | 596 | 597 | 598 |
localName
599 | 600 |
601 | 602 |

Returns the local part of the qualified name of this node.

603 |
604 | 605 | 606 | 607 |
baseURI
608 | 609 |
610 | 611 |

612 | The absolute base URI of this node or NULL if the implementation 613 | wasn't able to obtain an absolute URI. 614 |

615 |
616 | 617 | 618 | 619 |
textContent
620 | 621 |
622 | 623 |

This attribute returns the text content of this node and its descendants.

624 |
625 | 626 | 627 |
628 | 629 |
630 | 631 | 632 | 633 |
634 |

Notes

635 |

Note: 636 |

637 | The DOM extension uses UTF-8 encoding. Use utf8_encode() 638 | and utf8_decode() to work with texts in ISO-8859-1 639 | encoding or Iconv for other encodings. 640 |

641 |

642 |
643 | 644 | 645 | 646 |
647 |

See Also

648 |

649 |

652 |

653 |
654 | 655 | 656 |
657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 |

Table of Contents

767 |
768 | 769 |
770 |
771 | add a note add a note 772 |

User Contributed Notes 12 notes

773 |
774 |
775 |
776 | up 777 |
778 |
779 | down 780 |
781 |
782 | 7 783 |
784 |
785 | 786 | David Rekowski
4 years ago
787 |
788 |
789 | You cannot simply overwrite $textContent, to replace the text content of a DOMNode, as the missing readonly flag suggests. Instead you have to do something like this:

<?php

$node
->removeChild($node->firstChild);
$node->appendChild(new DOMText('new text content'));

?>

This example shows what happens:

<?php

$doc
= DOMDocument::loadXML('<node>old content</node>');
$node = $doc->getElementsByTagName('node')->item(0);
echo
"Content 1: ".$node->textContent."\n";

$node->textContent = 'new content';
echo
"Content 2: ".$node->textContent."\n";

$newText = new DOMText('new content');

$node->appendChild($newText);
echo
"Content 3: ".$node->textContent."\n";

$node->removeChild($node->firstChild);
$node->appendChild($newText);
echo
"Content 4: ".$node->textContent."\n";

?>

The output is:

Content 1: old content // starting content
Content 2: old content // trying to replace overwriting $node->textContent
Content 3: old contentnew content // simply appending the new text node
Content 4: new content // removing firstchild before appending the new text node

If you want to have a CDATA section, use this:

<?php
$doc
= DOMDocument::loadXML('<node>old content</node>');
$node = $doc->getElementsByTagName('node')->item(0);
$node->removeChild($node->firstChild);
$newText = $doc->createCDATASection('new cdata content');
$node->appendChild($newText);
echo
"Content withCDATA: ".$doc->saveXML($node)."\n";
?> 790 |
791 |
792 |
793 |
794 |
795 |
796 | up 797 |
798 |
799 | down 800 |
801 |
802 | 3 803 |
804 |
805 | 806 | marc at ermshaus dot org
5 years ago
807 |
808 |
809 | It took me forever to find a mapping for the XML_*_NODE constants. So I thought, it'd be handy to paste it here:

1 XML_ELEMENT_NODE
2 XML_ATTRIBUTE_NODE
3 XML_TEXT_NODE
4 XML_CDATA_SECTION_NODE
5 XML_ENTITY_REFERENCE_NODE
6 XML_ENTITY_NODE
7 XML_PROCESSING_INSTRUCTION_NODE
8 XML_COMMENT_NODE
9 XML_DOCUMENT_NODE
10 XML_DOCUMENT_TYPE_NODE
11 XML_DOCUMENT_FRAGMENT_NODE
12 XML_NOTATION_NODE
810 |
811 |
812 |
813 |
814 |
815 | up 816 |
817 |
818 | down 819 |
820 |
821 | 4 822 |
823 |
824 | 825 | R. Studer
4 years ago
826 |
827 |
828 | For clarification:
The assumingly 'discoverd' by previous posters and seemingly undocumented methods (.getElementsByTagName and .getAttribute) on this class (DOMNode) are in fact methods of the class DOMElement, which inherits from DOMNode.

See: http://www.php.net/manual/en/class.domelement.php
829 |
830 |
831 |
832 |
833 |
834 | up 835 |
836 |
837 | down 838 |
839 |
840 | 1 841 |
842 |
843 | 844 | matt at lamplightdb dot co dot uk
5 years ago
845 |
846 |
847 | And apparently also a setAttribute method too:

$node->setAttribute( 'attrName' , 'value' );
848 |
849 |
850 |
851 |
852 |
853 | up 854 |
855 |
856 | down 857 |
858 |
859 | 2 860 |
861 |
862 | 863 | brian wildwoodassociates.info
5 years ago
864 |
865 |
866 | This class has a getAttribute method.

Assume that a DOMNode object $ref contained an anchor taken out of a DOMNode List.  Then

    $url = $ref->getAttribute('href');

would isolate the url associated with the href part of the anchor.
867 |
868 |
869 |
870 |
871 |
872 | up 873 |
874 |
875 | down 876 |
877 |
878 | 1 879 |
880 |
881 | 882 | imranomar at gmail dot com
3 years ago
883 |
884 |
885 | Just discovered that node->nodeValue strips out all the tags 886 |
887 |
888 |
889 |
890 |
891 | up 892 |
893 |
894 | down 895 |
896 |
897 | 0 898 |
899 |
900 | 901 | metanull
1 month ago
902 |
903 |
904 | Yet another DOMNode to php array conversion function.
Other ones on this page are generating too "complex" arrays; this one should keep the array as tidy as possible.
Note: make sure to set LIBXML_NOBLANKS when calling DOMDocument::load, loadXML or loadHTML
See: http://be2.php.net/manual/en/libxml.constants.php
See: http://be2.php.net/manual/en/domdocument.loadxml.php

<?php
        
/**
         * Returns an array representation of a DOMNode
         * Note, make sure to use the LIBXML_NOBLANKS flag when loading XML into the DOMDocument
         * @param DOMDocument $dom
         * @param DOMNode $node
         * @return array
         */
       
function nodeToArray( $dom, $node) {
            if(!
is_a( $dom, 'DOMDocument' ) || !is_a( $node, 'DOMNode' )) {
                return
false;
            }
           
$array = false;
            if( empty(
trim( $node->localName ))) {// Discard empty nodes
               
return false;
            }
            if(
XML_TEXT_NODE == $node->nodeType ) {
                return
$node->nodeValue;
            }
            foreach (
$node->attributes as $attr) {
               
$array['@'.$attr->localName] = $attr->nodeValue;
            }
            foreach (
$node->childNodes as $childNode) {
                if (
1 == $childNode->childNodes->length && XML_TEXT_NODE == $childNode->firstChild->nodeType ) {
                   
$array[$childNode->localName] = $childNode->nodeValue;
                }  else {
                    if(
false !== ($a = self::nodeToArray( $dom, $childNode))) {
                       
$array[$childNode->localName] =     $a;
                    }
                }
            }
            return
$array;
        }
?> 905 |
906 |
907 |
908 |
909 |
910 |
911 | up 912 |
913 |
914 | down 915 |
916 |
917 | 0 918 |
919 |
920 | 921 | pizarropablo at gmail dot com
4 months ago
922 |
923 |
924 | In response to: alastair dot dallas at gmail dot com about "#text" nodes.
"#text" nodes appear when there are spaces or new lines between end tag and next initial tag.

Eg "<data><age>10</age>[SPACES]<other>20</other>[SPACES]</data>"

"data" childNodes has 4 childs:
- age = 10
- #text = spaces
- other = 20
- #text =  spaces
925 |
926 |
927 |
928 |
929 |
930 | up 931 |
932 |
933 | down 934 |
935 |
936 | 0 937 |
938 |
939 | 940 | matej dot golian at gmail dot com
1 year ago
941 |
942 |
943 | Here is a little function that truncates a DomNode to a specified number of text characters. I use it to generate HTML excerpts for my blog entries.

<?php

function makehtmlexcerpt(DomNode $html, $excerptlength)
{
$remove = 0;
$htmllength = strlen(html_entity_decode($html->textContent, ENT_QUOTES, 'UTF-8'));
$truncate = $htmllength - $excerptlength;
if(
$htmllength > $excerptlength)
{
if(
$html->hasChildNodes())
{
$children = $html->childNodes;
for(
$counter = 0; $counter < $children->length; $counter ++)
{
$child = $children->item($children->length - ($counter + 1));
$childlength = strlen(html_entity_decode($child->textContent, ENT_QUOTES, 'UTF-8'));
if(
$childlength <= $truncate)
{
$remove ++;
$truncate = $truncate - $childlength;
}
else
{
$child = makehtmlexcerpt($child, $childlength - $truncate);
break;
}
}
if(
$remove != 0)
{
for(
$counter = 0; $counter < $remove; $counter ++)
{
$html->removeChild($html->lastChild);
}
}
}
else
{
if(
$html->nodeName == '#text')
{
$html->nodeValue = substr(html_entity_decode($html->nodeValue, ENT_QUOTES, 'UTF-8'), 0, $htmllength - $truncate);
}
}
}
return
$html;
}

?> 944 |
945 |
946 |
947 |
948 |
949 |
950 | up 951 |
952 |
953 | down 954 |
955 |
956 | 0 957 |
958 |
959 | 960 | alastair dot dallas at gmail dot com
2 years ago
961 |
962 |
963 | The issues around mixed content took me some experimentation to remember, so I thought I'd add this note to save others time.

When your markup is something like: <div><p>First text.</p><ul><li><p>First bullet</p></li></ul></div>, you'll get XML_ELEMENT_NODEs that are quite regular. The <div> has children <p> and <ul> and the nodeValue for both <p>s yields the text you expect.

But when your markup is more like <p>This is <b>bold</b> and this is <i>italic</i>.</p>, you realize that the nodeValue for XML_ELEMENT_NODEs is not reliable. In this case, you need to look at the <p>'s child nodes. For this example, the <p> has children: #text, <b>, #text, <i>, #text.

In this example, the nodeValue of <b> and <i> is the same as their #text children. But you could have markup like: <p>This <b>is bold and <i>bold italic</i></b>, you see?</p>. In this case, you need to look at the children of <b>, which will be #text, <i>, because the nodeValue of <b> will not be sufficient.

XML_TEXT_NODEs have no children and are always named '#text'. Depending on how whitespace is handled, your tree may have "empty" #text nodes as children of <body> and elsewhere.

Attributes are nodes, but I had forgotten that they are not in the tree expressed by childNodes. Walking the full tree using childNodes will not visit any attribute nodes.
964 |
965 |
966 |
967 |
968 |
969 | up 970 |
971 |
972 | down 973 |
974 |
975 | 0 976 |
977 |
978 | 979 | Steve K
4 years ago
980 |
981 |
982 | This class apparently also has a getElementsByTagName method.

I was able to confirm this by evaluating the output from DOMNodeList->item() against various tests with the is_a() function.
983 |
984 |
985 |
986 |
987 |
988 | up 989 |
990 |
991 | down 992 |
993 |
994 | -1 995 |
996 |
997 | 998 | I. Cook
4 years ago
999 |
1000 |
1001 | For a reference with more information about the XML DOM node types, see http://www.w3schools.com/dom/dom_nodetype.asp

(When using PHP DOMNode, these constants need to be prefaced with "XML_")
1002 |
1003 |
1004 |
1005 | 1006 | 1007 |
1008 | 1118 | 1119 | 1120 |
1121 | 1122 | 1136 | 1137 | 1138 | 1139 | 1140 | 1141 | 1142 | 1143 | 1144 | 1145 | 1146 | 1147 | To Top 1148 | 1149 | 1150 | 1151 | 1152 | -------------------------------------------------------------------------------- /src/Tests/Resources/test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Everyday Italian 5 | Giada De Laurentiis 6 | 2005 7 | 30.00 8 | 9 | 10 | Harry Potter 11 | J K. Rowling 12 | 2005 13 | 29.99 14 | 15 | 16 | XQuery Kick Start 17 | James McGovern 18 | Per Bothner 19 | Kurt Cagle 20 | James Linn 21 | Vaidyanathan Nagarajan 22 | 2003 23 | 49.99 24 | 25 | 26 | Learning XML 27 | Erik T. Ray 28 | 2003 29 | 39.95 30 | 31 | -------------------------------------------------------------------------------- /src/Tests/SelectorTest.php: -------------------------------------------------------------------------------- 1 | assertInstanceOf('XPathSelector\Selector', $selector); 12 | } 13 | 14 | public function testLoadXML() 15 | { 16 | $xml = file_get_contents(__DIR__.'/Resources/test.xml'); 17 | $selector = Selector::loadXML($xml); 18 | $this->assertInstanceOf('XPathSelector\Selector', $selector); 19 | } 20 | 21 | public function testLoadHTMLFile() 22 | { 23 | $selector = Selector::loadHTMLFile(__DIR__.'/Resources/test.html'); 24 | $this->assertInstanceOf('XPathSelector\Selector', $selector); 25 | } 26 | 27 | public function testLoadHTML() 28 | { 29 | $html = file_get_contents(__DIR__.'/Resources/test.html'); 30 | $selector = Selector::loadHTML($html); 31 | $this->assertInstanceOf('XPathSelector\Selector', $selector); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/Tests/TestCase.php: -------------------------------------------------------------------------------- 1 |