├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .phpdoc-md ├── .scrutinizer.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Resources └── jquerytest.html ├── Tests ├── HelpersTest.php ├── HtmlPageCrawlerTest.php ├── HtmlPageTest.php ├── phpunit_bootstrap.php └── utf8.html ├── UPGRADE.md ├── composer.json ├── doc ├── HtmlPage.md ├── HtmlPageCrawler.md └── README.md ├── phpunit.xml.dist └── src ├── Helpers.php ├── HtmlPage.php └── HtmlPageCrawler.php /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | workflow_dispatch: 11 | 12 | jobs: 13 | php: 14 | runs-on: ubuntu-latest 15 | 16 | strategy: 17 | matrix: 18 | php: [8.0, 8.1, 8.2, 8.3] 19 | dependency-version: [prefer-lowest, prefer-stable] 20 | 21 | steps: 22 | - name: checkout code 23 | uses: actions/checkout@v4 24 | 25 | - name: setup PHP 26 | uses: shivammathur/setup-php@v2 27 | with: 28 | php-version: ${{ matrix.php }} 29 | coverage: xdebug 30 | 31 | - name: install dependencies 32 | run: composer update --${{ matrix.dependency-version }} 33 | 34 | - name: run tests 35 | run: php vendor/bin/phpunit 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | test.php 3 | composer.lock 4 | composer.phar 5 | -------------------------------------------------------------------------------- /.phpdoc-md: -------------------------------------------------------------------------------- 1 | 'Wa72\HtmlPageDom', 4 | 'destDirectory' => 'doc', 5 | 'format' => 'github', 6 | 'classes' => [ 7 | '\Wa72\HtmlPageDom\HtmlPage', 8 | '\Wa72\HtmlPageDom\HtmlPageCrawler' 9 | ] 10 | ]; 11 | -------------------------------------------------------------------------------- /.scrutinizer.yml: 
-------------------------------------------------------------------------------- 1 | before_commands: 2 | - 'composer install --dev --no-interaction --prefer-source' 3 | 4 | tools: 5 | # Code Coverage from Travis 6 | external_code_coverage: 7 | enabled: true 8 | timeout: 300 9 | filter: 10 | excluded_paths: 11 | - 'Tests/*' 12 | - 'vendor/*' 13 | php_code_coverage: 14 | enabled: false 15 | 16 | php_code_sniffer: 17 | enabled: true 18 | config: 19 | standard: PSR2 20 | filter: 21 | excluded_paths: 22 | - 'vendor/*' 23 | 24 | # PHP Mess Detector (http://phpmd.org). 25 | php_mess_detector: 26 | enabled: true 27 | command: phpmd 28 | config: 29 | rulesets: 30 | - codesize 31 | - unusedcode 32 | - design 33 | filter: 34 | excluded_paths: 35 | - 'vendor/*' 36 | 37 | php_pdepend: 38 | enabled: true 39 | excluded_dirs: [vendor, Tests] 40 | 41 | php_loc: 42 | enabled: true 43 | excluded_dirs: [vendor, Tests] 44 | 45 | php_cpd: 46 | enabled: true 47 | excluded_dirs: [vendor, Tests] 48 | 49 | php_analyzer: 50 | enabled: true 51 | filter: 52 | excluded_paths: 53 | - 'Tests/*' 54 | - 'vendor/*' 55 | 56 | 57 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 3.0.0 2 | ===== 3 | 4 | 2022-04-13 5 | 6 | Changed some method signatures (added argument type hints and return types) in HtmlPageCrawler for compatibility with the base Crawler class from Symfony 6. So, this release is only compatible with Symfony 6 and up. 7 | 8 | Otherwise there are no changes, so it does not require changes in code using this lib. 9 | 10 | 2.0.0 11 | ===== 12 | 13 | 2019-10-15 14 | 15 | __BC BREAK__ for compatibility with Symfony 4.3 and up 16 | 17 | - `HtmlPageCrawler::html()` is now just the parent `Crawler::html()` and acts as *getter* only. 
18 | Setting HTML content via `HtmlPageCrawler::html($html)` is *not possible* any more, 19 | use `HtmlPageCrawler::setInnerHtml($html)` instead 20 | 21 | - `HtmlPageCrawler::text()` is now just the parent `Crawler::text()` and acts as *getter* only 22 | that returns the text content from the *first* node only. For setting text content, use `HtmlPageCrawler::setText($text)` instead. 23 | 24 | - `HtmlPageCrawler::attr()` is now just the parent `Crawler::attr()` and acts as *getter* only. 25 | For setting attributes use `HtmlPageCrawler::setAttribute($name, $value)` instead 26 | 27 | - new method `HtmlPageCrawler::getCombinedText()` that returns the combined text from all nodes (as jQuery's `text()` function does and previous versions of `HtmlPageCrawler::text()` did) 28 | 29 | - removed method `HtmlPageCrawler::isDisconnected()` 30 | 31 | 32 | 1.4.2 33 | ===== 34 | 35 | 2019-10-15 36 | 37 | - undo deprecation of getInnerHtml() 38 | - deprecate setter use of attr() 39 | - deprecate isDisconnected() 40 | 41 | 42 | 1.4.1 43 | ===== 44 | 45 | 2019-06-28 46 | 47 | - Bugfix: setText() should convert special chars. Closes #34. 48 | 49 | 50 | 1.4.0 51 | ===== 52 | 53 | 2019-05-17 54 | 55 | Preparation for a smooth migration to 2.x / Symfony 4.3: 56 | - deprecate setter use of html() and text(), 57 | - deprecate getInnerHtml(), 58 | - new methods setText() and getCombinedText() 59 | 60 | 61 | 1.3.2 62 | ===== 63 | 64 | 2019-04-18 65 | 66 | - Mark this version as incompatible to Symfony DomCrawler 4.3 67 | 68 | 69 | 1.3 70 | === 71 | 72 | 2016-10-06 73 | 74 | - new method `unwrapInner` (thanks to [@ttk](https://github.com/ttk)) 75 | 76 | - it's now possible to get the number of nodes in the crawler using the 77 | `$crawler->length` property like in Javascript instead of `count($crawler)` 78 | 79 | 80 | 1.2 81 | === 82 | 83 | 2015-11-06 84 | 85 | - new methods `HtmlPage::minify()` and `HtmlPage::indent()` for compressing or nicely indenting the HTML document. 
These 86 | functions rely on the package `wa72/html-pretty-min` that is *suggested* in composer.json. 87 | 88 | 1.1 89 | === 90 | 91 | 2015-05-20 92 | 93 | - `text()` function now returns combined text of all elements in set (as jQuery does; previously only the nodeValue of 94 | the first element was returned) and can act as a setter `text($string)` that sets the nodeValue of all elements to 95 | the specified string 96 | 97 | - function `hasClass` now returns true if any of the elements in the Crawler has the specified class (previously, 98 | only the first element was checked). 99 | 100 | - new function `makeClone` as equivalent to jQuery's `clone` function ("clone" is not a valid function name in PHP). 101 | As previously, you can alternatively use PHP's clone operator: `$r = $c->makeClone()` is the same as `$r = clone $c`, 102 | but the new function allows chaining. 103 | 104 | - new function `removeAttr` aliasing `removeAttribute` for compatibility with jQuery 105 | 106 | - `appendTo`, `insertBefore`, `insertAfter`, and `replaceAll` now always return a new Crawler object containing 107 | the aggregate set of all elements appended to the target elements (this is the behavior of jQuery 1.9 and newer). 
108 | 109 | - `attr` function can now act as setter `attr($name, $value)` which is an alias for `setAttribute($name, $value)` 110 | (previously it accepted only one argument and was a getter equivalent to `getAttribute($name)` only, like it is 111 | in parent DomCrawler) 112 | 113 | - `attr($name)` and `getAttribute($name)` now always return `null` if the attribute does not exist (previously, an empty 114 | string was returned when used with Symfony 2.3) 115 | 116 | 1.0 117 | === 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2022 Christoph Singer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | The software is provided "as is", without warranty of any kind, express or 14 | implied, including but not limited to the warranties of merchantability, 15 | fitness for a particular purpose and noninfringement. In no event shall the 16 | authors or copyright holders be liable for any claim, damages or other 17 | liability, whether in an action of contract, tort or otherwise, arising from, 18 | out of or in connection with the software or the use or other dealings in 19 | the software. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HtmlPageDom 2 | =========== 3 | 4 | ![tests](https://github.com/wasinger/htmlpagedom/actions/workflows/tests.yml/badge.svg?branch=master) 5 | [![Latest Version](http://img.shields.io/packagist/v/wa72/htmlpagedom.svg)](https://packagist.org/packages/wa72/htmlpagedom) 6 | [![Downloads from Packagist](http://img.shields.io/packagist/dt/wa72/htmlpagedom.svg)](https://packagist.org/packages/wa72/htmlpagedom) 7 | 8 | `Wa72\HtmlPageDom` is a PHP library for easy manipulation of HTML documents using DOM. 9 | It requires [DomCrawler from Symfony components](https://github.com/symfony/DomCrawler) for traversing 10 | the DOM tree and extends it by adding methods for manipulating the DOM tree of HTML documents. 11 | 12 | It's useful when you need to not just extract information from an HTML file (what DomCrawler does) but 13 | also to modify HTML pages. It is usable as a template engine: load your HTML template file, set new 14 | HTML content on certain elements such as the page title, `div#content` or `ul#menu` and print out 15 | the modified page. 16 | 17 | `Wa72\HtmlPageDom` consists of two main classes: 18 | 19 | - `HtmlPageCrawler` extends `Symfony\Components\DomCrawler` by adding jQuery inspired, HTML specific 20 | DOM *manipulation* functions such as `setInnerHtml($htmltext)`, `before()`, `append()`, `wrap()`, `addClass()` or `css()`. 21 | It's like jQuery for PHP: simply select elements of an HTML page using CSS selectors and change their 22 | attributes and content. 23 | 24 | [API doc for HtmlPageCrawler](doc/HtmlPageCrawler.md) 25 | 26 | - `HtmlPage` represents one complete HTML document and offers convenience functions like `getTitle()`, `setTitle($title)`, 27 | `setMeta('description', $description)`, `getBody()`. 
Internally, it uses the `HtmlPageCrawler` class for 28 | filtering and manipulating DOM Elements. Since version 1.2, it offers methods for compressing (`minify()`) and 29 | prettyprinting (`indent()`) the HTML page. 30 | 31 | [API doc for HtmlPage](doc/HtmlPage.md) 32 | 33 | 34 | Requirements and Compatibility 35 | ------------------------------ 36 | 37 | Version 3.x: 38 | - PHP 8.x 39 | - [Symfony\Components\DomCrawler](https://github.com/symfony/DomCrawler) 6.x | 7.x 40 | - [Symfony\Components\CssSelector](https://github.com/symfony/CssSelector) 6.x | 7.x 41 | 42 | Version 2.x: 43 | - PHP ^7.4 | 8.x 44 | - [Symfony\Components\DomCrawler](https://github.com/symfony/DomCrawler) ^4.4 | 5.x 45 | - [Symfony\Components\CssSelector](https://github.com/symfony/CssSelector) ^4.4 | 5.x 46 | 47 | There is no difference in our API between versions 2.x and 3.0.x. 48 | The only difference is the compatibility with different versions of Symfony. 49 | 50 | Installation 51 | ------------ 52 | 53 | - using [composer](http://getcomposer.org): `composer require wa72/htmlpagedom` 54 | 55 | - using other [PSR-4](http://www.php-fig.org/psr/psr-4/) compliant autoloader: 56 | clone this project to where your included libraries are and point your autoloader to look for the 57 | "\Wa72\HtmlPageDom" namespace in the "src" directory of this project 58 | 59 | Usage 60 | ----- 61 | 62 | `HtmlPageCrawler` is a wrapper around DOMNodes. `HtmlPageCrawler` objects can be created using `new` or the static function 63 | `HtmlPageCrawler::create()`, which accepts an HTML string or a DOMNode (or an array of DOMNodes, a DOMNodeList, or even 64 | another `Crawler` object) as arguments. 
65 | 66 | Afterwards you can select nodes from the added DOM tree by calling `filter()` (equivalent to find() in jQuery) and alter 67 | the selected elements using the following jQuery-like manipulation functions: 68 | 69 | - `addClass()`, `hasClass()`, `removeClass()`, `toggleClass()` 70 | - `after()`, `before()` 71 | - `append()`, `appendTo()` 72 | - `makeClone()` (equivalent to `clone()` in jQuery) 73 | - `css()` (alias `getStyle()` / `setStyle()`) 74 | - `html()` (get inner HTML content) and `setInnerHtml($html)` 75 | - `attr()` (alias `getAttribute()` / `setAttribute()`), `removeAttr()` 76 | - `insertAfter()`, `insertBefore()` 77 | - `makeEmpty()` (equivalent to `empty()` in jQuery) 78 | - `prepend()`, `prependTo()` 79 | - `remove()` 80 | - `replaceAll()`, `replaceWith()` 81 | - `text()`, `getCombinedText()` (get text content of all nodes in the Crawler), and `setText($text)` 82 | - `wrap()`, `unwrap()`, `wrapInner()`, `unwrapInner()`, `wrapAll()` 83 | 84 | To get the modified DOM as HTML code use `html()` (returns innerHTML of the first node in your crawler object) 85 | or `saveHTML()` (returns combined "outer" HTML code of all elements in the list). 86 | 87 | See the full methods documentation in the generated [API doc for HtmlPageCrawler](doc/HtmlPageCrawler.md) 88 | 89 | **Example:** 90 | 91 | ```php 92 | use \Wa72\HtmlPageDom\HtmlPageCrawler; 93 | 94 | // create an object from a fragment of HTML code as you would do with jQuery's $() function 95 | $c = HtmlPageCrawler::create('

Title

'); 96 | 97 | // the above is the same as calling: 98 | $c = new HtmlPageCrawler('

Title

'); 99 | 100 | // filter for h1 elements and wrap them with an HTML structure 101 | $c->filter('h1')->wrap('
'); 102 | 103 | // return the modified HTML 104 | echo $c->saveHTML(); 105 | // or simply: 106 | echo $c; // implicit __toString() calls saveHTML() 107 | // will output:

Title

108 | ``` 109 | 110 | **Advanced example: remove the third column from an HTML table** 111 | 112 | ```php 113 | use \Wa72\HtmlPageDom\HtmlPageCrawler; 114 | $html = << 116 | 117 | abc 118 | adsf 119 | to be removed 120 | 121 | 122 | abc 123 | adsf 124 | to be removed 125 | 126 | 127 | abc 128 | adsf 129 | to be removed 130 | 131 | 132 | END; 133 | 134 | $c = HtmlPageCrawler::create($html); 135 | $tr = $c->filter('table > tr > td') 136 | ->reduce( 137 | function ($c, $j) { 138 | if (($j+1) % 3 == 0) { 139 | return true; 140 | } 141 | return false; 142 | } 143 | ); 144 | $tr->remove(); 145 | echo $c->saveHTML(); 146 | ``` 147 | 148 | **Usage examples for the `HtmlPage` class:** 149 | 150 | ```php 151 | use \Wa72\HtmlPageDom\HtmlPage; 152 | 153 | // create a new HtmlPage object with an empty HTML skeleton 154 | $page = new HtmlPage(); 155 | 156 | // or create a HtmlPage object from an existing page 157 | $page = new HtmlPage(file_get_contents('http://www.heise.de')); 158 | 159 | // get or set page title 160 | echo $page->getTitle(); 161 | $page->setTitle('New page title'); 162 | echo $page->getTitle(); 163 | 164 | 165 | // add HTML content 166 | $page->filter('body')->setInnerHtml('

This is the headline

This is a paragraph

'); 167 | 168 | // select elements by css selector 169 | $h1 = $page->filter('#content h1'); 170 | $p = $page->filter('p.text'); 171 | 172 | // change attributes and content of an element 173 | $h1->addClass('headline')->css('margin-top', '10px')->setInnerHtml('This is the new headline'); 174 | 175 | $p->removeClass('text')->append('
There is more than one line in this paragraph'); 176 | 177 | // add a new paragraph to div#content 178 | $page->filter('#content')->append('

This is a new paragraph.

'); 179 | 180 | // add a class and some attribute to all paragraphs 181 | $page->filter('p')->addClass('newclass')->setAttribute('data-foo', 'bar'); 182 | 183 | 184 | // get HTML content of an element 185 | echo $page->filter('#content')->saveHTML(); 186 | 187 | // output the whole HTML page 188 | echo $page->save(); 189 | // or simply: 190 | echo $page; 191 | 192 | // output formatted HTML code 193 | echo $page->indent()->save(); 194 | 195 | // output compressed (minified) HTML code 196 | echo $page->minify()->save(); 197 | ``` 198 | 199 | See also the generated [API doc for HtmlPage](doc/HtmlPage.md) 200 | 201 | Limitations 202 | ----------- 203 | 204 | - HtmlPageDom builds on top of PHP's DOM functions and uses the loadHTML() and saveHTML() methods of the DOMDocument class. 205 | That's why its output is always HTML, not XHTML. 206 | 207 | - The HTML parser used by PHP is built for HTML4. It throws errors 208 | on HTML5 specific elements which are ignored by HtmlPageDom, so HtmlPageDom is usable for HTML5 with some limitations. 209 | 210 | - HtmlPageDom has not been tested with character encodings other than UTF-8. 211 | 212 | 213 | History 214 | ------- 215 | 216 | When I discovered how easy it was to modify HTML documents using jQuery I looked for a PHP library providing similar 217 | possibilities for PHP. 218 | 219 | Googling around I found [SimpleHtmlDom](http://simplehtmldom.sourceforge.net) 220 | and later [Ganon](http://code.google.com/p/ganon) but both turned out to be very slow. Nevertheless I used both 221 | libraries in my projects. 222 | 223 | When Symfony2 appeared with its DomCrawler and CssSelector components I thought: 224 | the functions for traversing the DOM tree and selecting elements by CSS selectors are already there, only the 225 | manipulation functions are missing. Let's implement them! So the HtmlPageDom project was born. 
226 | 227 | It turned out that it was a good choice to build on PHP's DOM functions: Compared to SimpleHtmlDom and Ganon, HtmlPageDom 228 | is lightning fast. In one of my projects, I have a PHP script that takes a huge HTML page containing several hundreds 229 | of article elements and extracts them into individual HTML files (that are later on demand loaded by AJAX back into the 230 | original HTML page). Using SimpleHtmlDom it took the script 3 minutes (right, minutes!) to run (and I needed to raise 231 | PHP's memory limit to over 500MB). Using Ganon as HTML parsing and manipulation engine it took even longer, 232 | about 5 minutes. After switching to HtmlPageDom the same script doing the same processing tasks is running only about 233 | one second (all on the same server). HtmlPageDom is really fast. 234 | 235 | 236 | © 2012-2023 Christoph Singer. Licensed under the MIT License. 237 | 238 | -------------------------------------------------------------------------------- /Resources/jquerytest.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Testing jquery object identities 7 | 8 | 9 |

Testing jquery object identities

10 |

This page contains javascript code to figure out in which cases jQuery returns references to existing objects 11 | and when it makes copies.

12 |

test paragraph 2555

13 |

test paragraph 3

14 | 15 | 97 | 98 | -------------------------------------------------------------------------------- /Tests/HelpersTest.php: -------------------------------------------------------------------------------- 1 | assertEquals([ 13 | 'font-size' => '15px', 14 | 'font-weight' => 'bold', 15 | 'font-color' => 'black' 16 | ], Helpers::cssStringToArray('invalid_css_string;font-size: 15px;font-weight: bold;font-color: black;')); 17 | } 18 | 19 | public function testCssArrayToString() 20 | { 21 | $this->assertEquals('font-size: 15px;font-weight: bold;font-color: black;', Helpers::cssArrayToString([ 22 | 'font-size' => '15px', 23 | 'font-weight' => 'bold', 24 | 'font-color' => 'black' 25 | ])); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Tests/HtmlPageCrawlerTest.php: -------------------------------------------------------------------------------- 1 | addHtmlContent('

Title

'); 19 | $title = $c->filter('#content > h1'); 20 | 21 | $this->assertInstanceOf('\Wa72\HtmlPageDom\HtmlPageCrawler', $title); 22 | $this->assertInstanceOf('\DOMNode', $title->getNode(0)); 23 | $this->assertEquals('h1', $title->nodeName()); 24 | } 25 | 26 | /** 27 | * 28 | * 29 | * @param $string 30 | * @return string 31 | */ 32 | private function _ignoreNewlines($string) 33 | { 34 | return str_replace("\n", '', $string); 35 | } 36 | 37 | /** 38 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setInnerHtml 39 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prepend 40 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::makeEmpty 41 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setAttribute 42 | */ 43 | public function testManipulationFunctions() 44 | { 45 | $c = new HtmlPageCrawler; 46 | $c->addHtmlContent('

Title

'); 47 | 48 | $content = $c->filter('#content'); 49 | $content->append('

Das ist ein Testabsatz'); 50 | $this->assertEquals("

Title

Das ist ein Testabsatz

", $this->_ignoreNewlines($content->html())); 51 | 52 | $content->setInnerHtml('

Ein neuer Inhalt

'); 53 | $this->assertEquals('

Ein neuer Inhalt

', $content->html()); 54 | 55 | $content->prepend('

Neue Überschrift'); 56 | $this->assertEquals('

Neue Überschrift

Ein neuer Inhalt

', $content->html()); 57 | 58 | $h1 = $content->filter('h1'); 59 | $this->assertEquals('Neue Überschrift', $h1->text()); 60 | 61 | $b = $content->filter('b'); 62 | $this->assertEquals('Inhalt', $b->text()); 63 | 64 | $b2 = $c->filter('#content p b'); 65 | $this->assertEquals('Inhalt', $b2->text()); 66 | 67 | $content->append('

Zweiter Absatz

'); 68 | $content->append('

Dritter Absatz und noch mehr Text

'); 69 | 70 | $a3 = $content->filter('p.a3'); 71 | $this->assertEquals('Dritter Absatz und noch mehr Text', $a3->html()); 72 | 73 | $a3b = $a3->filter('b'); 74 | $this->assertEquals('Dritter Absatz', $a3b->text()); 75 | 76 | $body = $c->filter('body'); 77 | $this->assertEquals('

Neue Überschrift

Ein neuer Inhalt

Zweiter Absatz

Dritter Absatz und noch mehr Text

', $this->_ignoreNewlines($body->html())); 78 | 79 | $paragraphs = $c->filter('p'); 80 | $this->assertEquals(3, count($paragraphs)); 81 | 82 | $paragraphs->append('.'); 83 | $this->assertEquals('

Ein neuer Inhalt.

Zweiter Absatz.

Dritter Absatz und noch mehr Text.

', $c->filter('p')->saveHTML()); 84 | 85 | $body->makeEmpty(); 86 | $this->assertEmpty($body->html()); 87 | 88 | $body->setAttribute('class', 'mybodyclass'); 89 | $this->assertEquals('mybodyclass', $body->attr('class')); 90 | } 91 | 92 | /** 93 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::append 94 | */ 95 | public function testAppend() 96 | { 97 | // Testing append string to several elements 98 | $c = new HtmlPageCrawler('

Paragraph 1

Paragraph 2

Paragraph 3

'); 99 | $c->filter('p')->append('
Appended Text'); 100 | $this->assertEquals('

Paragraph 1
Appended Text

Paragraph 2
Appended Text

Paragraph 3
Appended Text

', $c->saveHTML()); 101 | 102 | // Testing append HtmlPageCrawler to several elements 103 | $c = new HtmlPageCrawler('

Paragraph 1

Paragraph 2

Paragraph 3

'); 104 | $c->filter('p')->append(new HtmlPageCrawler('
Appended Text')); 105 | $this->assertEquals('

Paragraph 1
Appended Text

Paragraph 2
Appended Text

Paragraph 3
Appended Text

', $c->saveHTML()); 106 | 107 | // Testing append DOMNode to several elements 108 | $c = new HtmlPageCrawler('

Paragraph 1

Paragraph 2

Paragraph 3

'); 109 | $app = $c->getDOMDocument()->createElement('span', 'Appended Text'); 110 | $c->filter('p')->append($app); 111 | $this->assertEquals('

Paragraph 1Appended Text

Paragraph 2Appended Text

Paragraph 3Appended Text

', $c->saveHTML()); 112 | 113 | $c = new HtmlPageCrawler('
Append Self
'); 114 | $c->filter('#content')->append($c->filter('span')); 115 | $this->assertEquals('
Append Self
', $c->saveHTML()); 116 | } 117 | 118 | /** 119 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::appendTo 120 | */ 121 | public function testAppendTo() 122 | { 123 | $c = new HtmlPageCrawler('

Title

Big
'); 124 | $c->filter('em')->appendTo($c->filter('h1')); 125 | $this->assertEquals('

TitleBig

', $c->saveHTML()); 126 | 127 | $c = new HtmlPageCrawler('

Self Title

'); 128 | $c->filter('h1')->appendTo($c->filter('h1')); 129 | $this->assertEquals('

Self Title

', $c->saveHTML()); 130 | } 131 | 132 | /** 133 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::isHtmlDocument 134 | */ 135 | public function testIsHtmlDocument() 136 | { 137 | $dom = new \DOMDocument('1.0', 'UTF-8'); 138 | $dom->loadHTML('

Title

'); 139 | $c = new HtmlPageCrawler($dom); 140 | 141 | $this->assertTrue($c->isHtmlDocument()); 142 | 143 | $t = $c->filter('body'); 144 | $this->assertFalse($t->isHtmlDocument()); 145 | 146 | $c = new HtmlPageCrawler('

Title

'); 147 | $this->assertFalse($c->isHtmlDocument()); 148 | 149 | $c = new HtmlPageCrawler('

Title

'); 150 | $this->assertTrue($c->isHtmlDocument()); 151 | } 152 | 153 | /** 154 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::saveHTML 155 | */ 156 | public function testSaveHTML() 157 | { 158 | $html = "

Title

Paragraph 1

Paragraph 2

"; 159 | $dom = new \DOMDocument('1.0', 'UTF-8'); 160 | $dom->loadHTML($html); 161 | $c = new HtmlPageCrawler($dom); 162 | $this->assertEquals($html, $this->_ignoreNewlines($c->saveHTML())); 163 | $ps = $c->filter('p'); 164 | $this->assertEquals('

Paragraph 1

Paragraph 2

', $ps->saveHTML()); 165 | $t = $c->filter('h1'); 166 | $this->assertEquals('

Title

', $t->saveHTML()); 167 | 168 | $c = new HtmlPageCrawler('

Title

'); 169 | $this->assertEquals('

Title

', $c->saveHTML()); 170 | } 171 | 172 | /** 173 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::css 174 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::getStyle 175 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setStyle 176 | */ 177 | public function testCss() 178 | { 179 | $dom = new \DOMDocument('1.0', 'UTF-8'); 180 | $dom->loadHTML('

Title

'); 182 | $c = new HtmlPageCrawler($dom); 183 | $t = $c->filter('h1'); 184 | $this->assertEquals('10px', $t->css('margin-top')); 185 | $this->assertEquals('1px solid red', $t->css('border-bottom')); 186 | $t->css('margin-bottom', '20px'); 187 | $this->assertEquals('20px', $t->css('margin-bottom')); 188 | $this->assertEquals('10px', $t->getStyle('margin-top')); 189 | $this->assertEquals('

Title

', $t->saveHTML()); 190 | $t->setStyle('border-bottom', ''); 191 | $this->assertEquals('

Title

', $t->saveHTML()); 192 | $t->setStyle('padding-top', '0'); 193 | $this->assertEquals('

Title

', $t->saveHTML()); 194 | $this->assertEquals('0', $t->getStyle('padding-top')); 195 | $this->assertNull($t->getStyle('border-bottom')); 196 | } 197 | 198 | /** 199 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::addClass 200 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::removeClass 201 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::hasClass 202 | */ 203 | public function testClasses() 204 | { 205 | $dom = new \DOMDocument('1.0', 'UTF-8'); 206 | $dom->loadHTML('

Title

'); 207 | $c = new HtmlPageCrawler($dom); 208 | $t = $c->filter('h1'); 209 | $t->addClass('ueberschrift'); 210 | $t->addClass('nochneklasse'); 211 | $t->addClass('style_class'); 212 | $this->assertEquals('

Title

', $t->saveHTML()); 213 | $this->assertTrue($t->hasClass('ueberschrift')); 214 | $this->assertTrue($t->hasClass('nochneklasse')); 215 | $this->assertTrue($t->hasClass('style_class')); 216 | $t->removeClass('nochneklasse'); 217 | $this->assertTrue($t->hasClass('ueberschrift')); 218 | $this->assertFalse($t->hasClass('nochneklasse')); 219 | $t->addClass('class1 class2'); 220 | $this->assertTrue($t->hasClass('class1')); 221 | $this->assertTrue($t->hasClass('class2')); 222 | 223 | $c1 = new HtmlPageCrawler('

'); 224 | $this->assertTrue($c1->hasClass('b')); 225 | } 226 | 227 | /** 228 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::addContent 229 | */ 230 | public function testAddContent() 231 | { 232 | $c = new HtmlPageCrawler(); 233 | $c->addContent('

Title

'); 234 | $this->assertEquals( 235 | '' 236 | . "" . '

Title

' . "", 237 | $this->_ignoreNewlines($c->saveHTML()) 238 | ); 239 | 240 | $c = new HtmlPageCrawler(); 241 | $c->addContent('

Title'); 242 | $this->assertEquals('

Title

', $c->saveHTML()); 243 | 244 | $c = new HtmlPageCrawler(); 245 | $c->addContent('

asdf

asdfaf

'); 246 | $this->assertEquals(2, count($c)); 247 | $this->assertEquals('

asdf

asdfaf

', $c->saveHTML()); 248 | } 249 | 250 | /** 251 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::before 252 | */ 253 | public function testBefore() 254 | { 255 | $c = new HtmlPageCrawler('

Title

'); 256 | $c->filter('h1')->before('

Text before h1

'); 257 | $this->assertEquals('

Text before h1

Title

', $c->saveHTML()); 258 | 259 | $c = new HtmlPageCrawler('

Title

'); 260 | $c->filter('h1')->before(new HtmlPageCrawler('

Text before h1

and more text before

')); 261 | $this->assertEquals('

Text before h1

and more text before

Title

', $c->saveHTML()); 262 | 263 | $c = new HtmlPageCrawler('

Self Before

'); 264 | $c->filter('h1')->before($c->filter('h1')); 265 | $this->assertEquals('

Self Before

', $c->saveHTML()); 266 | } 267 | 268 | /** 269 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::insertBefore 270 | */ 271 | public function testInsertBefore() 272 | { 273 | $c = new HtmlPageCrawler('

Title

Text before h1

'); 274 | $c->filter('p')->insertBefore($c->filter('h1')); 275 | $this->assertEquals('

Text before h1

Title

', $c->saveHTML()); 276 | 277 | $c = new HtmlPageCrawler('

Self Insert Before Title

Text after h1

'); 278 | $c->filter('h1')->insertBefore($c->filter('h1')); 279 | $this->assertEquals('

Self Insert Before Title

Text after h1

', $c->saveHTML()); 280 | } 281 | 282 | /** 283 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::after 284 | */ 285 | public function testAfter() 286 | { 287 | $c = new HtmlPageCrawler('

Title

'); 288 | $c->filter('h1')->after('

Text after h1

'); 289 | $this->assertEquals('

Title

Text after h1

', $c->saveHTML()); 290 | 291 | $c = new HtmlPageCrawler('

Title

Title2

'); 292 | $c->filter('h1')->after(new HtmlPageCrawler('

Text after h1

and more text after

')); 293 | $this->assertEquals('

Title

Text after h1

and more text after

Title2

Text after h1

and more text after

', $c->saveHTML()); 294 | 295 | $c = new HtmlPageCrawler('

Self After

'); 296 | $c->filter('h1')->after($c->filter('h1')); 297 | $this->assertEquals('

Self After

', $c->saveHTML()); 298 | } 299 | 300 | /** 301 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::insertAfter 302 | */ 303 | public function testInsertAfter() 304 | { 305 | $c = new HtmlPageCrawler('

Text after h1

Title

'); 306 | $c->filter('p')->insertAfter($c->filter('h1')); 307 | $this->assertEquals('

Title

Text after h1

', $c->saveHTML()); 308 | 309 | $c = new HtmlPageCrawler('

Text before h1

Self Insert After Title

'); 310 | $c->filter('h1')->insertAfter($c->filter('h1')); 311 | $this->assertEquals('

Text before h1

Self Insert After Title

', $c->saveHTML()); 312 | } 313 | 314 | /** 315 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prepend 316 | */ 317 | public function testPrepend() 318 | { 319 | $c = new HtmlPageCrawler('

Title

'); 320 | $c->filter('#content')->prepend('

Text before h1

'); 321 | $this->assertEquals('

Text before h1

Title

', $c->saveHTML()); 322 | 323 | $c = new HtmlPageCrawler('
'); 324 | $c->filter('#content')->prepend(new HtmlPageCrawler('

Text before h1

and more text before

')); 325 | $this->assertEquals('

Text before h1

and more text before

', $c->saveHTML()); 326 | 327 | $c = new HtmlPageCrawler('
Prepend Self
'); 328 | $c->filter('#content')->prepend($c->filter('span')); 329 | $this->assertEquals('
Prepend Self
', $c->saveHTML()); 330 | } 331 | 332 | /** 333 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prependTo 334 | */ 335 | public function testPrependTo() 336 | { 337 | $c = new HtmlPageCrawler('

Text before

'); 338 | $c->filter('p')->prependTo('Text'); 339 | $this->assertEquals('

Text before

', $c->saveHTML()); 340 | 341 | $c = new HtmlPageCrawler('

Title

'); 342 | $c->filter('#content')->prependTo(new HtmlPageCrawler('

paragraph

')); 343 | $this->assertEquals('

Title

', $c->saveHTML()); 344 | 345 | $c = new HtmlPageCrawler('

Title

Big
'); 346 | $c->filter('em')->prependTo($c->filter('h1')); 347 | $this->assertEquals('

BigTitle

', $c->saveHTML()); 348 | 349 | $c = new HtmlPageCrawler('

Self Title

'); 350 | $c->filter('h1')->prependTo($c->filter('h1')); 351 | $this->assertEquals('

Self Title

', $c->saveHTML()); 352 | } 353 | 354 | /** 355 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrap 356 | */ 357 | public function testWrap() 358 | { 359 | $c = new HtmlPageCrawler('

Title

'); 360 | $c->filter('h1')->wrap('
'); 361 | $this->assertEquals('

Title

', $c->saveHTML()); 362 | 363 | $c = new HtmlPageCrawler('

Title

'); 364 | $c->filter('h1')->wrap('
asdf
'); 365 | $this->assertEquals('
asdf

Title

', $c->saveHTML()); 366 | 367 | $c = new HtmlPageCrawler('

Title

'); 368 | $c->filter('h1')->wrap('
asdf
jkl
'); // wrap has more than 1 root element 369 | $this->assertEquals('
asdf

Title

', $c->saveHTML()); // only first element is used 370 | 371 | // Test for wrapping multiple nodes 372 | $c = new HtmlPageCrawler('

p1

p2

'); 373 | $c->filter('p')->wrap('
'); 374 | $this->assertEquals('

p1

p2

', $c->saveHTML()); 375 | 376 | $c = new HtmlPageCrawler('plain text node'); 377 | $c->wrap('
'); 378 | $this->assertEquals('
plain text node
', $c->ancestors()->eq(0)->saveHTML()); 379 | 380 | $c = HtmlPageCrawler::create('
'); 381 | $m = HtmlPageCrawler::create('message 1')->appendTo($c); 382 | $m->wrap('

'); 383 | $m = HtmlPageCrawler::create('message 2')->appendTo($c); 384 | $m->wrap('

'); 385 | $this->assertEquals('

message 1

message 2

', $c->saveHTML()); 386 | } 387 | 388 | /** 389 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::replaceWith 390 | */ 391 | public function testReplaceWith() 392 | { 393 | $c = HtmlPageCrawler::create('

Absatz 1

Absatz 2

Absatz 3

'); 394 | $oldparagraphs = $c->filter('p')->replaceWith('
newtext 1
newtext 2
'); 395 | $this->assertEquals('
newtext 1
newtext 2
newtext 1
newtext 2
newtext 1
newtext 2
', $c->saveHTML()); 396 | $this->assertEquals('

Absatz 1

Absatz 2

Absatz 3

', $oldparagraphs->saveHTML()); 397 | } 398 | 399 | /** 400 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::replaceAll 401 | */ 402 | public function testReplaceAll() 403 | { 404 | $c = HtmlPageCrawler::create('

Absatz 1

Absatz 2

Absatz 3

'); 405 | $new = HtmlPageCrawler::create('
newtext 1
newtext 2
'); 406 | $new->replaceAll($c->filter('p')); 407 | $this->assertEquals('
newtext 1
newtext 2
newtext 1
newtext 2
newtext 1
newtext 2
', $c->saveHTML()); 408 | } 409 | 410 | /** 411 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrapAll 412 | */ 413 | public function testWrapAll() 414 | { 415 | $c = HtmlPageCrawler::create('
Before

Absatz 1

Inner

Absatz 2

Absatz 3

After
'); 416 | $c->filter('p')->wrapAll('
'); 417 | $this->assertEquals('
Before

Absatz 1

Absatz 2

Absatz 3

Inner
After
', $c->saveHTML()); 418 | 419 | // Test for wrapping with elements that have children 420 | $c = HtmlPageCrawler::create('

Absatz 1

Absatz 2

Absatz 3

'); 421 | $c->filter('p')->wrapAll('
'); 422 | $this->assertEquals('

Absatz 1

Absatz 2

Absatz 3

', $c->saveHTML()); 423 | } 424 | 425 | /** 426 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrapInner 427 | */ 428 | public function testWrapInner() 429 | { 430 | $c = HtmlPageCrawler::create('

Absatz 1

Absatz 2

Absatz 3

'); 431 | $c->wrapInner('
'); 432 | $this->assertEquals('

Absatz 1

Absatz 2

Absatz 3

', $c->saveHTML()); 433 | } 434 | 435 | /** 436 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::unwrap 437 | */ 438 | public function testUnwrap() 439 | { 440 | $c = HtmlPageCrawler::create('
Before

Absatz 1

After
'); 441 | $p = $c->filter('p'); 442 | $p->unwrap(); 443 | $this->assertEquals('
Before

Absatz 1

After
', $c->saveHTML()); 444 | } 445 | 446 | public function testUnwrapInnerOnDOMElementExeption() 447 | { 448 | $this->expectException(\InvalidArgumentException::class); 449 | $this->expectErrorMessage('DOMElement does not have a parent DOMElement node.'); 450 | 451 | $c = HtmlPageCrawler::create('
'); 452 | $p = $c->filter('div#content'); 453 | $p->unwrapInner(); 454 | $p->unwrapInner(); 455 | } 456 | 457 | /** 458 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::unwrapInner 459 | */ 460 | public function testUnwrapInner() 461 | { 462 | $c = HtmlPageCrawler::create('
Before

Absatz 1

After
'); 463 | $p = $c->filter('div.a'); 464 | $p->unwrapInner(); 465 | $this->assertEquals('
Before

Absatz 1

After
', $c->saveHTML()); 466 | } 467 | 468 | /** 469 | * @covers Wa72\HtmlPageDom\HtmlPageCrawler::toggleClass 470 | */ 471 | public function testToggleClass() 472 | { 473 | $c = HtmlPageCrawler::create('
'); 474 | $c->filter('div')->toggleClass('a d')->toggleClass('b'); 475 | $this->assertEquals('
', $c->saveHTML()); 476 | } 477 | 478 | public function testRemove() 479 | { 480 | // remove every third td in tbody 481 | $html = << 483 | 484 | 485 | A 486 | B 487 | 488 | 489 | 490 | 491 | 16.12.2013 492 | asdf asdf 493 |   494 | 495 | 496 | 02.12.2013 16:30 497 | asdf asdf 498 |   499 | 500 | 501 | 25.11.2013 16:30 502 | asdf asdf 503 |   504 | 505 | 506 | 18.11.2013 16:30 507 | asdf asdf 508 |   509 | 510 | 511 | 24.10.2013 16:30 512 | asdf asdf 513 |   514 | 515 | 516 | 10.10.2013 16:30 517 | asdf asdf 518 |   519 | 520 | 521 | END; 522 | $c = HtmlPageCrawler::create($html); 523 | $this->assertEquals(1, count($c->filter('td.c23'))); 524 | $tbd = $c->filter('table > tbody > tr > td') 525 | ->reduce( 526 | function ($c, $j) { 527 | if (($j+1) % 3 == 0) { 528 | return true; 529 | } 530 | return false; 531 | } 532 | ); 533 | $this->assertEquals(6, count($tbd)); 534 | $tbd->remove(); 535 | $this->assertEquals(0, count($tbd)); 536 | $this->assertEquals(0, count($c->filter('td.c23'))); 537 | } 538 | 539 | public function testUTF8Characters() 540 | { 541 | $text = file_get_contents(__DIR__ . '/utf8.html'); 542 | $c = HtmlPageCrawler::create($text); 543 | 544 | $expected =<<< END 545 |

Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.

Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt – einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.

546 | END; 547 | 548 | $this->assertEquals($expected, $c->filter('p')->saveHTML()); 549 | } 550 | 551 | public function testAttr() 552 | { 553 | $c = HtmlPageCrawler::create('
'); 554 | $this->assertNull($c->attr('data-foo')); 555 | $c->setAttribute('data-foo', 'bar'); 556 | $this->assertEquals('bar', $c->attr('data-foo')); 557 | $this->assertEquals('bar', $c->getAttribute('data-foo')); 558 | $c->removeAttribute('data-foo'); 559 | $this->assertNull($c->attr('data-foo')); 560 | $c->setAttribute('data-foo', 'bar'); 561 | $this->assertEquals('bar', $c->attr('data-foo')); 562 | // getAttribute is just an alias to attr() and should provide the same result 563 | $this->assertEquals('bar', $c->getAttribute('data-foo')); 564 | $c->removeAttr('data-foo'); 565 | $this->assertNull($c->attr('data-foo')); 566 | 567 | } 568 | 569 | public function testAttrOnInvalidNodeList() 570 | { 571 | $this->expectException(\InvalidArgumentException::class); 572 | $c = HtmlPageCrawler::create(null); 573 | $c->attr('data-foo'); 574 | } 575 | 576 | public function testSetInnerHtml() 577 | { 578 | $html = HtmlPageCrawler::create('

Title

'); 579 | $this->assertInstanceOf('Wa72\HtmlPageDom\HtmlPageCrawler', $html->setInnerHtml('

Title

')); 580 | $this->assertEquals('

Title

', $html->html()); 581 | // getInnerHtml is just an alias for html() and should provide the same result 582 | $this->assertEquals('

Title

', $html->getInnerHtml()); 583 | } 584 | 585 | public function testToString() 586 | { 587 | $html = HtmlPageCrawler::create('

Title

'); 588 | $this->assertEquals('

Title

', (string) $html); 589 | } 590 | 591 | public function testGetDOMDocument() 592 | { 593 | $html = HtmlPageCrawler::create('

Title

'); 594 | $this->assertInstanceOf('\DOMDocument', $html->getDOMDocument()); 595 | } 596 | 597 | public function testAddOnCrawlerInstance() 598 | { 599 | $html = HtmlPageCrawler::create('

Title

'); 600 | $html->add($html); 601 | $this->assertEquals('

Title

', (string) $html); 602 | } 603 | 604 | public function testReturnValues() 605 | { 606 | // appendTo, insertBefore, insertAfter, replaceAll should always return new Crawler objects 607 | // see http://jquery.com/upgrade-guide/1.9/#appendto-insertbefore-insertafter-and-replaceall 608 | 609 | $c1 = HtmlPageCrawler::create('

Headline

'); 610 | $c2 = HtmlPageCrawler::create('

1

2

3

'); 611 | $c3 = HtmlPageCrawler::create('asdf'); 612 | 613 | $r1 = $c3->appendTo($c1); 614 | $this->assertNotEquals(spl_object_hash($c3), spl_object_hash($r1)); 615 | 616 | $r2 = $c3->insertBefore($c1); 617 | $this->assertNotEquals(spl_object_hash($c3), spl_object_hash($r2)); 618 | 619 | $r3 = $c3->insertAfter($c1); 620 | $this->assertNotEquals(spl_object_hash($c3), spl_object_hash($r3)); 621 | 622 | $r4 = $c3->replaceAll($c1); 623 | $this->assertNotEquals(spl_object_hash($c3), spl_object_hash($r4)); 624 | 625 | 626 | $r1 = $c3->appendTo($c2); 627 | $this->assertNotEquals(spl_object_hash($c2), spl_object_hash($r1)); 628 | 629 | $r2 = $c3->insertBefore($c2); 630 | $this->assertNotEquals(spl_object_hash($c2), spl_object_hash($r2)); 631 | 632 | $r3 = $c3->insertAfter($c2); 633 | $this->assertNotEquals(spl_object_hash($c2), spl_object_hash($r3)); 634 | 635 | $r4 = $c3->replaceAll($c2); 636 | $this->assertNotEquals(spl_object_hash($c2), spl_object_hash($r4)); 637 | 638 | } 639 | 640 | public function testDisconnectedNodes() 641 | { 642 | // if after(), before() or replaceWith() is called on a node without parent, 643 | // the unmodified Crawler object should be returned 644 | // 645 | // see http://jquery.com/upgrade-guide/1.9/#after-before-and-replacewith-with-disconnected-nodes 646 | $c = HtmlPageCrawler::create('
abc
'); 647 | $r = HtmlPageCrawler::create('
def
'); 648 | 649 | $r1 = $c->after($r); 650 | $this->assertEquals(spl_object_hash($r1), spl_object_hash($c)); 651 | $this->assertEquals(count($r1), count($c)); 652 | 653 | $r2 = $c->before($r); 654 | $this->assertEquals(spl_object_hash($r2), spl_object_hash($c)); 655 | $this->assertEquals(count($r2), count($c)); 656 | 657 | $r3 = $c->replaceWith($r); 658 | $this->assertEquals(spl_object_hash($r3), spl_object_hash($c)); 659 | $this->assertEquals(count($r3), count($c)); 660 | } 661 | 662 | public function testClone() 663 | { 664 | $c = HtmlPageCrawler::create('

asdf

'); 665 | $p = $c->filter('p'); 666 | 667 | $p1 = $p->makeClone(); 668 | $this->assertNotEquals(spl_object_hash($p), spl_object_hash($p1)); 669 | $this->assertTrue($p1->hasClass('x')); 670 | $p1->removeClass('x'); 671 | $this->assertTrue($p->hasClass('x')); 672 | $this->assertFalse($p1->hasClass('x')); 673 | $p->after($p1); 674 | $this->assertEquals('

asdf

asdf

', $c->saveHTML()); 675 | } 676 | 677 | public function testGetCombinedText() 678 | { 679 | $c = HtmlPageCrawler::create('

abc

def

'); 680 | $this->assertEquals('abcdef', $c->getCombinedText()); 681 | $c->setText('jklo'); 682 | $this->assertEquals('jklojklo', $c->getCombinedText()); 683 | } 684 | 685 | public function testSetText() 686 | { 687 | $c = HtmlPageCrawler::create('
"
'); 688 | $this->assertEquals('"', $c->text()); 689 | $c->setText('&'); 690 | $this->assertEquals('&', $c->text()); 691 | } 692 | 693 | public function testMagicGet() 694 | { 695 | // $crawler->length should give us the number of nodes in the crawler 696 | $c = HtmlPageCrawler::create('

abc

def

'); 697 | $this->assertEquals(2, $c->length); 698 | 699 | // not existing property throws exception 700 | try { 701 | $c->foo; 702 | } catch (\Exception $e) { 703 | $this->assertEquals('No such property foo', $e->getMessage()); 704 | return; 705 | } 706 | $this->fail(); 707 | } 708 | } 709 | -------------------------------------------------------------------------------- /Tests/HtmlPageTest.php: -------------------------------------------------------------------------------- 1 | root = vfsStream::setup('root'); 13 | } 14 | 15 | public function testHtmlPage() 16 | { 17 | $hp = new HtmlPage; 18 | $this->assertEquals("\n\n", $hp->__toString()); 19 | 20 | $title = 'Erste Testseite'; 21 | $hp->setTitle($title); 22 | $this->assertEquals($title, $hp->getTitle()); 23 | 24 | $title = 'Seite "schön & gut" >> so wird\'s, süß'; 25 | $hp->setTitle($title); 26 | $this->assertEquals($title, $hp->getTitle()); 27 | 28 | $description = 'Dies ist die erste "Testseite" >> so wird\'s, süß'; 29 | $hp->setMeta('description', $description); 30 | $this->assertEquals($description, $hp->getMeta('description')); 31 | 32 | $hp->removeMeta('description'); 33 | $this->assertNull($hp->getMeta('description')); 34 | 35 | $bodycontent = '
Testcontent1
'; 36 | $body = $hp->filter('body'); 37 | $body->setInnerHtml($bodycontent); 38 | $this->assertEquals($bodycontent, $body->html()); 39 | $this->assertEquals($bodycontent, $hp->filter('body')->html()); 40 | 41 | $content = "

Überschrift

\n

bla bla
fett

"; 42 | $hp->setHtmlById('content', $content); 43 | // echo $hp; 44 | $this->assertEquals($content, $hp->getElementById('content')->html()); 45 | 46 | $url = 'http://www.tuebingen.de/'; 47 | $hp->setBaseHref($url); 48 | $this->assertEquals($url, $hp->getBaseHref()); 49 | } 50 | 51 | 52 | public function testClone() 53 | { 54 | $hp = new HtmlPage; 55 | $this->assertEquals("\n\n", $hp->__toString()); 56 | 57 | $title = 'Erste Testseite'; 58 | $hp->setTitle($title); 59 | $this->assertEquals($title, $hp->getTitle()); 60 | 61 | $hp2 = clone $hp; 62 | 63 | $newtitle = 'Seitentitel neu'; 64 | $hp->setTitle($newtitle); 65 | 66 | $this->assertEquals($title, $hp2->getTitle()); 67 | $this->assertEquals($newtitle, $hp->getTitle()); 68 | } 69 | 70 | public function testScript() 71 | { 72 | $html =<< 74 | 75 | 76 | 77 | 81 | 82 | 83 | 84 | 85 | 86 | END; 87 | $hp = new HtmlPage($html); 88 | $hp->getBody()->append('

Script Test

'); 89 | $newhtml = $hp->save(); 90 | 91 | $expected =<< 93 | 94 | 95 | 96 | 100 | 101 | 102 |

Script Test

103 | 104 | 105 | END; 106 | $this->assertEquals($expected, $newhtml); 107 | 108 | } 109 | 110 | public function testMinify() 111 | { 112 | $html =<< 114 | 115 | 116 | 117 | 121 | 122 | 123 |

TEST

124 |

125 | asdf jksdlf ajsfk 126 | jasdf 127 | jaksfd asdf 128 | jasdf jaks 129 |

130 | 131 | 132 | 133 | END; 134 | $hp = new HtmlPage($html); 135 | 136 | $expected = << 138 |

TEST

asdf jksdlf ajsfk jasdf jaksfd asdf jasdf jaks

139 | 140 | END; 141 | $this->assertEquals($expected, $hp->minify()->save()); 142 | 143 | } 144 | 145 | public function testIndent() 146 | { 147 | $html =<< 149 | 150 | 151 | 152 | 156 | 157 | 158 |

TEST

159 |

160 | asdf jksdlf ajsfk 161 | jasdf 162 | jaksfd asdf 163 | jasdf jaks 164 |

165 | 166 | 167 | 168 | END; 169 | $hp = new HtmlPage($html); 170 | 171 | $expected = << 173 | 174 | 175 | 176 | 180 | 181 | 182 |

TEST

183 |

asdf jksdlf ajsfk jasdf jaksfd asdf jasdf jaks

184 | 185 | 186 | 187 | END; 188 | $this->assertEquals($expected, $hp->indent()->save()); 189 | 190 | } 191 | 192 | public function testGetCrawler() 193 | { 194 | $html = << 196 | 197 | 198 | 199 | 203 | 204 | 205 |

TEST

206 |

207 | asdf jksdlf ajsfk 208 | jasdf 209 | jaksfd asdf 210 | jasdf jaks 211 |

212 | 213 | 214 | 215 | END; 216 | 217 | $hp = new HtmlPage($html); 218 | $this->assertEquals('

TEST

', $hp->getCrawler()->filter('h1')->saveHtml()); 219 | } 220 | 221 | public function testGetDOMDocument() 222 | { 223 | $html = << 225 | 226 | 227 | 228 | 232 | 233 | 234 |

TEST

235 |

236 | asdf jksdlf ajsfk 237 | jasdf 238 | jaksfd asdf 239 | jasdf jaks 240 |

241 | 242 | 243 | 244 | END; 245 | 246 | $hp = new HtmlPage($html); 247 | $this->assertInstanceOf('\DOMDocument', $hp->getDOMDocument()); 248 | } 249 | 250 | public function testSetTitleOnNoTitleElement() 251 | { 252 | $html = << 254 | 255 | 256 | 260 | 261 | 262 |

TEST

263 |

264 | asdf jksdlf ajsfk 265 | jasdf 266 | jaksfd asdf 267 | jasdf jaks 268 |

269 | 270 | 271 | 272 | END; 273 | 274 | $hp = new HtmlPage($html); 275 | $hp->setTitle('TEST'); 276 | $this->assertEquals('TEST', $hp->getTitle()); 277 | } 278 | 279 | public function testGetTitleShouldReturnNull() 280 | { 281 | $html = << 283 | 284 | 285 | 289 | 290 | 291 |

TEST

292 |

293 | asdf jksdlf ajsfk 294 | jasdf 295 | jaksfd asdf 296 | jasdf jaks 297 |

298 | 299 | 300 | 301 | END; 302 | 303 | $hp = new HtmlPage($html); 304 | $this->assertNull($hp->getTitle()); 305 | } 306 | 307 | public function testGetBaseHrefShouldReturnNull() 308 | { 309 | $hp = new HtmlPage('TESTHello'); 310 | $this->assertNull($hp->getBaseHref()); 311 | } 312 | 313 | public function testGetHeadNodeShouldAddTheHeadTag() 314 | { 315 | $hp = new HtmlPage('Hello'); 316 | $this->assertInstanceOf('\DOMElement', $hp->getHeadNode()); 317 | $this->assertEquals('', (string) $hp->getHead()); 318 | } 319 | 320 | public function testGetBodyNodeShouldAddTheBodyTag() 321 | { 322 | $hp = new HtmlPage(''); 323 | $this->assertInstanceOf('\DOMElement', $hp->getBodyNode()); 324 | $this->assertEquals('', (string) $hp->getBody()); 325 | } 326 | 327 | public function testTrimNewlines() 328 | { 329 | $html = << 331 | 332 | 333 | TEST 334 | 335 | 336 | END; 337 | 338 | $this->assertEquals(' TEST ', (string) HtmlPage::trimNewlines($html)); 339 | } 340 | 341 | public function testSaveOnFileName() 342 | { 343 | $hp = new HtmlPage('TEST'); 344 | $hp->save(vfsStream::url('root/save.html')); 345 | $this->assertFileExists(vfsStream::url('root/save.html')); 346 | } 347 | 348 | public function testEmbeddedScriptWithHtml() 349 | { 350 | // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements 351 | // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string 352 | $html = << 354 | 355 | 356 | test 357 | 358 | 359 |
360 | 363 |
364 | 365 | 366 | END; 367 | $hp = new HtmlPage($html); 368 | $this->assertEquals($html . "\n", $hp->save()); 369 | } 370 | } 371 | -------------------------------------------------------------------------------- /Tests/phpunit_bootstrap.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.

9 |

Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt – einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.

10 | 11 | 12 | -------------------------------------------------------------------------------- /UPGRADE.md: -------------------------------------------------------------------------------- 1 | Upgrade from 2.x to 3.0 2 | ----------------------- 3 | 4 | Release 3.x is compatible only with Symfony 6, while older releases are compatible with Symfony up to 5.4. 5 | Otherwise there are no changes in our API, so no changes should be required in your code using this lib. Just upgrade to Version 3 when you upgrade your project to Symfony 6 and all should be well. 6 | 7 | 8 | Upgrade from 1.x to 2.0 9 | ------------------------ 10 | 11 | Several changes have been made to the public API in 2.0 in order to keep 12 | compatibility with Symfony 4.3: 13 | 14 | - `HtmlPageCrawler::html()` is now just the parent `Crawler::html()` and acts as *getter* only. 15 | Setting HTML content via `HtmlPageCrawler::html($html)` is *not possible* any more, 16 | use `HtmlPageCrawler::setInnerHtml($html)` instead 17 | 18 | - `HtmlPageCrawler::text()` is now just the parent `Crawler::text()` and acts as *getter* only 19 | that returns the text content from the *first* node only. For setting text content, use 20 | `HtmlPageCrawler::setText($text)` instead. 21 | 22 | - new method `HtmlPageCrawler::getCombinedText()` that returns the combined text from all nodes 23 | (as jQuery's `text()` function does and previous versions of `HtmlPageCrawler::text()` did) 24 | 25 | - `HtmlPageCrawler::attr()` is now just the parent `Crawler::attr()` and acts as *getter* only. 
26 | For setting attributes use `HtmlPageCrawler::setAttribute($name, $value)` 27 | 28 | - removed method `HtmlPageCrawler::isDisconnected()` 29 | 30 | __To update your code, you have to:__ 31 | 32 | - replace all calls to `$MyCrawlerInstance->html($html)` used as *setter* by `$MyCrawlerInstance->setInnerHtml($html)` 33 | - replace all calls to `$MyCrawlerInstance->attr($name, $value)` used as *setter* by `$MyCrawlerInstance->setAttribute($name, $value)` 34 | - replace all calls to `$MyCrawlerInstance->text($text)` used as *setter* by `$MyCrawlerInstance->setText($text)` 35 | - replace all calls to `$MyCrawlerInstance->text()` (i.e. every call to `text()` not preceded by `first()`) by `$MyCrawlerInstance->getCombinedText()` 36 | - replace all calls to `$MyCrawlerInstance->first()->text()` by `$MyCrawlerInstance->text()` 37 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name":"wa72/htmlpagedom", 3 | "description":"jQuery-inspired DOM manipulation extension for Symfony's Crawler", 4 | "keywords":["HTML", "DOM", "Crawler"], 5 | "homepage":"http://github.com/wasinger/htmlpagedom", 6 | "type":"library", 7 | "license":"MIT", 8 | "authors":[ 9 | { 10 | "name":"Christoph Singer", 11 | "email":"singer@webagentur72.de", 12 | "homepage":"http://www.webagentur72.de" 13 | } 14 | ], 15 | "require":{ 16 | "php":"^8.0", 17 | "ext-dom":"*", 18 | "ext-libxml":"*", 19 | "symfony/polyfill-mbstring": "~1.0", 20 | "symfony/dom-crawler":"^6.0 || ^7.0", 21 | "symfony/css-selector":"^6.0 || ^7.0" 22 | }, 23 | "require-dev": { 24 | "phpunit/phpunit": "^9", 25 | "wa72/html-pretty-min": "~0.1", 26 | "mikey179/vfsstream": "^1.6.10", 27 | "scrutinizer/ocular": "^1.9", 28 | "clean/phpdoc-md": "^0.19.3" 29 | }, 30 | "suggest": { 31 | "wa72/html-pretty-min": "Minify or indent HTML documents" 32 | }, 33 | "autoload":{ 34 | "psr-4":{ 35 | 
"Wa72\\HtmlPageDom\\":"src/" 36 | } 37 | }, 38 | "extra": { 39 | "branch-alias": { 40 | "dev-master": "3.0-dev" 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /doc/HtmlPage.md: -------------------------------------------------------------------------------- 1 | # Wa72\HtmlPageDom\HtmlPage 2 | 3 | This class represents a complete HTML document. 4 | 5 | It offers convenience functions for getting and setting elements of the document 6 | such as setTitle(), getTitle(), setMeta($name, $value), getBody(). 7 | 8 | It uses HtmlPageCrawler to navigate and manipulate the DOM tree. 9 | 10 | ## Implements: 11 | Stringable 12 | 13 | 14 | 15 | ## Methods 16 | 17 | | Name | Description | 18 | |------|-------------| 19 | |[__clone](#htmlpage__clone)|| 20 | |[__construct](#htmlpage__construct)|| 21 | |[__toString](#htmlpage__tostring)|| 22 | |[filter](#htmlpagefilter)|Filter nodes by using a CSS selector| 23 | |[filterXPath](#htmlpagefilterxpath)|Filter nodes by XPath expression| 24 | |[getBaseHref](#htmlpagegetbasehref)|Get the href attribute from the base tag, null if not present in document| 25 | |[getBody](#htmlpagegetbody)|Get the document's body wrapped in a HtmlPageCrawler instance| 26 | |[getBodyNode](#htmlpagegetbodynode)|Get the document's body as DOMElement| 27 | |[getCrawler](#htmlpagegetcrawler)|Get a HtmlPageCrawler object containing the root node of the HTML document| 28 | |[getDOMDocument](#htmlpagegetdomdocument)|Get a DOMDocument object for the HTML document| 29 | |[getElementById](#htmlpagegetelementbyid)|Get an element in the document by it's id attribute| 30 | |[getHead](#htmlpagegethead)|Get the document's HEAD section wrapped in a HtmlPageCrawler instance| 31 | |[getHeadNode](#htmlpagegetheadnode)|Get the document's HEAD section as DOMElement| 32 | |[getMeta](#htmlpagegetmeta)|Get the content attribute of a meta tag with the specified name attribute| 33 | |[getTitle](#htmlpagegettitle)|Get the page 
title of the HTML document| 34 | |[indent](#htmlpageindent)|indent the HTML document| 35 | |[minify](#htmlpageminify)|minify the HTML document| 36 | |[removeMeta](#htmlpageremovemeta)|Remove all meta tags with the specified name attribute| 37 | |[save](#htmlpagesave)|Save this document to a HTML file or return HTML code as string| 38 | |[setBaseHref](#htmlpagesetbasehref)|Set the base tag with href attribute set to parameter $url| 39 | |[setHtmlById](#htmlpagesethtmlbyid)|Sets innerHTML content of an element specified by elementId| 40 | |[setMeta](#htmlpagesetmeta)|Set a META tag with specified 'name' and 'content' attributes| 41 | |[setTitle](#htmlpagesettitle)|Sets the page title of the HTML document| 42 | |[trimNewlines](#htmlpagetrimnewlines)|remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space)| 43 | 44 | 45 | 46 | 47 | ### HtmlPage::__clone 48 | 49 | **Description** 50 | 51 | ```php 52 | __clone (void) 53 | ``` 54 | 55 | 56 | 57 | 58 | 59 | **Parameters** 60 | 61 | `This function has no parameters.` 62 | 63 | **Return Values** 64 | 65 | `void` 66 | 67 | 68 |
69 | 70 | 71 | ### HtmlPage::__construct 72 | 73 | **Description** 74 | 75 | ```php 76 | __construct (void) 77 | ``` 78 | 79 | 80 | 81 | 82 | 83 | **Parameters** 84 | 85 | `This function has no parameters.` 86 | 87 | **Return Values** 88 | 89 | `void` 90 | 91 | 92 |
93 | 94 | 95 | ### HtmlPage::__toString 96 | 97 | **Description** 98 | 99 | ```php 100 | __toString (void) 101 | ``` 102 | 103 | 104 | 105 | 106 | 107 | **Parameters** 108 | 109 | `This function has no parameters.` 110 | 111 | **Return Values** 112 | 113 | `void` 114 | 115 | 116 |
117 | 118 | 119 | ### HtmlPage::filter 120 | 121 | **Description** 122 | 123 | ```php 124 | public filter (string $selector) 125 | ``` 126 | 127 | Filter nodes by using a CSS selector 128 | 129 | 130 | 131 | **Parameters** 132 | 133 | * `(string) $selector` 134 | : CSS selector 135 | 136 | **Return Values** 137 | 138 | `\HtmlPageCrawler` 139 | 140 | 141 | 142 | 143 |
144 | 145 | 146 | ### HtmlPage::filterXPath 147 | 148 | **Description** 149 | 150 | ```php 151 | public filterXPath (string $xpath) 152 | ``` 153 | 154 | Filter nodes by XPath expression 155 | 156 | 157 | 158 | **Parameters** 159 | 160 | * `(string) $xpath` 161 | : XPath expression 162 | 163 | **Return Values** 164 | 165 | `\HtmlPageCrawler` 166 | 167 | 168 | 169 | 170 |
171 | 172 | 173 | ### HtmlPage::getBaseHref 174 | 175 | **Description** 176 | 177 | ```php 178 | public getBaseHref (void) 179 | ``` 180 | 181 | Get the href attribute from the base tag, null if not present in document 182 | 183 | 184 | 185 | **Parameters** 186 | 187 | `This function has no parameters.` 188 | 189 | **Return Values** 190 | 191 | `null|string` 192 | 193 | 194 | 195 | 196 |
197 | 198 | 199 | ### HtmlPage::getBody 200 | 201 | **Description** 202 | 203 | ```php 204 | public getBody (void) 205 | ``` 206 | 207 | Get the document's body wrapped in a HtmlPageCrawler instance 208 | 209 | 210 | 211 | **Parameters** 212 | 213 | `This function has no parameters.` 214 | 215 | **Return Values** 216 | 217 | `\HtmlPageCrawler` 218 | 219 | 220 | 221 | 222 |
223 | 224 | 225 | ### HtmlPage::getBodyNode 226 | 227 | **Description** 228 | 229 | ```php 230 | public getBodyNode (void) 231 | ``` 232 | 233 | Get the document's body as DOMElement 234 | 235 | 236 | 237 | **Parameters** 238 | 239 | `This function has no parameters.` 240 | 241 | **Return Values** 242 | 243 | `\DOMElement` 244 | 245 | 246 | 247 | 248 |
249 | 250 | 251 | ### HtmlPage::getCrawler 252 | 253 | **Description** 254 | 255 | ```php 256 | public getCrawler (void) 257 | ``` 258 | 259 | Get a HtmlPageCrawler object containing the root node of the HTML document 260 | 261 | 262 | 263 | **Parameters** 264 | 265 | `This function has no parameters.` 266 | 267 | **Return Values** 268 | 269 | `\HtmlPageCrawler` 270 | 271 | 272 | 273 | 274 |
275 | 276 | 277 | ### HtmlPage::getDOMDocument 278 | 279 | **Description** 280 | 281 | ```php 282 | public getDOMDocument (void) 283 | ``` 284 | 285 | Get a DOMDocument object for the HTML document 286 | 287 | 288 | 289 | **Parameters** 290 | 291 | `This function has no parameters.` 292 | 293 | **Return Values** 294 | 295 | `\DOMDocument` 296 | 297 | 298 | 299 | 300 |
301 | 302 | 303 | ### HtmlPage::getElementById 304 | 305 | **Description** 306 | 307 | ```php 308 | public getElementById (string $id) 309 | ``` 310 | 311 | Get an element in the document by its id attribute 312 | 313 | 314 | 315 | **Parameters** 316 | 317 | * `(string) $id` 318 | 319 | **Return Values** 320 | 321 | `\HtmlPageCrawler` 322 | 323 | 324 | 325 | 326 |
327 | 328 | 329 | ### HtmlPage::getHead 330 | 331 | **Description** 332 | 333 | ```php 334 | public getHead (void) 335 | ``` 336 | 337 | Get the document's HEAD section wrapped in a HtmlPageCrawler instance 338 | 339 | 340 | 341 | **Parameters** 342 | 343 | `This function has no parameters.` 344 | 345 | **Return Values** 346 | 347 | `\HtmlPageCrawler` 348 | 349 | 350 | 351 | 352 |
353 | 354 | 355 | ### HtmlPage::getHeadNode 356 | 357 | **Description** 358 | 359 | ```php 360 | public getHeadNode (void) 361 | ``` 362 | 363 | Get the document's HEAD section as DOMElement 364 | 365 | 366 | 367 | **Parameters** 368 | 369 | `This function has no parameters.` 370 | 371 | **Return Values** 372 | 373 | `\DOMElement` 374 | 375 | 376 | 377 | 378 |
379 | 380 | 381 | ### HtmlPage::getMeta 382 | 383 | **Description** 384 | 385 | ```php 386 | public getMeta (string $name) 387 | ``` 388 | 389 | Get the content attribute of a meta tag with the specified name attribute 390 | 391 | 392 | 393 | **Parameters** 394 | 395 | * `(string) $name` 396 | 397 | **Return Values** 398 | 399 | `null|string` 400 | 401 | 402 | 403 | 404 |
405 | 406 | 407 | ### HtmlPage::getTitle 408 | 409 | **Description** 410 | 411 | ```php 412 | public getTitle (void) 413 | ``` 414 | 415 | Get the page title of the HTML document 416 | 417 | 418 | 419 | **Parameters** 420 | 421 | `This function has no parameters.` 422 | 423 | **Return Values** 424 | 425 | `null|string` 426 | 427 | 428 | 429 | 430 |
431 | 432 | 433 | ### HtmlPage::indent 434 | 435 | **Description** 436 | 437 | ```php 438 | public indent (array $options) 439 | ``` 440 | 441 | indent the HTML document 442 | 443 | 444 | 445 | **Parameters** 446 | 447 | * `(array) $options` 448 | : Options passed to PrettyMin::__construct() 449 | 450 | **Return Values** 451 | 452 | `\HtmlPage` 453 | 454 | 455 | 456 | 457 | **Throws Exceptions** 458 | 459 | 460 | `\Exception` 461 | 462 | 463 |
464 | 465 | 466 | ### HtmlPage::minify 467 | 468 | **Description** 469 | 470 | ```php 471 | public minify (array $options) 472 | ``` 473 | 474 | minify the HTML document 475 | 476 | 477 | 478 | **Parameters** 479 | 480 | * `(array) $options` 481 | : Options passed to PrettyMin::__construct() 482 | 483 | **Return Values** 484 | 485 | `\HtmlPage` 486 | 487 | 488 | 489 | 490 | **Throws Exceptions** 491 | 492 | 493 | `\Exception` 494 | 495 | 496 |
497 | 498 | 499 | ### HtmlPage::removeMeta 500 | 501 | **Description** 502 | 503 | ```php 504 | public removeMeta (string $name) 505 | ``` 506 | 507 | Remove all meta tags with the specified name attribute 508 | 509 | 510 | 511 | **Parameters** 512 | 513 | * `(string) $name` 514 | 515 | **Return Values** 516 | 517 | `void` 518 | 519 | 520 |
521 | 522 | 523 | ### HtmlPage::save 524 | 525 | **Description** 526 | 527 | ```php 528 | public save (string $filename) 529 | ``` 530 | 531 | Save this document to a HTML file or return HTML code as string 532 | 533 | 534 | 535 | **Parameters** 536 | 537 | * `(string) $filename` 538 | : If provided, output will be saved to this file, otherwise returned 539 | 540 | **Return Values** 541 | 542 | `string|void` 543 | 544 | 545 | 546 | 547 |
548 | 549 | 550 | ### HtmlPage::setBaseHref 551 | 552 | **Description** 553 | 554 | ```php 555 | public setBaseHref (string $url) 556 | ``` 557 | 558 | Set the base tag with href attribute set to parameter $url 559 | 560 | 561 | 562 | **Parameters** 563 | 564 | * `(string) $url` 565 | 566 | **Return Values** 567 | 568 | `void` 569 | 570 | 571 |
572 | 573 | 574 | ### HtmlPage::setHtmlById 575 | 576 | **Description** 577 | 578 | ```php 579 | public setHtmlById (string $elementId, string $html) 580 | ``` 581 | 582 | Sets innerHTML content of an element specified by elementId 583 | 584 | 585 | 586 | **Parameters** 587 | 588 | * `(string) $elementId` 589 | * `(string) $html` 590 | 591 | **Return Values** 592 | 593 | `void` 594 | 595 | 596 |
597 | 598 | 599 | ### HtmlPage::setMeta 600 | 601 | **Description** 602 | 603 | ```php 604 | public setMeta ( $name, $content) 605 | ``` 606 | 607 | Set a META tag with specified 'name' and 'content' attributes 608 | 609 | 610 | 611 | **Parameters** 612 | 613 | * `() $name` 614 | * `() $content` 615 | 616 | **Return Values** 617 | 618 | `void` 619 | 620 | 621 |
622 | 623 | 624 | ### HtmlPage::setTitle 625 | 626 | **Description** 627 | 628 | ```php 629 | public setTitle (string $title) 630 | ``` 631 | 632 | Sets the page title of the HTML document 633 | 634 | 635 | 636 | **Parameters** 637 | 638 | * `(string) $title` 639 | 640 | **Return Values** 641 | 642 | `void` 643 | 644 | 645 |
646 | 647 | 648 | ### HtmlPage::trimNewlines 649 | 650 | **Description** 651 | 652 | ```php 653 | public static trimNewlines (string $string) 654 | ``` 655 | 656 | remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space) 657 | 658 | useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode) 659 | 660 | **Parameters** 661 | 662 | * `(string) $string` 663 | 664 | **Return Values** 665 | 666 | `string` 667 | 668 | 669 | 670 | 671 |
672 | 673 | -------------------------------------------------------------------------------- /doc/HtmlPageCrawler.md: -------------------------------------------------------------------------------- 1 | # Wa72\HtmlPageDom\HtmlPageCrawler 2 | 3 | Extends \Symfony\Component\DomCrawler\Crawler by adding tree manipulation functions 4 | for HTML documents inspired by jQuery such as setInnerHtml(), css(), append(), prepend(), before(), 5 | addClass(), removeClass() 6 | 7 | ## Implements: 8 | Countable, IteratorAggregate, Traversable, Stringable 9 | 10 | ## Extend: 11 | 12 | Symfony\Component\DomCrawler\Crawler 13 | 14 | ## Methods 15 | 16 | | Name | Description | 17 | |------|-------------| 18 | |[__clone](#htmlpagecrawler__clone)|| 19 | |[__get](#htmlpagecrawler__get)|| 20 | |[__toString](#htmlpagecrawler__tostring)|| 21 | |[addClass](#htmlpagecrawleraddclass)|Adds the specified class(es) to each element in the set of matched elements.| 22 | |[addHtmlFragment](#htmlpagecrawleraddhtmlfragment)|| 23 | |[after](#htmlpagecrawlerafter)|Insert content, specified by the parameter, after each element in the set of matched elements.| 24 | |[append](#htmlpagecrawlerappend)|Insert HTML content as child nodes of each element after existing children| 25 | |[appendTo](#htmlpagecrawlerappendto)|Insert every element in the set of matched elements to the end of the target.| 26 | |[before](#htmlpagecrawlerbefore)|Insert content, specified by the parameter, before each element in the set of matched elements.| 27 | |[create](#htmlpagecrawlercreate)|Get an HtmlPageCrawler object from a HTML string, DOMNode, DOMNodeList or HtmlPageCrawler| 28 | |[css](#htmlpagecrawlercss)|Get one CSS style property of the first element or set it for all elements in the list| 29 | |[getAttribute](#htmlpagecrawlergetattribute)|Returns the attribute value of the first node of the list.| 30 | |[getCombinedText](#htmlpagecrawlergetcombinedtext)|Get the combined text contents of each element in the set of matched 
elements, including their descendants.| 31 | |[getDOMDocument](#htmlpagecrawlergetdomdocument)|get ownerDocument of the first element| 32 | |[getInnerHtml](#htmlpagecrawlergetinnerhtml)|Alias for Crawler::html() for naming consistency with setInnerHtml()| 33 | |[getStyle](#htmlpagecrawlergetstyle)|get one CSS style property of the first element| 34 | |[hasClass](#htmlpagecrawlerhasclass)|Determine whether any of the matched elements are assigned the given class.| 35 | |[insertAfter](#htmlpagecrawlerinsertafter)|Insert every element in the set of matched elements after the target.| 36 | |[insertBefore](#htmlpagecrawlerinsertbefore)|Insert every element in the set of matched elements before the target.| 37 | |[isHtmlDocument](#htmlpagecrawlerishtmldocument)|checks whether the first node contains a complete html document (as opposed to a document fragment)| 38 | |[makeClone](#htmlpagecrawlermakeclone)|Create a deep copy of the set of matched elements.| 39 | |[makeEmpty](#htmlpagecrawlermakeempty)|Removes all child nodes and text from all nodes in set| 40 | |[prepend](#htmlpagecrawlerprepend)|Insert content, specified by the parameter, to the beginning of each element in the set of matched elements.| 41 | |[prependTo](#htmlpagecrawlerprependto)|Insert every element in the set of matched elements to the beginning of the target.| 42 | |[remove](#htmlpagecrawlerremove)|Remove the set of matched elements from the DOM.| 43 | |[removeAttr](#htmlpagecrawlerremoveattr)|Remove an attribute from each element in the set of matched elements.| 44 | |[removeAttribute](#htmlpagecrawlerremoveattribute)|Remove an attribute from each element in the set of matched elements.| 45 | |[removeClass](#htmlpagecrawlerremoveclass)|Remove a class from each element in the list| 46 | |[replaceAll](#htmlpagecrawlerreplaceall)|Replace each target element with the set of matched elements.| 47 | |[replaceWith](#htmlpagecrawlerreplacewith)|Replace each element in the set of matched elements with the 
provided new content and return the set of elements that was removed.| 48 | |[saveHTML](#htmlpagecrawlersavehtml)|Get the HTML code fragment of all elements and their contents.| 49 | |[setAttribute](#htmlpagecrawlersetattribute)|Sets an attribute on each element| 50 | |[setInnerHtml](#htmlpagecrawlersetinnerhtml)|Set the HTML contents of each element| 51 | |[setStyle](#htmlpagecrawlersetstyle)|set one CSS style property for all elements in the list| 52 | |[setText](#htmlpagecrawlersettext)|Set the text contents of the matched elements.| 53 | |[toggleClass](#htmlpagecrawlertoggleclass)|Add or remove one or more classes from each element in the set of matched elements, depending the class’s presence.| 54 | |[unwrap](#htmlpagecrawlerunwrap)|Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place.| 55 | |[unwrapInner](#htmlpagecrawlerunwrapinner)|Remove the matched elements, but promote the children to take their place.| 56 | |[wrap](#htmlpagecrawlerwrap)|Wrap an HTML structure around each element in the set of matched elements| 57 | |[wrapAll](#htmlpagecrawlerwrapall)|Wrap an HTML structure around all elements in the set of matched elements.| 58 | |[wrapInner](#htmlpagecrawlerwrapinner)|Wrap an HTML structure around the content of each element in the set of matched elements.| 59 | 60 | ## Inherited methods 61 | 62 | | Name | Description | 63 | |------|-------------| 64 | | [__construct](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.__construct.php) | - | 65 | |add|Adds a node to the current list of nodes.| 66 | |addContent|Adds HTML/XML content.| 67 | |addDocument|Adds a \DOMDocument to the list of nodes.| 68 | |addHtmlContent|Adds an HTML content to the list of nodes.| 69 | |addNode|Adds a \DOMNode instance to the list of nodes.| 70 | |addNodeList|Adds a \DOMNodeList to the list of nodes.| 71 | |addNodes|Adds an array of \DOMNode instances to the list of nodes.| 72 | |addXmlContent|Adds an 
XML content to the list of nodes.| 73 | |ancestors|Returns the ancestors of the current selection.| 74 | |attr|Returns the attribute value of the first node of the list.| 75 | |children|Returns the children nodes of the current selection.| 76 | |clear|Removes all the nodes.| 77 | |closest|Return first parents (heading toward the document root) of the Element that matches the provided selector.| 78 | | [count](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.count.php) | - | 79 | |each|Calls an anonymous function on each node of the list.| 80 | |eq|Returns a node given its position in the node list.| 81 | |evaluate|Evaluates an XPath expression.| 82 | |extract|Extracts information from the list of nodes.| 83 | |filter|Filters the list of nodes with a CSS selector.| 84 | |filterXPath|Filters the list of nodes with an XPath expression.| 85 | |first|Returns the first node of the current selection.| 86 | |form|Returns a Form object for the first node in the list.| 87 | |getBaseHref|Returns base href.| 88 | | [getIterator](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.getiterator.php) | - | 89 | | [getNode](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.getnode.php) | - | 90 | |getUri|Returns the current URI.| 91 | |html|Returns the first node of the list as HTML.| 92 | |image|Returns an Image object for the first node in the list.| 93 | |images|Returns an array of Image objects for the nodes in the list.| 94 | |innerText|Returns only the inner text that is the direct descendent of the current node, excluding any child nodes.| 95 | |last|Returns the last node of the current selection.| 96 | |link|Returns a Link object for the first node in the list.| 97 | |links|Returns an array of Link objects for the nodes in the list.| 98 | | [matches](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.matches.php) | - | 99 | |nextAll|Returns the next siblings nodes of the current selection.| 100 | 
|nodeName|Returns the node name of the first node of the list.| 101 | | [outerHtml](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.outerhtml.php) | - | 102 | |previousAll|Returns the previous sibling nodes of the current selection.| 103 | |reduce|Reduces the list of nodes by calling an anonymous function.| 104 | | [registerNamespace](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.registernamespace.php) | - | 105 | |selectButton|Selects a button by name or alt value for images.| 106 | |selectImage|Selects images by alt value.| 107 | |selectLink|Selects links by name or alt value for clickable images.| 108 | |setDefaultNamespacePrefix|Overloads a default namespace prefix to be used with XPath and CSS expressions.| 109 | |siblings|Returns the siblings nodes of the current selection.| 110 | |slice|Slices the list of nodes by $offset and $length.| 111 | |text|Returns the text of the first node of the list.| 112 | |xpathLiteral|Converts string for XPath expressions.| 113 | 114 | 115 | 116 | ### HtmlPageCrawler::__clone 117 | 118 | **Description** 119 | 120 | ```php 121 | __clone (void) 122 | ``` 123 | 124 | 125 | 126 | 127 | 128 | **Parameters** 129 | 130 | `This function has no parameters.` 131 | 132 | **Return Values** 133 | 134 | `void` 135 | 136 | 137 |
138 | 139 | 140 | ### HtmlPageCrawler::__get 141 | 142 | **Description** 143 | 144 | ```php 145 | __get (void) 146 | ``` 147 | 148 | 149 | 150 | 151 | 152 | **Parameters** 153 | 154 | `This function has no parameters.` 155 | 156 | **Return Values** 157 | 158 | `void` 159 | 160 | 161 |
162 | 163 | 164 | ### HtmlPageCrawler::__toString 165 | 166 | **Description** 167 | 168 | ```php 169 | __toString (void) 170 | ``` 171 | 172 | 173 | 174 | 175 | 176 | **Parameters** 177 | 178 | `This function has no parameters.` 179 | 180 | **Return Values** 181 | 182 | `string` 183 | 184 | 185 |
186 | 187 | 188 | ### HtmlPageCrawler::addClass 189 | 190 | **Description** 191 | 192 | ```php 193 | public addClass (string $name) 194 | ``` 195 | 196 | Adds the specified class(es) to each element in the set of matched elements. 197 | 198 | 199 | 200 | **Parameters** 201 | 202 | * `(string) $name` 203 | : One or more space-separated classes to be added to the class attribute of each matched element. 204 | 205 | **Return Values** 206 | 207 | `\HtmlPageCrawler` 208 | 209 | > $this for chaining 210 | 211 | 212 |
213 | 214 | 215 | ### HtmlPageCrawler::addHtmlFragment 216 | 217 | **Description** 218 | 219 | ```php 220 | addHtmlFragment (void) 221 | ``` 222 | 223 | 224 | 225 | 226 | 227 | **Parameters** 228 | 229 | `This function has no parameters.` 230 | 231 | **Return Values** 232 | 233 | `void` 234 | 235 | 236 |
237 | 238 | 239 | ### HtmlPageCrawler::after 240 | 241 | **Description** 242 | 243 | ```php 244 | public after (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 245 | ``` 246 | 247 | Insert content, specified by the parameter, after each element in the set of matched elements. 248 | 249 | 250 | 251 | **Parameters** 252 | 253 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 254 | 255 | **Return Values** 256 | 257 | `\HtmlPageCrawler` 258 | 259 | > $this for chaining 260 | 261 | 262 |
263 | 264 | 265 | ### HtmlPageCrawler::append 266 | 267 | **Description** 268 | 269 | ```php 270 | public append (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 271 | ``` 272 | 273 | Insert HTML content as child nodes of each element after existing children 274 | 275 | 276 | 277 | **Parameters** 278 | 279 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 280 | : HTML code fragment or DOMNode to append 281 | 282 | **Return Values** 283 | 284 | `\HtmlPageCrawler` 285 | 286 | > $this for chaining 287 | 288 | 289 |
290 | 291 | 292 | ### HtmlPageCrawler::appendTo 293 | 294 | **Description** 295 | 296 | ```php 297 | public appendTo (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element) 298 | ``` 299 | 300 | Insert every element in the set of matched elements to the end of the target. 301 | 302 | 303 | 304 | **Parameters** 305 | 306 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element` 307 | 308 | **Return Values** 309 | 310 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 311 | 312 | > A new Crawler object containing all elements appended to the target elements 313 | 314 | 315 |
316 | 317 | 318 | ### HtmlPageCrawler::before 319 | 320 | **Description** 321 | 322 | ```php 323 | public before (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 324 | ``` 325 | 326 | Insert content, specified by the parameter, before each element in the set of matched elements. 327 | 328 | 329 | 330 | **Parameters** 331 | 332 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 333 | 334 | **Return Values** 335 | 336 | `\HtmlPageCrawler` 337 | 338 | > $this for chaining 339 | 340 | 341 |
342 | 343 | 344 | ### HtmlPageCrawler::create 345 | 346 | **Description** 347 | 348 | ```php 349 | public static create (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList|array $content) 350 | ``` 351 | 352 | Get an HtmlPageCrawler object from a HTML string, DOMNode, DOMNodeList or HtmlPageCrawler 353 | 354 | This is the equivalent to jQuery's $() function when used for wrapping DOMNodes or creating DOMElements from HTML code. 355 | 356 | **Parameters** 357 | 358 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList|array) $content` 359 | 360 | **Return Values** 361 | 362 | `\HtmlPageCrawler` 363 | 364 | 365 | 366 | 367 |
368 | 369 | 370 | ### HtmlPageCrawler::css 371 | 372 | **Description** 373 | 374 | ```php 375 | public css (string $key, null|string $value) 376 | ``` 377 | 378 | Get one CSS style property of the first element or set it for all elements in the list 379 | 380 | Function is here for compatibility with jQuery; it is the same as getStyle() and setStyle() 381 | 382 | **Parameters** 383 | 384 | * `(string) $key` 385 | : The name of the style property 386 | * `(null|string) $value` 387 | : The CSS value to set, or NULL to get the current value 388 | 389 | **Return Values** 390 | 391 | `\HtmlPageCrawler|string` 392 | 393 | > If $value is NULL, returns the value of the CSS property for the first element; otherwise $this for chaining 394 | 395 | 396 |
397 | 398 | 399 | ### HtmlPageCrawler::getAttribute 400 | 401 | **Description** 402 | 403 | ```php 404 | public getAttribute (string $name) 405 | ``` 406 | 407 | Returns the attribute value of the first node of the list. 408 | 409 | This is just an alias for attr() for naming consistency with setAttribute() 410 | 411 | **Parameters** 412 | 413 | * `(string) $name` 414 | : The attribute name 415 | 416 | **Return Values** 417 | 418 | `string|null` 419 | 420 | > The attribute value or null if the attribute does not exist 421 | 422 | 423 | **Throws Exceptions** 424 | 425 | 426 | `\InvalidArgumentException` 427 | > When current node is empty 428 | 429 |
430 | 431 | 432 | ### HtmlPageCrawler::getCombinedText 433 | 434 | **Description** 435 | 436 | ```php 437 | public getCombinedText (void) 438 | ``` 439 | 440 | Get the combined text contents of each element in the set of matched elements, including their descendants. 441 | 442 | This is what the jQuery text() function does, contrary to the Crawler::text() method that returns only 443 | the text of the first node. 444 | 445 | **Parameters** 446 | 447 | `This function has no parameters.` 448 | 449 | **Return Values** 450 | 451 | `string` 452 | 453 | 454 | 455 | 456 |
457 | 458 | 459 | ### HtmlPageCrawler::getDOMDocument 460 | 461 | **Description** 462 | 463 | ```php 464 | public getDOMDocument (void) 465 | ``` 466 | 467 | get ownerDocument of the first element 468 | 469 | 470 | 471 | **Parameters** 472 | 473 | `This function has no parameters.` 474 | 475 | **Return Values** 476 | 477 | `\DOMDocument|null` 478 | 479 | 480 | 481 | 482 |
483 | 484 | 485 | ### HtmlPageCrawler::getInnerHtml 486 | 487 | **Description** 488 | 489 | ```php 490 | public getInnerHtml (void) 491 | ``` 492 | 493 | Alias for Crawler::html() for naming consistency with setInnerHtml() 494 | 495 | 496 | 497 | **Parameters** 498 | 499 | `This function has no parameters.` 500 | 501 | **Return Values** 502 | 503 | `string` 504 | 505 | 506 | 507 | 508 |
509 | 510 | 511 | ### HtmlPageCrawler::getStyle 512 | 513 | **Description** 514 | 515 | ```php 516 | public getStyle (string $key) 517 | ``` 518 | 519 | get one CSS style property of the first element 520 | 521 | 522 | 523 | **Parameters** 524 | 525 | * `(string) $key` 526 | : name of the property 527 | 528 | **Return Values** 529 | 530 | `string|null` 531 | 532 | > value of the property 533 | 534 | 535 |
536 | 537 | 538 | ### HtmlPageCrawler::hasClass 539 | 540 | **Description** 541 | 542 | ```php 543 | public hasClass (string $name) 544 | ``` 545 | 546 | Determine whether any of the matched elements are assigned the given class. 547 | 548 | 549 | 550 | **Parameters** 551 | 552 | * `(string) $name` 553 | 554 | **Return Values** 555 | 556 | `bool` 557 | 558 | 559 | 560 | 561 |
562 | 563 | 564 | ### HtmlPageCrawler::insertAfter 565 | 566 | **Description** 567 | 568 | ```php 569 | public insertAfter (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element) 570 | ``` 571 | 572 | Insert every element in the set of matched elements after the target. 573 | 574 | 575 | 576 | **Parameters** 577 | 578 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element` 579 | 580 | **Return Values** 581 | 582 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 583 | 584 | > A new Crawler object containing all elements inserted after the target elements 585 | 586 | 587 |
588 | 589 | 590 | ### HtmlPageCrawler::insertBefore 591 | 592 | **Description** 593 | 594 | ```php 595 | public insertBefore (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element) 596 | ``` 597 | 598 | Insert every element in the set of matched elements before the target. 599 | 600 | 601 | 602 | **Parameters** 603 | 604 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element` 605 | 606 | **Return Values** 607 | 608 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 609 | 610 | > A new Crawler object containing all elements inserted before the target elements 611 | 612 | 613 |
614 | 615 | 616 | ### HtmlPageCrawler::isHtmlDocument 617 | 618 | **Description** 619 | 620 | ```php 621 | public isHtmlDocument (void) 622 | ``` 623 | 624 | checks whether the first node contains a complete html document (as opposed to a document fragment) 625 | 626 | 627 | 628 | **Parameters** 629 | 630 | `This function has no parameters.` 631 | 632 | **Return Values** 633 | 634 | `bool` 635 | 636 | 637 | 638 | 639 |
640 | 641 | 642 | ### HtmlPageCrawler::makeClone 643 | 644 | **Description** 645 | 646 | ```php 647 | public makeClone (void) 648 | ``` 649 | 650 | Create a deep copy of the set of matched elements. 651 | 652 | Equivalent to clone() in jQuery (clone is not a valid PHP function name) 653 | 654 | **Parameters** 655 | 656 | `This function has no parameters.` 657 | 658 | **Return Values** 659 | 660 | `\HtmlPageCrawler` 661 | 662 | 663 | 664 | 665 |
666 | 667 | 668 | ### HtmlPageCrawler::makeEmpty 669 | 670 | **Description** 671 | 672 | ```php 673 | public makeEmpty (void) 674 | ``` 675 | 676 | Removes all child nodes and text from all nodes in set 677 | 678 | Equivalent to jQuery's empty() function which is not a valid function name in PHP 679 | 680 | **Parameters** 681 | 682 | `This function has no parameters.` 683 | 684 | **Return Values** 685 | 686 | `\HtmlPageCrawler` 687 | 688 | > $this 689 | 690 | 691 |
692 | 693 | 694 | ### HtmlPageCrawler::prepend 695 | 696 | **Description** 697 | 698 | ```php 699 | public prepend (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 700 | ``` 701 | 702 | Insert content, specified by the parameter, to the beginning of each element in the set of matched elements. 703 | 704 | 705 | 706 | **Parameters** 707 | 708 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 709 | : HTML code fragment 710 | 711 | **Return Values** 712 | 713 | `\HtmlPageCrawler` 714 | 715 | > $this for chaining 716 | 717 | 718 |
719 | 720 | 721 | ### HtmlPageCrawler::prependTo 722 | 723 | **Description** 724 | 725 | ```php 726 | public prependTo (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element) 727 | ``` 728 | 729 | Insert every element in the set of matched elements to the beginning of the target. 730 | 731 | 732 | 733 | **Parameters** 734 | 735 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element` 736 | 737 | **Return Values** 738 | 739 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 740 | 741 | > A new Crawler object containing all elements prepended to the target elements 742 | 743 | 744 |
745 | 746 | 747 | ### HtmlPageCrawler::remove 748 | 749 | **Description** 750 | 751 | ```php 752 | public remove (void) 753 | ``` 754 | 755 | Remove the set of matched elements from the DOM. 756 | 757 | (as opposed to Crawler::clear() which detaches the nodes only from Crawler 758 | but leaves them in the DOM) 759 | 760 | **Parameters** 761 | 762 | `This function has no parameters.` 763 | 764 | **Return Values** 765 | 766 | `void` 767 | 768 | 769 |
770 | 771 | 772 | ### HtmlPageCrawler::removeAttr 773 | 774 | **Description** 775 | 776 | ```php 777 | public removeAttr (string $name) 778 | ``` 779 | 780 | Remove an attribute from each element in the set of matched elements. 781 | 782 | Alias for removeAttribute for compatibility with jQuery 783 | 784 | **Parameters** 785 | 786 | * `(string) $name` 787 | 788 | **Return Values** 789 | 790 | `\HtmlPageCrawler` 791 | 792 | 793 | 794 | 795 |
796 | 797 | 798 | ### HtmlPageCrawler::removeAttribute 799 | 800 | **Description** 801 | 802 | ```php 803 | public removeAttribute (string $name) 804 | ``` 805 | 806 | Remove an attribute from each element in the set of matched elements. 807 | 808 | 809 | 810 | **Parameters** 811 | 812 | * `(string) $name` 813 | 814 | **Return Values** 815 | 816 | `\HtmlPageCrawler` 817 | 818 | 819 | 820 | 821 |
822 | 823 | 824 | ### HtmlPageCrawler::removeClass 825 | 826 | **Description** 827 | 828 | ```php 829 | public removeClass (string $name) 830 | ``` 831 | 832 | Remove a class from each element in the list 833 | 834 | 835 | 836 | **Parameters** 837 | 838 | * `(string) $name` 839 | 840 | **Return Values** 841 | 842 | `\HtmlPageCrawler` 843 | 844 | > $this for chaining 845 | 846 | 847 |
848 | 849 | 850 | ### HtmlPageCrawler::replaceAll 851 | 852 | **Description** 853 | 854 | ```php 855 | public replaceAll (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element) 856 | ``` 857 | 858 | Replace each target element with the set of matched elements. 859 | 860 | 861 | 862 | **Parameters** 863 | 864 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element` 865 | 866 | **Return Values** 867 | 868 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 869 | 870 | > A new Crawler object containing all elements that replaced the target elements 871 | 872 | 873 |
874 | 875 | 876 | ### HtmlPageCrawler::replaceWith 877 | 878 | **Description** 879 | 880 | ```php 881 | public replaceWith (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 882 | ``` 883 | 884 | Replace each element in the set of matched elements with the provided new content and return the set of elements that was removed. 885 | 886 | 887 | 888 | **Parameters** 889 | 890 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 891 | 892 | **Return Values** 893 | 894 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 895 | 896 | > $this for chaining 897 | 898 | 899 |
900 | 901 | 902 | ### HtmlPageCrawler::saveHTML 903 | 904 | **Description** 905 | 906 | ```php 907 | public saveHTML (void) 908 | ``` 909 | 910 | Get the HTML code fragment of all elements and their contents. 911 | 912 | If the first node contains a complete HTML document return only 913 | the full code of this document. 914 | 915 | **Parameters** 916 | 917 | `This function has no parameters.` 918 | 919 | **Return Values** 920 | 921 | `string` 922 | 923 | > HTML code (fragment) 924 | 925 | 926 |
927 | 928 | 929 | ### HtmlPageCrawler::setAttribute 930 | 931 | **Description** 932 | 933 | ```php 934 | public setAttribute (string $name, string $value) 935 | ``` 936 | 937 | Sets an attribute on each element 938 | 939 | 940 | 941 | **Parameters** 942 | 943 | * `(string) $name` 944 | * `(string) $value` 945 | 946 | **Return Values** 947 | 948 | `\HtmlPageCrawler` 949 | 950 | > $this for chaining 951 | 952 | 953 |
954 | 955 | 956 | ### HtmlPageCrawler::setInnerHtml 957 | 958 | **Description** 959 | 960 | ```php 961 | public setInnerHtml (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 962 | ``` 963 | 964 | Set the HTML contents of each element 965 | 966 | 967 | 968 | **Parameters** 969 | 970 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 971 | : HTML code fragment 972 | 973 | **Return Values** 974 | 975 | `\HtmlPageCrawler` 976 | 977 | > $this for chaining 978 | 979 | 980 |
981 | 982 | 983 | ### HtmlPageCrawler::setStyle 984 | 985 | **Description** 986 | 987 | ```php 988 | public setStyle (string $key, string $value) 989 | ``` 990 | 991 | set one CSS style property for all elements in the list 992 | 993 | 994 | 995 | **Parameters** 996 | 997 | * `(string) $key` 998 | : name of the property 999 | * `(string) $value` 1000 | : value of the property 1001 | 1002 | **Return Values** 1003 | 1004 | `\HtmlPageCrawler` 1005 | 1006 | > $this for chaining 1007 | 1008 | 1009 |
1010 | 1011 | 1012 | ### HtmlPageCrawler::setText 1013 | 1014 | **Description** 1015 | 1016 | ```php 1017 | public setText (string $text) 1018 | ``` 1019 | 1020 | Set the text contents of the matched elements. 1021 | 1022 | 1023 | 1024 | **Parameters** 1025 | 1026 | * `(string) $text` 1027 | 1028 | **Return Values** 1029 | 1030 | `\HtmlPageCrawler` 1031 | 1032 | 1033 | 1034 | 1035 |
1036 | 1037 | 1038 | ### HtmlPageCrawler::toggleClass 1039 | 1040 | **Description** 1041 | 1042 | ```php 1043 | public toggleClass (string $classname) 1044 | ``` 1045 | 1046 | Add or remove one or more classes from each element in the set of matched elements, depending on the class’s presence. 1047 | 1048 | 1049 | 1050 | **Parameters** 1051 | 1052 | * `(string) $classname` 1053 | : One or more classnames separated by spaces 1054 | 1055 | **Return Values** 1056 | 1057 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 1058 | 1059 | > $this for chaining 1060 | 1061 | 1062 |
1063 | 1064 | 1065 | ### HtmlPageCrawler::unwrap 1066 | 1067 | **Description** 1068 | 1069 | ```php 1070 | public unwrap (void) 1071 | ``` 1072 | 1073 | Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place. 1074 | 1075 | 1076 | 1077 | **Parameters** 1078 | 1079 | `This function has no parameters.` 1080 | 1081 | **Return Values** 1082 | 1083 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 1084 | 1085 | > $this for chaining 1086 | 1087 | 1088 |
1089 | 1090 | 1091 | ### HtmlPageCrawler::unwrapInner 1092 | 1093 | **Description** 1094 | 1095 | ```php 1096 | public unwrapInner (void) 1097 | ``` 1098 | 1099 | Remove the matched elements, but promote the children to take their place. 1100 | 1101 | 1102 | 1103 | **Parameters** 1104 | 1105 | `This function has no parameters.` 1106 | 1107 | **Return Values** 1108 | 1109 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 1110 | 1111 | > $this for chaining 1112 | 1113 | 1114 |
1115 | 1116 | 1117 | ### HtmlPageCrawler::wrap 1118 | 1119 | **Description** 1120 | 1121 | ```php 1122 | public wrap (string|\HtmlPageCrawler|\DOMNode $wrappingElement) 1123 | ``` 1124 | 1125 | Wrap an HTML structure around each element in the set of matched elements 1126 | 1127 | The HTML structure must contain only one root node, e.g.: 1128 | Works: `<div><div></div></div>`
1129 | Does not work: `<div></div><div></div>`
1130 | 1131 | **Parameters** 1132 | 1133 | * `(string|\HtmlPageCrawler|\DOMNode) $wrappingElement` 1134 | 1135 | **Return Values** 1136 | 1137 | `\HtmlPageCrawler` 1138 | 1139 | > $this for chaining 1140 | 1141 | 1142 |
1143 | 1144 | 1145 | ### HtmlPageCrawler::wrapAll 1146 | 1147 | **Description** 1148 | 1149 | ```php 1150 | public wrapAll (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 1151 | ``` 1152 | 1153 | Wrap an HTML structure around all elements in the set of matched elements. 1154 | 1155 | 1156 | 1157 | **Parameters** 1158 | 1159 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 1160 | 1161 | **Return Values** 1162 | 1163 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 1164 | 1165 | > $this for chaining 1166 | 1167 | 1168 | **Throws Exceptions** 1169 | 1170 | 1171 | `\LogicException` 1172 | 1173 | 1174 |
1175 | 1176 | 1177 | ### HtmlPageCrawler::wrapInner 1178 | 1179 | **Description** 1180 | 1181 | ```php 1182 | public wrapInner (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content) 1183 | ``` 1184 | 1185 | Wrap an HTML structure around the content of each element in the set of matched elements. 1186 | 1187 | 1188 | 1189 | **Parameters** 1190 | 1191 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content` 1192 | 1193 | **Return Values** 1194 | 1195 | `\Wa72\HtmlPageDom\HtmlPageCrawler` 1196 | 1197 | > $this for chaining 1198 | 1199 | 1200 |
1201 | 1202 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Wa72\HtmlPageDom 2 | 3 | * [HtmlPage](HtmlPage.md) 4 | * [HtmlPageCrawler](HtmlPageCrawler.md) 5 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ./src/ 6 | 7 | 8 | 9 | 10 | ./Tests/ 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/Helpers.php: -------------------------------------------------------------------------------- 1 | value pairs of CSS properties 31 | */ 32 | public static function cssStringToArray($css) 33 | { 34 | $statements = explode(';', preg_replace('/\s+/s', ' ', $css)); 35 | $styles = array(); 36 | foreach ($statements as $statement) { 37 | $statement = trim($statement); 38 | if ('' === $statement) { 39 | continue; 40 | } 41 | $p = strpos($statement, ':'); 42 | if ($p <= 0) { 43 | continue; 44 | } // invalid statement, just ignore it 45 | $key = trim(substr($statement, 0, $p)); 46 | $value = trim(substr($statement, $p + 1)); 47 | $styles[$key] = $value; 48 | } 49 | return $styles; 50 | } 51 | 52 | /** 53 | * Convert CSS name->value array to string 54 | * 55 | * @param array $array name=>value pairs of CSS properties 56 | * @return string list of CSS properties separated by ; 57 | */ 58 | public static function cssArrayToString($array) 59 | { 60 | $styles = ''; 61 | foreach ($array as $key => $value) { 62 | $styles .= $key . ': ' . $value . 
';'; 63 | } 64 | return $styles; 65 | } 66 | 67 | /** 68 | * Helper function for getting a body element 69 | * from an HTML fragment 70 | * 71 | * @param string $html A fragment of HTML code 72 | * @param string $charset 73 | * @return \DOMNode The body node containing child nodes created from the HTML fragment 74 | */ 75 | public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8') 76 | { 77 | 78 | $html = '' . $html . ''; 79 | $d = self::loadHtml($html, $charset); 80 | return $d->getElementsByTagName('body')->item(0); 81 | } 82 | 83 | public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument 84 | { 85 | return self::parseXhtml($html, $charset); 86 | } 87 | /** 88 | * Function originally taken from Symfony\Component\DomCrawler\Crawler 89 | * (c) Fabien Potencier 90 | * License: MIT 91 | */ 92 | private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument 93 | { 94 | $htmlContent = self::convertToHtmlEntities($htmlContent, $charset); 95 | 96 | $internalErrors = libxml_use_internal_errors(true); 97 | 98 | $dom = new \DOMDocument('1.0', $charset); 99 | $dom->validateOnParse = true; 100 | 101 | if ('' !== trim($htmlContent)) { 102 | // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements 103 | // Option LIBXML_SCHEMA_CREATE seems to prevent this 104 | // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string 105 | @$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE); 106 | } 107 | 108 | libxml_use_internal_errors($internalErrors); 109 | 110 | return $dom; 111 | } 112 | 113 | /** 114 | * Converts charset to HTML-entities to ensure valid parsing. 
115 | * Function taken from Symfony\Component\DomCrawler\Crawler 116 | * (c) Fabien Potencier 117 | * License: MIT 118 | */ 119 | private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string 120 | { 121 | set_error_handler(function () { throw new \Exception(); }); 122 | 123 | try { 124 | return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset); 125 | } catch (\Exception|\ValueError) { 126 | try { 127 | $htmlContent = iconv($charset, 'UTF-8', $htmlContent); 128 | $htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'); 129 | } catch (\Exception|\ValueError) { 130 | } 131 | return $htmlContent; 132 | } finally { 133 | restore_error_handler(); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/HtmlPage.php: -------------------------------------------------------------------------------- 1 | charset = $charset; 45 | $this->url = $url; 46 | if ($content == '') { 47 | $content = ''; 48 | } 49 | $this->dom = Helpers::loadHtml($content, $charset); 50 | $this->crawler = new HtmlPageCrawler($this->dom); 51 | } 52 | 53 | /** 54 | * Get a HtmlPageCrawler object containing the root node of the HTML document 55 | * 56 | * @return HtmlPageCrawler 57 | */ 58 | public function getCrawler() 59 | { 60 | return $this->crawler; 61 | } 62 | 63 | /** 64 | * Get a DOMDocument object for the HTML document 65 | * 66 | * @return \DOMDocument 67 | */ 68 | public function getDOMDocument() 69 | { 70 | return $this->dom; 71 | } 72 | 73 | /** 74 | * Sets the page title of the HTML document 75 | * 76 | * @param string $title 77 | */ 78 | public function setTitle($title) 79 | { 80 | $t = $this->dom->getElementsByTagName('title')->item(0); 81 | if ($t == null) { 82 | $t = $this->dom->createElement('title'); 83 | $this->getHeadNode()->appendChild($t); 84 | } 85 | $t->nodeValue = htmlspecialchars($title); 86 | } 87 | 88 | /** 89 | * 
Get the page title of the HTML document
     *
     * @return null|string nodeValue of the first title element, or null when the document has none
     */
    public function getTitle()
    {
        $t = $this->dom->getElementsByTagName('title')->item(0);
        if ($t === null) {
            return null;
        }

        return $t->nodeValue;
    }

    /**
     * Set a META tag with specified 'name' and 'content' attributes
     *
     * If no meta element with this name exists, one is created and appended to the head node.
     *
     * @TODO: add support for multiple meta tags with the same name but different languages
     *
     * @param string $name
     * @param string $content
     */
    public function setMeta($name, $content)
    {
        $c = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']');
        if (count($c) == 0) {
            $node = $this->dom->createElement('meta');
            $node->setAttribute('name', $name);
            $this->getHeadNode()->appendChild($node);
            $c->addNode($node);
        }
        $c->setAttribute('content', $content);
    }

    /**
     * Remove all meta tags with the specified name attribute
     *
     * @param string $name
     */
    public function removeMeta($name)
    {
        $meta = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']');
        $meta->remove();
    }

    /**
     * Get the content attribute of a meta tag with the specified name attribute
     *
     * @param string $name
     * @return null|string null when no matching meta element is found
     */
    public function getMeta($name)
    {
        $node = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']')->getNode(0);
        if ($node instanceof \DOMElement) {
            return $node->getAttribute('content');
        }

        return null;
    }

    /**
     * Set the base tag with href attribute set to parameter $url
     *
     * A base element is created inside the head node when the document has none.
     *
     * @param string $url
     */
    public function setBaseHref($url)
    {
        $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
        if ($node == null) {
            $node = $this->dom->createElement('base');
            $this->getHeadNode()->appendChild($node);
        }
        $node->setAttribute('href', $url);
    }

    /**
     * Get the href attribute from the base tag, null if not present in document
     *
     * @return null|string
     */
    public function getBaseHref()
    {
        $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
        if ($node instanceof \DOMElement) {
            return $node->getAttribute('href');
        }

        return null;
    }

    /**
     * Sets innerHTML content of an element specified by elementId
     *
     * @param string $elementId
     * @param string $html
     */
    public function setHtmlById($elementId, $html)
    {
        $this->getElementById($elementId)->setInnerHtml($html);
    }

    /**
     * Get the document's HEAD section as DOMElement
     *
     * A missing head element is created and inserted before the body node.
     *
     * @return \DOMElement
     */
    public function getHeadNode()
    {
        $head = $this->dom->getElementsByTagName('head')->item(0);
        if ($head == null) {
            $head = $this->dom->createElement('head');
            $head = $this->dom->documentElement->insertBefore($head, $this->getBodyNode());
        }
        return $head;
    }

    /**
     * Get the document's body as DOMElement
     *
     * A missing body element is created and appended to the document element.
     *
     * @return \DOMElement
     */
    public function getBodyNode()
    {
        $body = $this->dom->getElementsByTagName('body')->item(0);
        if ($body == null) {
            $body = $this->dom->createElement('body');
            $body = $this->dom->documentElement->appendChild($body);
        }
        return $body;
    }

    /**
     * Get the document's HEAD section wrapped in a HtmlPageCrawler instance
     *
     * @return HtmlPageCrawler
     */
    public function getHead()
    {
        return new HtmlPageCrawler($this->getHeadNode());
    }

    /**
     * Get the document's body wrapped in a HtmlPageCrawler instance
     *
     * @return HtmlPageCrawler
     */
    public function getBody()
    {
        return new HtmlPageCrawler($this->getBodyNode());
    }

    /**
     * Serialize the whole document to HTML code
     *
     * @return string
     */
    public function __toString()
    {
        return $this->dom->saveHTML();
    }

    /**
     * Save this document to a HTML file or return HTML code as string
     *
     * @param string $filename If provided, output will be saved to this file, otherwise returned
     * @return string|void
     */
    public function save($filename = '')
    {
        if ($filename != '') {
            file_put_contents($filename, (string) $this);
            return;
        }

        return (string) $this;
    }

    /**
     * Get an element in the document by it's id attribute
     *
     * @param string $id
     * @return HtmlPageCrawler
     */
    public function getElementById($id)
    {
        return $this->filterXPath('descendant-or-self::*[@id = ' . self::xpathLiteral($id) . ']');
    }

    /**
     * Filter nodes by using a CSS selector
     *
     * @param string $selector CSS selector
     * @return HtmlPageCrawler
     */
    public function filter($selector)
    {
        return $this->crawler->filter($selector);
    }

    /**
     * Filter nodes by XPath expression
     *
     * @param string $xpath XPath expression
     * @return HtmlPageCrawler
     */
    public function filterXPath($xpath)
    {
        return $this->crawler->filterXPath($xpath);
    }

    /**
     * remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space)
     *
     * useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode)
     *
     * @param string $string
     * @return string
     */
    public static function trimNewlines($string)
    {
        return Helpers::trimNewlines($string);
    }

    /**
     * Give the cloned page its own deep copy of the DOM and a crawler bound to that copy
     */
    public function __clone()
    {
        $this->dom = $this->dom->cloneNode(true);
        $this->crawler = new HtmlPageCrawler($this->dom);
    }

    /**
     * minify the HTML document
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @return HtmlPage
     * @throws \Exception when the PrettyMin class is not available
     */
    public function minify(array $options = array())
    {
        if (!class_exists('Wa72\\HtmlPrettymin\\PrettyMin')) {
            throw new \Exception('Function minify needs composer package wa72/html-pretty-min');
        }
        $pm = new PrettyMin($options);
        $pm->load($this->dom)->minify();
        return $this;
    }

    /**
     * indent the HTML document
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @return HtmlPage
     * @throws \Exception when the PrettyMin class is not available
     */
    public function indent(array $options = array())
    {
        if (!class_exists('Wa72\\HtmlPrettymin\\PrettyMin')) {
            throw new \Exception('Function indent needs composer package wa72/html-pretty-min');
        }
        $pm = new PrettyMin($options);
        $pm->load($this->dom)->indent();
        return $this;
    }

    /**
     * Quote an arbitrary string as an XPath 1.0 string literal
     *
     * XPath 1.0 has no escape character inside literals, so a value holding
     * both quote kinds is assembled with concat(). Values without a single
     * quote yield exactly the single-quoted literal that was used before.
     *
     * @param string $value
     * @return string XPath expression evaluating to $value
     */
    private static function xpathLiteral($value): string
    {
        $value = (string) $value;
        if (false === strpos($value, "'")) {
            return "'" . $value . "'";
        }
        if (false === strpos($value, '"')) {
            return '"' . $value . '"';
        }

        // e.g. a'b"c becomes concat('a', "'", 'b"c')
        return "concat('" . str_replace("'", "', \"'\", '", $value) . "')";
    }
}
-------------------------------------------------------------------------------- /src/HtmlPageCrawler.php: -------------------------------------------------------------------------------- 1 | getAttribute('class')); 53 | $found = false; 54 | $count = count($classes); 55 | for ($i = 0; $i < $count; $i++) { 56 | if ($classes[$i] == $name) { 57 | $found = true; 58 | } 59 | } 60 | if (!$found) { 61 | $classes[] = $name; 62 | $node->setAttribute('class', trim(join(' ', $classes))); 63 | } 64 | } 65 | } 66 | return $this; 67 | } 68 | 69 | /** 70 | * Insert content, specified by the parameter, after each element in the set of matched elements. 71 | * 72 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content 73 | * @return HtmlPageCrawler $this for chaining 74 | * @api 75 | */ 76 | public function after($content) 77 | { 78 | $content = self::create($content); 79 | $newnodes = array(); 80 | foreach ($this as $i => $node) { 81 | /** @var \DOMNode $node */ 82 | $refnode = $node->nextSibling; 83 | foreach ($content as $newnode) { 84 | /** @var \DOMNode $newnode */ 85 | $newnode = static::importNewnode($newnode, $node, $i); 86 | if ($refnode === null) { 87 | $node->parentNode->appendChild($newnode); 88 | } else { 89 | $node->parentNode->insertBefore($newnode, $refnode); 90 | } 91 | $newnodes[] = $newnode; 92 | } 93 | } 94 | $content->clear(); 95 | $content->add($newnodes); 96 | return $this; 97 | } 98 | 99 | /** 100 | * Insert HTML content as child nodes of each element after existing children 101 | * 102 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment or DOMNode to append 103 | * @return HtmlPageCrawler $this for chaining 104 | * @api 105 | */ 106 | public function append($content) 107 | { 108 | $content = self::create($content); 109 | $newnodes = array(); 110 | foreach ($this as $i => $node) { 111 | /** @var \DOMNode $node */ 112 | foreach ($content as $newnode) { 113 | /** @var \DOMNode $newnode */ 114 | $newnode = 
static::importNewnode($newnode, $node, $i); 115 | $node->appendChild($newnode); 116 | $newnodes[] = $newnode; 117 | } 118 | } 119 | $content->clear(); 120 | $content->add($newnodes); 121 | return $this; 122 | } 123 | 124 | /** 125 | * Insert every element in the set of matched elements to the end of the target. 126 | * 127 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element 128 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements appended to the target elements 129 | * @api 130 | */ 131 | public function appendTo($element) 132 | { 133 | $e = self::create($element); 134 | $newnodes = array(); 135 | foreach ($e as $i => $node) { 136 | /** @var \DOMNode $node */ 137 | foreach ($this as $newnode) { 138 | /** @var \DOMNode $newnode */ 139 | if ($node !== $newnode) { 140 | $newnode = static::importNewnode($newnode, $node, $i); 141 | $node->appendChild($newnode); 142 | } 143 | $newnodes[] = $newnode; 144 | } 145 | } 146 | return self::create($newnodes); 147 | } 148 | 149 | /** 150 | * Sets an attribute on each element 151 | * 152 | * @param string $name 153 | * @param string $value 154 | * @return HtmlPageCrawler $this for chaining 155 | * @api 156 | */ 157 | public function setAttribute($name, $value) 158 | { 159 | foreach ($this as $node) { 160 | if ($node instanceof \DOMElement) { 161 | /** @var \DOMElement $node */ 162 | $node->setAttribute($name, $value); 163 | } 164 | } 165 | return $this; 166 | } 167 | 168 | /** 169 | * Returns the attribute value of the first node of the list. 
* This is just an alias for attr() for naming consistency with setAttribute()
     *
     * @param string $name The attribute name
     * @return string|null The attribute value or null if the attribute does not exist
     * @throws \InvalidArgumentException When current node is empty
     */
    public function getAttribute($name)
    {
        $value = parent::attr($name);

        return $value;
    }

    /**
     * Place the given content in front of every matched element, as a preceding sibling.
     *
     * After the call, the crawler created from $content holds the nodes that were inserted.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
     * @return HtmlPageCrawler $this for chaining
     * @api
     */
    public function before($content)
    {
        $insertion = self::create($content);
        $inserted = [];
        foreach ($this as $index => $target) {
            /** @var \DOMNode $target */
            foreach ($insertion as $candidate) {
                /** @var \DOMNode $candidate */
                if ($candidate === $target) {
                    // a node is never inserted before itself
                    continue;
                }
                $candidate = static::importNewnode($candidate, $target, $index);
                $target->parentNode->insertBefore($candidate, $target);
                $inserted[] = $candidate;
            }
        }
        $insertion->clear();
        $insertion->add($inserted);

        return $this;
    }

    /**
     * Create a deep copy of the set of matched elements.
*
     * Equivalent to clone() in jQuery (clone is not a valid PHP function name)
     *
     * @return HtmlPageCrawler
     * @api
     */
    public function makeClone()
    {
        return clone $this;
    }

    /**
     * Replace every node held by the cloned crawler with a deep copy of itself
     */
    public function __clone()
    {
        $newnodes = array();
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $newnodes[] = $node->cloneNode(true);
        }
        $this->clear();
        $this->add($newnodes);
    }

    /**
     * Get one CSS style property of the first element or set it for all elements in the list
     *
     * Function is here for compatibility with jQuery; it is the same as getStyle() and setStyle()
     *
     * @see HtmlPageCrawler::getStyle()
     * @see HtmlPageCrawler::setStyle()
     *
     * @param string $key The name of the style property
     * @param null|string $value The CSS value to set, or NULL to get the current value
     * @return HtmlPageCrawler|string|null With a value: $this for chaining. Without: the property
     *                                     value of the first element, or null if it is not set
     * @api
     */
    public function css($key, $value = null)
    {
        if (null === $value) {
            return $this->getStyle($key);
        }

        return $this->setStyle($key, $value);
    }

    /**
     * get one CSS style property of the first element
     *
     * @param string $key name of the property
     * @return string|null value of the property, null if the element does not define it
     */
    public function getStyle($key)
    {
        // getAttribute() yields null when there is no style attribute; the CSS
        // parser runs string functions on its argument, so hand it '' instead.
        $styles = Helpers::cssStringToArray($this->getAttribute('style') ?? '');

        return $styles[$key] ?? null;
    }

    /**
     * set one CSS style property for all elements in the list
     *
     * A value that loosely equals '' removes the property instead of setting it.
     *
     * @param string $key name of the property
     * @param string $value value of the property
     * @return HtmlPageCrawler $this for chaining
     */
    public function setStyle($key, $value)
    {
        foreach ($this as $node) {
            if ($node instanceof \DOMElement) {
                /** @var \DOMElement $node */
                $styles = Helpers::cssStringToArray($node->getAttribute('style'));
                if ($value != '') {
                    $styles[$key] = $value;
                } elseif (isset($styles[$key])) {
                    unset($styles[$key]);
                }
                $node->setAttribute('style', Helpers::cssArrayToString($styles));
            }
        }
        return $this;
    }

    /**
     * Removes all child nodes and text from all nodes in set
     *
     * Equivalent to jQuery's empty() function which is not a valid function name in PHP
     * @return HtmlPageCrawler $this
     * @api
     */
    public function makeEmpty()
    {
        foreach ($this as $node) {
            $node->nodeValue = '';
        }
        return $this;
    }

    /**
     * Determine whether any of the matched elements are assigned the given class.
*
     * @param string $name
     * @return bool
     * @api
     */
    public function hasClass($name)
    {
        $name = (string) $name;
        foreach ($this as $node) {
            if (!$node instanceof \DOMElement) {
                continue;
            }
            $class = $node->getAttribute('class');
            // Compare against '' rather than testing truthiness: class="0" is a valid class list
            if ($class === '') {
                continue;
            }
            // Strict comparison: loosely compared numeric strings such as "0" and "00" are equal
            if (in_array($name, preg_split('/\s+/s', $class), true)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Set the HTML contents of each element
     *
     * Every matched node is emptied first, then receives the nodes created from $content.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment
     * @return HtmlPageCrawler $this for chaining
     * @api
     */
    public function setInnerHtml($content)
    {
        $content = self::create($content);
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $node->nodeValue = '';
            foreach ($content as $newnode) {
                /** @var \DOMNode $newnode */
                $newnode = static::importNewnode($newnode, $node);
                $node->appendChild($newnode);
            }
        }
        return $this;
    }

    /**
     * Alias for Crawler::html() for naming consistency with setInnerHtml()
     *
     * @return string
     * @api
     */
    public function getInnerHtml()
    {
        return parent::html();
    }

    /**
     * Insert every element in the set of matched elements after the target.
*
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element Target(s) after which the matched elements are placed
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted after the target elements
     * @api
     */
    public function insertAfter($element)
    {
        $targets = self::create($element);
        $inserted = [];
        foreach ($targets as $index => $target) {
            /** @var \DOMNode $target */
            // Captured once per target so that several inserted nodes keep their order
            $following = $target->nextSibling;
            foreach ($this as $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $target, $index);
                // A null reference node makes insertBefore() append at the end of the parent
                $target->parentNode->insertBefore($candidate, $following);
                $inserted[] = $candidate;
            }
        }

        return self::create($inserted);
    }

    /**
     * Place every matched element in front of each target, as a preceding sibling.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element Target(s) before which the matched elements are placed
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted before the target elements
     * @api
     */
    public function insertBefore($element)
    {
        $targets = self::create($element);
        $inserted = [];
        foreach ($targets as $index => $target) {
            /** @var \DOMNode $target */
            foreach ($this as $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $target, $index);
                $isTargetItself = ($candidate === $target);
                if (!$isTargetItself) {
                    $target->parentNode->insertBefore($candidate, $target);
                }
                $inserted[] = $candidate;
            }
        }

        return self::create($inserted);
    }

    /**
     * Insert content, specified by the parameter, to the beginning of each element in the set of matched elements.
*
     * After the call, the crawler created from $content holds the nodes that were processed.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment
     * @return HtmlPageCrawler $this for chaining
     * @api
     */
    public function prepend($content)
    {
        $insertion = self::create($content);
        $processed = [];
        foreach ($this as $index => $container) {
            /** @var \DOMNode $container */
            $first = $container->firstChild;
            foreach ($insertion as $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $container, $index);
                // With no first child the null reference makes insertBefore() append;
                // a node that already is the first child stays where it is.
                if ($first !== $candidate) {
                    $container->insertBefore($candidate, $first);
                }
                $processed[] = $candidate;
            }
        }
        $insertion->clear();
        $insertion->add($processed);

        return $this;
    }

    /**
     * Place every matched element at the beginning of each target element.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements prepended to the target elements
     * @api
     */
    public function prependTo($element)
    {
        $targets = self::create($element);
        $processed = [];
        foreach ($targets as $index => $container) {
            /** @var \DOMNode $container */
            $first = $container->firstChild;
            foreach ($this as $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $container, $index);
                if ($candidate !== $container) {
                    // A null reference node makes insertBefore() append to the container
                    $container->insertBefore($candidate, $first);
                }
                $processed[] = $candidate;
            }
        }

        return self::create($processed);
    }

    /**
     * Remove the set of matched elements from the DOM.
*
     * (as opposed to Crawler::clear() which detaches the nodes only from Crawler
     * but leaves them in the DOM)
     *
     * Nodes whose parent is not a DOMElement are left in the DOM; the crawler
     * itself is emptied in every case.
     *
     * @api
     */
    public function remove()
    {
        foreach ($this as $node) {
            /**
             * @var \DOMNode $node
             */
            if ($node->parentNode instanceof \DOMElement) {
                $node->parentNode->removeChild($node);
            }
        }
        $this->clear();
    }

    /**
     * Remove an attribute from each element in the set of matched elements.
     *
     * Alias for removeAttribute for compatibility with jQuery
     *
     * @param string $name
     * @return HtmlPageCrawler
     * @api
     */
    public function removeAttr($name)
    {
        return $this->removeAttribute($name);
    }

    /**
     * Remove an attribute from each element in the set of matched elements.
     *
     * @param string $name
     * @return HtmlPageCrawler
     */
    public function removeAttribute($name)
    {
        foreach ($this as $node) {
            if ($node instanceof \DOMElement) {
                /** @var \DOMElement $node */
                if ($node->hasAttribute($name)) {
                    $node->removeAttribute($name);
                }
            }
        }
        return $this;
    }

    /**
     * Remove a class from each element in the list
     *
     * Class names are matched as exact strings. The class attribute is always
     * rewritten, so it may end up empty.
     *
     * @param string $name
     * @return HtmlPageCrawler $this for chaining
     * @api
     */
    public function removeClass($name)
    {
        foreach ($this as $node) {
            if ($node instanceof \DOMElement) {
                /** @var \DOMElement $node */
                $classes = preg_split('/\s+/s', $node->getAttribute('class'));
                // array_diff() compares string representations, so numeric-looking
                // names such as "0" and "00" are no longer treated as equal
                $remaining = array_diff($classes, array((string) $name));
                $node->setAttribute('class', trim(join(' ', $remaining)));
            }
        }
        return $this;
    }

    /**
     * Replace each target element with the set of matched elements.
*
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element Target(s) to be replaced
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements that took the place of the targets
     * @api
     */
    public function replaceAll($element)
    {
        $targets = self::create($element);
        $placed = [];
        foreach ($targets as $index => $target) {
            /** @var \DOMNode $target */
            $container = $target->parentNode;
            $following = $target->nextSibling;
            foreach ($this as $position => $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $target, $index);
                if ($position != 0) {
                    // further elements line up behind the one that replaced the target
                    $container->insertBefore($candidate, $following);
                } else {
                    $container->replaceChild($candidate, $target);
                }
                $placed[] = $candidate;
            }
        }

        return self::create($placed);
    }

    /**
     * Replace each element in the set of matched elements with the provided new content.
     *
     * After the call, the crawler created from $content holds the nodes that were inserted.
     *
     * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
     * @api
     */
    public function replaceWith($content)
    {
        $replacement = self::create($content);
        $placed = [];
        foreach ($this as $index => $target) {
            /** @var \DOMNode $target */
            $container = $target->parentNode;
            $following = $target->nextSibling;
            foreach ($replacement as $position => $candidate) {
                /** @var \DOMNode $candidate */
                $candidate = static::importNewnode($candidate, $target, $index);
                if ($position != 0) {
                    $container->insertBefore($candidate, $following);
                } else {
                    $container->replaceChild($candidate, $target);
                }
                $placed[] = $candidate;
            }
        }
        $replacement->clear();
        $replacement->add($placed);

        return $this;
    }

    /**
     * Get the combined text contents of each element in the set of matched elements, including their descendants.
* This is what the jQuery text() function does, contrary to the Crawler::text() method that returns only
     * the text of the first node.
     *
     * @return string Concatenated nodeValue of all nodes in the set
     * @api
     */
    public function getCombinedText()
    {
        $parts = [];
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $parts[] = $node->nodeValue;
        }

        return implode('', $parts);
    }

    /**
     * Replace the content of every matched element with the given text.
     *
     * The text is passed through htmlspecialchars() before it is assigned as nodeValue.
     *
     * @param string $text
     * @return HtmlPageCrawler
     * @api
     */
    public function setText($text)
    {
        $escaped = htmlspecialchars($text);
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $node->nodeValue = $escaped;
        }

        return $this;
    }

    /**
     * For each matched element, remove every given class it has and add every given class it lacks.
     *
     * @param string $classname One or more classnames separated by spaces
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
     * @api
     */
    public function toggleClass($classname)
    {
        $names = explode(' ', $classname);
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $single = self::create($node);
            foreach ($names as $name) {
                if (!$single->hasClass($name)) {
                    $single->addClass($name);
                } else {
                    $single->removeClass($name);
                }
            }
        }

        return $this;
    }

    /**
     * Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place.
*
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
     * @throws \InvalidArgumentException when a parent has no parent DOMElement of its own
     * @api
     */
    public function unwrap()
    {
        $parents = array();
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            $parents[] = $node->parentNode;
        }

        self::create($parents)->unwrapInner();
        return $this;
    }

    /**
     * Remove the matched elements, but promote the children to take their place.
     *
     * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
     * @throws \InvalidArgumentException when a matched node has no parent DOMElement
     * @api
     */
    public function unwrapInner()
    {
        foreach ($this as $node) {
            /** @var \DOMNode $node */
            if (!$node->parentNode instanceof \DOMElement) {
                throw new \InvalidArgumentException('DOMElement does not have a parent DOMElement node.');
            }

            // Copy the live child list first: moving nodes while iterating it would skip children
            /** @var \DOMNode[] $children */
            $children = iterator_to_array($node->childNodes);
            foreach ($children as $child) {
                $node->parentNode->insertBefore($child, $node);
            }

            $node->parentNode->removeChild($node);
        }

        // Previously nothing was returned although chaining is documented
        return $this;
    }


    /**
     * Wrap an HTML structure around each element in the set of matched elements
     *
     * The HTML structure must contain only one root node, e.g.:
     * Works: <div><div></div></div>
715 | * Does not work: <div></div><div></div>
716 | * 717 | * @param string|HtmlPageCrawler|\DOMNode $wrappingElement 718 | * @return HtmlPageCrawler $this for chaining 719 | * @api 720 | */ 721 | public function wrap($wrappingElement) 722 | { 723 | $content = self::create($wrappingElement); 724 | $newnodes = array(); 725 | foreach ($this as $i => $node) { 726 | /** @var \DOMNode $node */ 727 | $newnode = $content->getNode(0); 728 | /** @var \DOMNode $newnode */ 729 | // $newnode = static::importNewnode($newnode, $node, $i); 730 | if ($newnode->ownerDocument !== $node->ownerDocument) { 731 | $newnode = $node->ownerDocument->importNode($newnode, true); 732 | } else { 733 | if ($i > 0) { 734 | $newnode = $newnode->cloneNode(true); 735 | } 736 | } 737 | $oldnode = $node->parentNode->replaceChild($newnode, $node); 738 | while ($newnode->hasChildNodes()) { 739 | $elementFound = false; 740 | foreach ($newnode->childNodes as $child) { 741 | if ($child instanceof \DOMElement) { 742 | $newnode = $child; 743 | $elementFound = true; 744 | break; 745 | } 746 | } 747 | if (!$elementFound) { 748 | break; 749 | } 750 | } 751 | $newnode->appendChild($oldnode); 752 | $newnodes[] = $newnode; 753 | } 754 | $content->clear(); 755 | $content->add($newnodes); 756 | return $this; 757 | } 758 | 759 | /** 760 | * Wrap an HTML structure around all elements in the set of matched elements. 
761 | * 762 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content 763 | * @throws \LogicException 764 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining 765 | * @api 766 | */ 767 | public function wrapAll($content) 768 | { 769 | $content = self::create($content); 770 | $parent = $this->getNode(0)->parentNode; 771 | foreach ($this as $i => $node) { 772 | /** @var \DOMNode $node */ 773 | if ($node->parentNode !== $parent) { 774 | throw new \LogicException('Nodes to be wrapped with wrapAll() must all have the same parent'); 775 | } 776 | } 777 | 778 | $newnode = $content->getNode(0); 779 | /** @var \DOMNode $newnode */ 780 | $newnode = static::importNewnode($newnode, $parent); 781 | 782 | $newnode = $parent->insertBefore($newnode,$this->getNode(0)); 783 | $content->clear(); 784 | $content->add($newnode); 785 | 786 | while ($newnode->hasChildNodes()) { 787 | $elementFound = false; 788 | foreach ($newnode->childNodes as $child) { 789 | if ($child instanceof \DOMElement) { 790 | $newnode = $child; 791 | $elementFound = true; 792 | break; 793 | } 794 | } 795 | if (!$elementFound) { 796 | break; 797 | } 798 | } 799 | foreach ($this as $i => $node) { 800 | /** @var \DOMNode $node */ 801 | $newnode->appendChild($node); 802 | } 803 | return $this; 804 | } 805 | 806 | /** 807 | * Wrap an HTML structure around the content of each element in the set of matched elements. 808 | * 809 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content 810 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining 811 | * @api 812 | */ 813 | public function wrapInner($content) 814 | { 815 | foreach ($this as $i => $node) { 816 | /** @var \DOMNode $node */ 817 | self::create($node->childNodes)->wrapAll($content); 818 | } 819 | return $this; 820 | } 821 | 822 | /** 823 | * Get the HTML code fragment of all elements and their contents. 824 | * 825 | * If the first node contains a complete HTML document return only 826 | * the full code of this document. 
827 | * 828 | * @return string HTML code (fragment) 829 | * @api 830 | */ 831 | public function saveHTML() 832 | { 833 | if ($this->isHtmlDocument()) { 834 | return $this->getDOMDocument()->saveHTML(); 835 | } else { 836 | $doc = new \DOMDocument('1.0', 'UTF-8'); 837 | $root = $doc->appendChild($doc->createElement('_root')); 838 | foreach ($this as $node) { 839 | $root->appendChild($doc->importNode($node, true)); 840 | } 841 | $html = trim($doc->saveHTML()); 842 | return preg_replace('@^<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|$@', '', $html); 843 | } 844 | } 845 | 846 | public function __toString() 847 | { 848 | return $this->saveHTML(); 849 | } 850 | 851 | /** 852 | * checks whether the first node contains a complete html document 853 | * (as opposed to a document fragment) 854 | * 855 | * @return boolean 856 | */ 857 | public function isHtmlDocument() 858 | { 859 | $node = $this->getNode(0); 860 | if ($node instanceof \DOMElement 861 | && $node->ownerDocument instanceof \DOMDocument 862 | && $node->ownerDocument->documentElement === $node 863 | && $node->nodeName == 'html' 864 | ) { 865 | return true; 866 | } else { 867 | return false; 868 | } 869 | } 870 | 871 | /** 872 | * get ownerDocument of the first element 873 | * 874 | * @return \DOMDocument|null 875 | */ 876 | public function getDOMDocument() 877 | { 878 | $node = $this->getNode(0); 879 | $r = null; 880 | if ($node instanceof \DOMElement 881 | && $node->ownerDocument instanceof \DOMDocument 882 | ) { 883 | $r = $node->ownerDocument; 884 | } 885 | return $r; 886 | } 887 | 888 | /** 889 | * Filters the list of nodes with a CSS selector. 890 | * 891 | * @param string $selector 892 | * @return HtmlPageCrawler 893 | */ 894 | public function filter(string $selector): static 895 | { 896 | return parent::filter($selector); 897 | } 898 | 899 | /** 900 | * Filters the list of nodes with an XPath expression. 
901 | * 902 | * @param string $xpath An XPath expression 903 | * 904 | * @return HtmlPageCrawler A new instance of Crawler with the filtered list of nodes 905 | * 906 | * @api 907 | */ 908 | public function filterXPath($xpath): static 909 | { 910 | return parent::filterXPath($xpath); 911 | } 912 | 913 | /** 914 | * Adds HTML/XML content to the HtmlPageCrawler object (but not to the DOM of an already attached node). 915 | * 916 | * Function overriden from Crawler because HTML fragments are always added as complete documents there 917 | * 918 | * 919 | * @param string $content A string to parse as HTML/XML 920 | * @param null|string $type The content type of the string 921 | * 922 | * @return null|void 923 | */ 924 | public function addContent($content, $type = null): void 925 | { 926 | if (empty($type)) { 927 | $type = 'text/html;charset=UTF-8'; 928 | } 929 | if (substr($type, 0, 9) == 'text/html' && !preg_match('/]*>/i', $content)) { 930 | // string contains no Tag => no complete document but an HTML fragment! 931 | $this->addHtmlFragment($content); 932 | } else { 933 | parent::addContent($content, $type); 934 | } 935 | } 936 | 937 | public function addHtmlFragment($content, $charset = 'UTF-8') 938 | { 939 | $d = new \DOMDocument('1.0', $charset); 940 | $d->preserveWhiteSpace = false; 941 | $root = $d->appendChild($d->createElement(self::FRAGMENT_ROOT_TAGNAME)); 942 | $bodynode = Helpers::getBodyNodeFromHtmlFragment($content, $charset); 943 | foreach ($bodynode->childNodes as $child) { 944 | $inode = $root->appendChild($d->importNode($child, true)); 945 | if ($inode) { 946 | $this->addNode($inode); 947 | } 948 | } 949 | } 950 | 951 | /** 952 | * Adds a node to the current list of nodes. 953 | * 954 | * This method uses the appropriate specialized add*() method based 955 | * on the type of the argument. 
956 | * 957 | * Overwritten from parent to allow Crawler to be added 958 | * 959 | * @param \DOMNodeList|\DOMNode|array|string|Crawler|null $node A node 960 | * 961 | * @api 962 | */ 963 | public function add(\DOMNodeList|\DOMNode|array|string|Crawler|null $node): void 964 | { 965 | if ($node instanceof Crawler) { 966 | foreach ($node as $childnode) { 967 | $this->addNode($childnode); 968 | } 969 | } else { 970 | parent::add($node); 971 | } 972 | } 973 | 974 | /** 975 | * @param \DOMNode $newnode 976 | * @param \DOMNode $referencenode 977 | * @param int $clone 978 | * @return \DOMNode 979 | */ 980 | protected static function importNewnode(\DOMNode $newnode, \DOMNode $referencenode, $clone = 0) { 981 | if ($newnode->ownerDocument !== $referencenode->ownerDocument) { 982 | $referencenode->ownerDocument->preserveWhiteSpace = false; 983 | $newnode = $referencenode->ownerDocument->importNode($newnode, true); 984 | } else { 985 | if ($clone > 0) { 986 | $newnode = $newnode->cloneNode(true); 987 | } 988 | } 989 | return $newnode; 990 | } 991 | 992 | // /** 993 | // * Checks whether the first node in the set is disconnected (has no parent node) 994 | // * 995 | // * @return bool 996 | // */ 997 | // public function isDisconnected() 998 | // { 999 | // $parent = $this->getNode(0)->parentNode; 1000 | // return ($parent == null || $parent->tagName == self::FRAGMENT_ROOT_TAGNAME); 1001 | // } 1002 | 1003 | public function __get($name) 1004 | { 1005 | switch ($name) { 1006 | case 'count': 1007 | case 'length': 1008 | return count($this); 1009 | } 1010 | throw new \Exception('No such property ' . $name); 1011 | } 1012 | } 1013 | --------------------------------------------------------------------------------