├── .github
    └── workflows
    │   └── tests.yml
├── .gitignore
├── .phpdoc-md
├── .scrutinizer.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── Resources
    └── jquerytest.html
├── Tests
    ├── HelpersTest.php
    ├── HtmlPageCrawlerTest.php
    ├── HtmlPageTest.php
    ├── phpunit_bootstrap.php
    └── utf8.html
├── UPGRADE.md
├── composer.json
├── doc
    ├── HtmlPage.md
    ├── HtmlPageCrawler.md
    └── README.md
├── phpunit.xml.dist
└── src
    ├── Helpers.php
    ├── HtmlPage.php
    └── HtmlPageCrawler.php


/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - master
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   php:
14 |     runs-on: ubuntu-latest
15 | 
16 |     strategy:
17 |       matrix:
18 |         php: [8.0, 8.1, 8.2, 8.3]
19 |         dependency-version: [prefer-lowest, prefer-stable]
20 | 
21 |     steps:
22 |       - name: checkout code
23 |         uses: actions/checkout@v4
24 | 
25 |       - name: setup PHP
26 |         uses: shivammathur/setup-php@v2
27 |         with:
28 |           php-version: ${{ matrix.php }}
29 |           coverage: xdebug
30 | 
31 |       - name: install dependencies
32 |         run: composer update --${{ matrix.dependency-version }}
33 | 
34 |       - name: run tests
35 |         run: php vendor/bin/phpunit
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | vendor
2 | test.php
3 | composer.lock
4 | composer.phar
5 | 


--------------------------------------------------------------------------------
/.phpdoc-md:
--------------------------------------------------------------------------------
 1 | <?php
 2 | return (object)[ // settings object handed to whatever includes this file (presumably the phpdoc-md generator - TODO confirm)
 3 |     'rootNamespace' => 'Wa72\HtmlPageDom',
 4 |     'destDirectory' => 'doc', // the repository's doc/ folder contains HtmlPage.md and HtmlPageCrawler.md
 5 |     'format' => 'github',
 6 |     'classes' => [ // fully qualified names of the two listed classes
 7 |         '\Wa72\HtmlPageDom\HtmlPage',
 8 |         '\Wa72\HtmlPageDom\HtmlPageCrawler'
 9 |     ]
10 | ];
11 | 


--------------------------------------------------------------------------------
/.scrutinizer.yml:
--------------------------------------------------------------------------------
 1 | before_commands:
 2 |     - 'composer install --dev --no-interaction --prefer-source'
 3 | 
 4 | tools:
 5 |     # Code Coverage from Travis
 6 |     external_code_coverage:
 7 |         enabled: true
 8 |         timeout: 300
 9 |         filter:
10 |             excluded_paths:
11 |                 - 'Tests/*'
12 |                 - 'vendor/*'
13 |     php_code_coverage:
14 |         enabled: false
15 | 
16 |     php_code_sniffer:
17 |         enabled: true
18 |         config:
19 |             standard:         PSR2
20 |         filter:
21 |             excluded_paths:
22 |                 - 'vendor/*'
23 | 
24 |     # PHP Mess Detector (http://phpmd.org).
25 |     php_mess_detector:
26 |         enabled:              true
27 |         command:              phpmd
28 |         config:
29 |             rulesets:
30 |                 - codesize
31 |                 - unusedcode
32 |                 - design
33 |         filter:
34 |             excluded_paths:
35 |                 - 'vendor/*'
36 | 
37 |     php_pdepend:
38 |         enabled: true
39 |         excluded_dirs: [vendor, Tests]
40 | 
41 |     php_loc:
42 |         enabled: true
43 |         excluded_dirs: [vendor, Tests]
44 | 
45 |     php_cpd:
46 |         enabled: true
47 |         excluded_dirs: [vendor, Tests]
48 | 
49 |     php_analyzer:
50 |         enabled:              true
51 |         filter:
52 |             excluded_paths:
53 |                 - 'Tests/*'
54 |                 - 'vendor/*'
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | 3.0.0
  2 | =====
  3 | 
  4 | 2022-04-13
  5 | 
  6 | Changed some method signatures (added argument type hints and return types) in HtmlPageCrawler for compatibility with the base Crawler class from Symfony 6. So, this release is only compatible with Symfony 6 and up.
  7 | 
  8 | Otherwise there are no changes, so it does not require changes in code using this lib.
  9 | 
 10 | 2.0.0
 11 | =====
 12 | 
 13 | 2019-10-15
 14 | 
 15 | __BC BREAK__ for compatibility with Symfony 4.3 and up
 16 | 
 17 | - `HtmlPageCrawler::html()` is now just the parent `Crawler::html()` and acts as *getter* only.
 18 |   Setting HTML content via `HtmlPageCrawler::html($html)` is *not possible* any more,
 19 |   use `HtmlPageCrawler::setInnerHtml($html)` instead
 20 | 
 21 | - `HtmlPageCrawler::text()` is now just the parent `Crawler::text()` and acts as *getter* only
 22 |   that returns the text content from the *first* node only. For setting text content, use `HtmlPageCrawler::setText($text)` instead.
 23 |     
 24 | - `HtmlPageCrawler::attr()` is now just the parent `Crawler::attr()` and acts as *getter* only.
 25 |   For setting attributes use `HtmlPageCrawler::setAttribute($name, $value)` instead
 26 | 
 27 | - new method `HtmlPageCrawler::getCombinedText()` that returns the combined text from all nodes (as jQuery's `text()` function does and previous versions of `HtmlPageCrawler::text()` did)
 28 | 
 29 | - removed method `HtmlPageCrawler::isDisconnected()`
 30 | 
 31 | 
 32 | 1.4.2
 33 | =====
 34 | 
 35 | 2019-10-15
 36 | 
 37 | - undo deprecation of getInnerHtml()
 38 | - deprecate setter use of attr()
 39 | - deprecate isDisconnected()
 40 | 
 41 | 
 42 | 1.4.1
 43 | =====
 44 | 
 45 | 2019-06-28
 46 | 
 47 | - Bugfix: setText() should convert special chars. Closes #34.
 48 | 
 49 | 
 50 | 1.4.0
 51 | =====
 52 | 
 53 | 2019-05-17
 54 | 
 55 | Preparation for a smooth migration to 2.x / Symfony 4.3:
 56 | - deprecate setter use of html() and text(),
 57 | - deprecate getInnerHtml(),
 58 | - new methods setText() and getCombinedText()
 59 | 
 60 | 
 61 | 1.3.2
 62 | =====
 63 | 
 64 | 2019-04-18
 65 | 
 66 | - Mark this version as incompatible with Symfony DomCrawler 4.3
 67 | 
 68 | 
 69 | 1.3
 70 | ===
 71 | 
 72 | 2016-10-06
 73 | 
 74 | - new method `unwrapInner` (thanks to [@ttk](https://github.com/ttk))
 75 | 
 76 | - it's now possible to get the number of nodes in the crawler using the
 77 |   `$crawler->length` property like in Javascript instead of `count($crawler)`
 78 | 
 79 | 
 80 | 1.2
 81 | ===
 82 | 
 83 | 2015-11-06
 84 | 
 85 | - new methods `HtmlPage::minify()` and `HtmlPage::indent()` for compressing or nicely indenting the HTML document. These
 86 |   functions rely on the package `wa72/html-pretty-min` that is *suggested* in composer.json.
 87 | 
 88 | 1.1
 89 | ===
 90 | 
 91 | 2015-05-20
 92 | 
 93 | - `text()` function now returns combined text of all elements in set (as jQuery does; previously only the nodeValue of 
 94 |   the first element was returned) and can act as a setter `text($string)` that sets the nodeValue of all elements to
 95 |   the specified string
 96 | 
 97 | - function `hasClass` now returns true if any of the elements in the Crawler has the specified class (previously,
 98 |   only the first element was checked). 
 99 | 
100 | - new function `makeClone` as equivalent to jQuery's `clone` function ("clone" is not a valid function name in PHP).
101 |   As previously, you can alternatively use PHP's clone operator: `$r = $c->makeClone()` is the same as `$r = clone $c`,
102 |   but the new function allows chaining.
103 | 
104 | - new function `removeAttr` aliasing `removeAttribute` for compatibility with jQuery
105 | 
106 | - `appendTo`, `insertBefore`, `insertAfter`, and `replaceAll` now always return a new Crawler object containing
107 |   the aggregate set of all elements appended to the target elements (this is the behavior of jQuery 1.9 and newer).
108 |   
109 | - `attr` function can now act as setter `attr($name, $value)` which is an alias for `setAttribute($name, $value)`
110 |   (previously it accepted only one argument and was a getter equivalent to `getAttribute($name)` only, like it is
111 |   in parent DomCrawler)
112 |   
113 | - `attr($name)` and `getAttribute($name)` now always return `null` if the attribute does not exist (previously, an empty
114 |   string was returned when used with Symfony 2.3)
115 | 
116 | 1.0
117 | ===
118 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012-2022 Christoph Singer
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is furnished
 8 | to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | The software is provided "as is", without warranty of any kind, express or
14 | implied, including but not limited to the warranties of merchantability,
15 | fitness for a particular purpose and noninfringement. In no event shall the
16 | authors or copyright holders be liable for any claim, damages or other
17 | liability, whether in an action of contract, tort or otherwise, arising from,
18 | out of or in connection with the software or the use or other dealings in
19 | the software.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | HtmlPageDom
  2 | ===========
  3 | 
  4 | ![tests](https://github.com/wasinger/htmlpagedom/actions/workflows/tests.yml/badge.svg?branch=master)
  5 | [![Latest Version](http://img.shields.io/packagist/v/wa72/htmlpagedom.svg)](https://packagist.org/packages/wa72/htmlpagedom)
  6 | [![Downloads from Packagist](http://img.shields.io/packagist/dt/wa72/htmlpagedom.svg)](https://packagist.org/packages/wa72/htmlpagedom)
  7 | 
  8 | `Wa72\HtmlPageDom` is a PHP library for easy manipulation of HTML documents using DOM.
  9 | It requires [DomCrawler from Symfony components](https://github.com/symfony/DomCrawler) for traversing 
 10 | the DOM tree and extends it by adding methods for manipulating the DOM tree of HTML documents.    
 11 | 
 12 | It's useful when you need to not just extract information from an HTML file (what DomCrawler does) but
 13 | also to modify HTML pages. It is usable as a template engine: load your HTML template file, set new
 14 | HTML content on certain elements such as the page title, `div#content` or `ul#menu` and print out
 15 | the modified page.
 16 | 
 17 | `Wa72\HtmlPageDom` consists of two main classes:
 18 | 
 19 | -   `HtmlPageCrawler` extends `Symfony\Component\DomCrawler\Crawler` by adding jQuery-inspired, HTML-specific 
 20 |     DOM *manipulation* functions such as `setInnerHtml($htmltext)`, `before()`, `append()`, `wrap()`, `addClass()` or `css()`.
 21 |     It's like jQuery for PHP: simply select elements of an HTML page using CSS selectors and change their 
 22 |     attributes and content. 
 23 |     
 24 |     [API doc for HtmlPageCrawler](doc/HtmlPageCrawler.md)
 25 | 
 26 | -   `HtmlPage` represents one complete HTML document and offers convenience functions like `getTitle()`, `setTitle($title)`,
 27 |     `setMeta('description', $description)`, `getBody()`. Internally, it uses the `HtmlPageCrawler` class for 
 28 |     filtering and manipulating DOM Elements. Since version 1.2, it offers methods for compressing (`minify()`) and
 29 |     prettyprinting (`indent()`) the HTML page.
 30 |     
 31 |     [API doc for HtmlPage](doc/HtmlPage.md)
 32 |  
 33 | 
 34 | Requirements and Compatibility
 35 | ------------------------------
 36 | 
 37 | Version 3.x:
 38 | - PHP 8.x
 39 | - [Symfony\Components\DomCrawler](https://github.com/symfony/DomCrawler) 6.x | 7.x
 40 | - [Symfony\Components\CssSelector](https://github.com/symfony/CssSelector) 6.x | 7.x
 41 | 
 42 | Version 2.x:
 43 | - PHP ^7.4 | 8.x
 44 | - [Symfony\Components\DomCrawler](https://github.com/symfony/DomCrawler) ^4.4 | 5.x
 45 | - [Symfony\Components\CssSelector](https://github.com/symfony/CssSelector) ^4.4 | 5.x
 46 | 
 47 | There is no difference in our API between versions 2.x and 3.0.x.
 48 | The only difference is the compatibility with different versions of Symfony.
 49 | 
 50 | Installation
 51 | ------------
 52 | 
 53 | -   using [composer](http://getcomposer.org): `composer require wa72/htmlpagedom`
 54 | 
 55 | -   using other [PSR-4](http://www.php-fig.org/psr/psr-4/) compliant autoloader:
 56 |     clone this project to where your included libraries are and point your autoloader to look for the 
 57 |     "\Wa72\HtmlPageDom" namespace in the "src" directory of this project
 58 | 
 59 | Usage
 60 | -----
 61 | 
 62 | `HtmlPageCrawler` is a wrapper around DOMNodes. `HtmlPageCrawler` objects can be created using `new` or the static function
 63 | `HtmlPageCrawler::create()`, which accepts an HTML string or a DOMNode (or an array of DOMNodes, a DOMNodeList, or even
 64 | another `Crawler` object) as arguments.
 65 | 
 66 | Afterwards you can select nodes from the added DOM tree by calling `filter()` (equivalent to find() in jQuery) and alter
 67 | the selected elements using the following jQuery-like manipulation functions:
 68 | 
 69 | -   `addClass()`, `hasClass()`, `removeClass()`, `toggleClass()`
 70 | -   `after()`, `before()`
 71 | -   `append()`, `appendTo()`
 72 | -   `makeClone()` (equivalent to `clone()` in jQuery)
 73 | -   `css()` (alias `getStyle()` / `setStyle()`)
 74 | -   `html()` (get inner HTML content) and `setInnerHtml($html)`
 75 | -   `attr()` (alias `getAttribute()` / `setAttribute()`), `removeAttr()`
 76 | -   `insertAfter()`, `insertBefore()`
 77 | -   `makeEmpty()` (equivalent to `empty()` in jQuery)
 78 | -   `prepend()`, `prependTo()`
 79 | -   `remove()`
 80 | -   `replaceAll()`, `replaceWith()`
 81 | -   `text()`, `getCombinedText()` (get text content of all nodes in the Crawler), and `setText($text)`
 82 | -   `wrap()`, `unwrap()`, `wrapInner()`, `unwrapInner()`, `wrapAll()`
 83 | 
 84 | To get the modified DOM as HTML code use `html()` (returns innerHTML of the first node in your crawler object)
 85 | or `saveHTML()` (returns combined "outer" HTML code of all elements in the list).
 86 | 
 87 | See the full methods documentation in the generated [API doc for HtmlPageCrawler](doc/HtmlPageCrawler.md)
 88 | 
 89 | **Example:**
 90 | 
 91 | ```php
 92 | use \Wa72\HtmlPageDom\HtmlPageCrawler;
 93 | 
 94 | // create an object from a fragment of HTML code as you would do with jQuery's $() function
 95 | $c = HtmlPageCrawler::create('<div id="content"><h1>Title</h1></div>');
 96 | 
 97 | // the above is the same as calling:
 98 | $c = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
 99 | 
100 | // filter for h1 elements and wrap them with an HTML structure
101 | $c->filter('h1')->wrap('<div class="innercontent">');
102 | 
103 | // return the modified HTML
104 | echo $c->saveHTML();
105 | // or simply:
106 | echo $c; // implicit __toString() calls saveHTML()
107 | // will output: <div id="content"><div class="innercontent"><h1>Title</h1></div></div>
108 | ```
109 | 
110 | **Advanced example: remove the third column from an HTML table**
111 | 
112 | ```php
113 | use \Wa72\HtmlPageDom\HtmlPageCrawler;
114 | $html = <<<END
115 | <table>
116 |     <tr>
117 |         <td>abc</td>
118 |         <td>adsf</td>
119 |         <td>to be removed</td>
120 |     </tr>
121 |     <tr>
122 |         <td>abc</td>
123 |         <td>adsf</td>
124 |         <td>to be removed</td>
125 |     </tr>
126 |     <tr>
127 |         <td>abc</td>
128 |         <td>adsf</td>
129 |         <td>to be removed</td>
130 |     </tr>
131 | </table>    
132 | END;  
133 | 
134 | $c = HtmlPageCrawler::create($html);
135 | $tr = $c->filter('table > tr > td')
136 |     ->reduce(
137 |         function ($c, $j) {
138 |             if (($j+1) % 3 == 0) {
139 |                 return true;
140 |             }
141 |             return false;
142 |         }
143 |     );
144 | $tr->remove();
145 | echo $c->saveHTML();
146 | ```
147 | 
148 | **Usage examples for the `HtmlPage` class:**
149 | 
150 | ```php
151 | use \Wa72\HtmlPageDom\HtmlPage;
152 | 
153 | // create a new HtmlPage object with an empty HTML skeleton
154 | $page = new HtmlPage();
155 | 
156 | // or create a HtmlPage object from an existing page
157 | $page = new HtmlPage(file_get_contents('http://www.heise.de'));
158 | 
159 | // get or set page title
160 | echo $page->getTitle();
161 | $page->setTitle('New page title');
162 | echo $page->getTitle();
163 | 
164 | 
165 | // add HTML content
166 | $page->filter('body')->setInnerHtml('<div id="content"><h1>This is the headline</h1><p class="text">This is a paragraph</p></div>');
167 | 
168 | // select elements by css selector
169 | $h1 = $page->filter('#content h1');
170 | $p = $page->filter('p.text');
171 | 
172 | // change attributes and content of an element
173 | $h1->addClass('headline')->css('margin-top', '10px')->setInnerHtml('This is the <em>new</em> headline');
174 | 
175 | $p->removeClass('text')->append('<br>There is more than one line in this paragraph');
176 | 
177 | // add a new paragraph to div#content
178 | $page->filter('#content')->append('<p>This is a new paragraph.</p>');
179 | 
180 | // add a class and some attribute to all paragraphs
181 | $page->filter('p')->addClass('newclass')->setAttribute('data-foo', 'bar');
182 | 
183 | 
184 | // get HTML content of an element
185 | echo $page->filter('#content')->saveHTML();
186 | 
187 | // output the whole HTML page
188 | echo $page->save();
189 | // or simply:
190 | echo $page;
191 | 
192 | // output formatted HTML code
193 | echo $page->indent()->save();
194 | 
195 | // output compressed (minified) HTML code
196 | echo $page->minify()->save();
197 | ```
198 | 
199 | See also the generated [API doc for HtmlPage](doc/HtmlPage.md)
200 | 
201 | Limitations
202 | -----------
203 | 
204 | - HtmlPageDom builds on top of PHP's DOM functions and uses the loadHTML() and saveHTML() methods of the DOMDocument class.
205 | That's why its output is always HTML, not XHTML.
206 | 
207 | - The HTML parser used by PHP is built for HTML4. It raises errors for HTML5-specific
208 | elements; HtmlPageDom ignores these errors, so it is usable for HTML5 with some limitations.
209 | 
210 | - HtmlPageDom has not been tested with character encodings other than UTF-8.
211 | 
212 | 
213 | History
214 | -------
215 | 
216 | When I discovered how easy it was to modify HTML documents using jQuery I looked for a PHP library providing similar
217 | possibilities for PHP.
218 | 
219 | Googling around I found [SimpleHtmlDom](http://simplehtmldom.sourceforge.net)
220 | and later [Ganon](http://code.google.com/p/ganon) but both turned out to be very slow. Nevertheless I used both
221 | libraries in my projects.
222 | 
223 | When Symfony2 appeared with its DomCrawler and CssSelector components I thought:
224 | the functions for traversing the DOM tree and selecting elements by CSS selectors are already there, only the
225 | manipulation functions are missing. Let's implement them! So the HtmlPageDom project was born.
226 | 
227 | It turned out that it was a good choice to build on PHP's DOM functions: Compared to SimpleHtmlDom and Ganon, HtmlPageDom
228 | is lightning fast. In one of my projects, I have a PHP script that takes a huge HTML page containing several hundreds
229 | of article elements and extracts them into individual HTML files (that are later on demand loaded by AJAX back into the
230 | original HTML page). Using SimpleHtmlDom it took the script 3 minutes (right, minutes!) to run (and I needed to raise
231 | PHP's memory limit to over 500MB). Using Ganon as HTML parsing and manipulation engine it took even longer,
232 | about 5 minutes. After switching to HtmlPageDom the same script doing the same processing tasks runs in only about
233 | one second (all on the same server). HtmlPageDom is really fast.
234 | 
235 | 
236 | © 2012-2023 Christoph Singer. Licensed under the MIT License.
237 | 
238 | 


--------------------------------------------------------------------------------
/Resources/jquerytest.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head lang="en">
 4 |     <meta charset="UTF-8">
 5 |     <script src="https://code.jquery.com/jquery-2.1.4.js"></script>
 6 |     <title>Testing jquery object identities</title>
 7 | </head>
 8 | <body>
 9 | <h1>Testing jquery object identities</h1>
10 | <p>This page contains javascript code to figure out in which cases jQuery returns references to existing objects
11 | and when it makes copies.</p>
12 | <p>test paragraph 2<span>555</span></p>
13 | <p>test paragraph 3</p>
14 | 
15 | <script>
16 |     (function() { // installs a uniqueId() helper on Object.prototype so object identities can be logged below
17 |         if ( typeof Object.prototype.uniqueId == "undefined" ) {
18 |             var id = 0; // counter kept in this closure and shared by every uniqueId() call
19 |             Object.prototype.uniqueId = function() {
20 |                 if ( typeof this.__uniqueid == "undefined" ) {
21 |                     this.__uniqueid = ++id; // first call on this object: take the next number and store it on the object
22 |                 }
23 |                 return this.__uniqueid;
24 |             };
25 |         }
26 |     })();
27 |     $(document).ready(function(){
28 |         var $a = $('<span style="font-weight: bold;"> asdf</span>');
29 |         var $b = $('p');
30 |         var $h = $('h1');
31 |         var $ba, $ha;
32 |         // Append the same span first to every paragraph, then to the headline, keeping both return values
33 |         $ba = $a.appendTo($b);
34 |         $ha = $a.appendTo($h);
35 |         // Log the ids of the original jQuery wrapper and of its underlying element
36 |         console.log('$a: ' + $a.uniqueId());
37 |         console.log('span: ' + $a[0].uniqueId());
38 |         // Log the ids of what appendTo() returned for the paragraphs (indices 0-2: this page has three <p> elements)
39 |         console.log('$b: ' + $b.uniqueId());
40 |         console.log($ba);
41 |         console.log('$ba: ' + $ba.uniqueId());
42 |         console.log('$ba span 0: ' + $ba[0].uniqueId());
43 |         console.log('$ba span 1: ' + $ba[1].uniqueId());
44 |         console.log('$ba span 2: ' + $ba[2].uniqueId());
45 |         // Same for what appendTo() returned for the headline
46 |         console.log('$ha: ' + $ha.uniqueId());
47 |         console.log('$ha span 0: ' + $ha[0].uniqueId());
48 | 
49 |         console.log($b.text());
50 |         // Set a markup-looking string via text() and log text() again for comparison
51 |         $b.text('<span>444</span>');
52 | 
53 |         console.log($b.text());
54 | 
55 | 
56 |         // Test for issue #33 https://github.com/wasinger/htmlpagedom/issues/33
57 |         // Works like reporter expects in jquery but not in HtmlPageDom
58 | 
59 |         var $rootNode = $('<div />').appendTo($('body'));
60 |         var $p = $('<p />');
61 |         var $testNode = $('<span />');
62 |         $testNode.text('incorrect text');
63 |         $p.append($testNode);
64 |         $rootNode.append($p);
65 | 
66 |         // Change test node text after node appended
67 |         $testNode.text('correct text');
68 | 
69 |         // Output root or parent node html. Incorrect in HtmlPageDom, Correct in jquery
70 |         console.log($rootNode.html());
71 |         console.log($p.html());
72 | 
73 |         // Output node html. Correct
74 |         console.log($testNode.html());
75 | 
76 |         // Second test: adding node to multiple nodes.
77 |         // If $testNode is appended to multiple elements it doesn't work in jquery, either:
78 |         $rootNode = $('<div />').appendTo($('body'));
79 |         $p = $('<p /><p />');
80 |         $testNode = $('<span />');
81 |         $testNode.text('incorrect text');
82 |         $p.append($testNode);
83 |         $rootNode.append($p);
84 | 
85 |         // Change test node text after node appended
86 |         $testNode.text('correct text');
87 | 
88 |         // Output root or parent node html. Incorrect in jquery and HtmlPageDom
89 |         console.log($rootNode.html());
90 |         console.log($p.html());
91 | 
92 |         // Output node html. Correct
93 |         console.log($testNode.html());
94 | 
95 |     });
96 | </script>
97 | </body>
98 | </html>


--------------------------------------------------------------------------------
/Tests/HelpersTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | namespace Wa72\HtmlPageDom\Tests;
 3 | 
 4 | use Wa72\HtmlPageDom\Helpers;
 5 | use org\bovigo\vfs\vfsStream;
 6 | use PHPUnit\Framework\TestCase;
 7 | 
 8 | class HelpersTest extends TestCase
 9 | {
10 |     /** The chunk without a "property: value" form must be dropped when a CSS string is parsed. */
11 |     public function testCssStringToArray()
12 |     {
13 |         $css = 'invalid_css_string;font-size: 15px;font-weight: bold;font-color: black;';
14 |         $expected = ['font-size' => '15px', 'font-weight' => 'bold', 'font-color' => 'black'];
15 | 
16 |         $this->assertEquals($expected, Helpers::cssStringToArray($css));
17 |     }
18 | 
19 |     /** A property map must be serialized into one string of "name: value;" declarations. */
20 |     public function testCssArrayToString()
21 |     {
22 |         $styles = ['font-size' => '15px', 'font-weight' => 'bold', 'font-color' => 'black'];
23 |         $expected = 'font-size: 15px;font-weight: bold;font-color: black;';
24 | 
25 |         $this->assertEquals($expected, Helpers::cssArrayToString($styles));
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/Tests/HtmlPageCrawlerTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | namespace Wa72\HtmlPageDom\Tests;
  3 | 
  4 | use Wa72\HtmlPageDom\HtmlPageCrawler;
  5 | use PHPUnit\Framework\TestCase;
  6 | 
  7 | class HtmlPageCrawlerTest extends TestCase
  8 | {
  9 |     /**
 10 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::__construct
 11 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::filter
 12 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::getFirstNode
 13 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::nodeName
 14 |      */
 15 |     public function testHtmlPageCrawler()
 16 |     {
 17 |         $crawler = new HtmlPageCrawler();
 18 |         $crawler->addHtmlContent('<!doctype html><html><body><div id="content"><h1>Title</h1></div></body></html>');
 19 |         $headline = $crawler->filter('#content > h1');
 20 |         // filter() has to hand back an HtmlPageCrawler that wraps a genuine DOM node
 21 |         $this->assertInstanceOf('\Wa72\HtmlPageDom\HtmlPageCrawler', $headline);
 22 |         $this->assertInstanceOf('\DOMNode', $headline->getNode(0));
 23 |         $this->assertEquals('h1', $headline->nodeName());
 24 |     }
 25 | 
 26 |     /**
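 27 |      * Strips all line feeds from a string so that HTML comparisons in the tests
 28 |      * do not depend on line breaks in the serialized markup.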
 29 |      * @param $string
 30 |      * @return string
 31 |      */
 32 |     private function _ignoreNewlines($string)
 33 |     {
 34 |         return strtr($string, ["\n" => '']); // delete every LF so line breaks in the markup do not affect comparisons
 35 |     }
 36 | 
 37 |     /**
 38 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setInnerHtml
 39 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prepend
 40 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::makeEmpty
 41 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setAttribute
 42 |      */
 43 |     public function testManipulationFunctions()
 44 |     {
 45 |         $c = new HtmlPageCrawler;
 46 |         $c->addHtmlContent('<!doctype html><html><body><div id="content"><h1>Title</h1></div></body></html>');
 47 |         // Several fragments below leave tags unclosed on purpose; the expected strings show them closed
 48 |         $content = $c->filter('#content');
 49 |         $content->append('<p>Das ist ein Testabsatz');
 50 |         $this->assertEquals("<h1>Title</h1><p>Das ist ein Testabsatz</p>", $this->_ignoreNewlines($content->html()));
 51 |         // setInnerHtml() replaces the previous children of #content
 52 |         $content->setInnerHtml('<p>Ein neuer <b>Inhalt</p>');
 53 |         $this->assertEquals('<p>Ein neuer <b>Inhalt</b></p>', $content->html());
 54 |         // prepend() puts the new headline in front of the existing paragraph
 55 |         $content->prepend('<h1>Neue Überschrift');
 56 |         $this->assertEquals('<h1>Neue Überschrift</h1><p>Ein neuer <b>Inhalt</b></p>', $content->html());
 57 |         // Inserted nodes must be reachable from the sub-crawler ...
 58 |         $h1 = $content->filter('h1');
 59 |         $this->assertEquals('Neue Überschrift', $h1->text());
 60 | 
 61 |         $b = $content->filter('b');
 62 |         $this->assertEquals('Inhalt', $b->text());
 63 |         // ... and from the root crawler as well
 64 |         $b2 = $c->filter('#content p b');
 65 |         $this->assertEquals('Inhalt', $b2->text());
 66 | 
 67 |         $content->append('<p class="a2">Zweiter Absatz</p>');
 68 |         $content->append('<p class="a3"><b>Dritter Absatz</b> und noch mehr Text</p>');
 69 | 
 70 |         $a3 = $content->filter('p.a3');
 71 |         $this->assertEquals('<b>Dritter Absatz</b> und noch mehr Text', $a3->html());
 72 | 
 73 |         $a3b = $a3->filter('b');
 74 |         $this->assertEquals('Dritter Absatz', $a3b->text());
 75 |         // All changes made through $content must show up when the body is serialized
 76 |         $body = $c->filter('body');
 77 |         $this->assertEquals('<div id="content"><h1>Neue Überschrift</h1><p>Ein neuer <b>Inhalt</b></p><p class="a2">Zweiter Absatz</p><p class="a3"><b>Dritter Absatz</b> und noch mehr Text</p></div>', $this->_ignoreNewlines($body->html()));
 78 | 
 79 |         $paragraphs = $c->filter('p');
 80 |         $this->assertCount(3, $paragraphs);
 81 |         // append() on a multi-node crawler adds the fragment to each of the three paragraphs
 82 |         $paragraphs->append('<span class="appended">.</span>');
 83 |         $this->assertEquals('<p>Ein neuer <b>Inhalt</b><span class="appended">.</span></p><p class="a2">Zweiter Absatz<span class="appended">.</span></p><p class="a3"><b>Dritter Absatz</b> und noch mehr Text<span class="appended">.</span></p>', $c->filter('p')->saveHTML());
 84 | 
 85 |         $body->makeEmpty();
 86 |         $this->assertEmpty($body->html());
 87 | 
 88 |         $body->setAttribute('class', 'mybodyclass');
 89 |         $this->assertEquals('mybodyclass', $body->attr('class'));
 90 |     }
 91 | 
 92 |     /**
 93 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::append
 94 |      */
 95 |     public function testAppend()
 96 |     {
 97 |         $threeParagraphs = '<p>Paragraph 1</p><p>Paragraph 2</p><p>Paragraph 3</p>';
 98 |         $expectedWithBreak = '<p>Paragraph 1<br>Appended Text</p><p>Paragraph 2<br>Appended Text</p><p>Paragraph 3<br>Appended Text</p>';
 99 |         // Case 1: a plain HTML string is appended to each of the matched elements
100 |         $crawler = new HtmlPageCrawler($threeParagraphs);
101 |         $crawler->filter('p')->append('<br>Appended Text');
102 |         $this->assertEquals($expectedWithBreak, $crawler->saveHTML());
103 |         // Case 2: another HtmlPageCrawler is appended to each of the matched elements
104 |         $crawler = new HtmlPageCrawler($threeParagraphs);
105 |         $crawler->filter('p')->append(new HtmlPageCrawler('<br>Appended Text'));
106 |         $this->assertEquals($expectedWithBreak, $crawler->saveHTML());
107 |         // Case 3: a DOMNode created in the crawler's own document is appended to each matched element
108 |         $crawler = new HtmlPageCrawler($threeParagraphs);
109 |         $spanNode = $crawler->getDOMDocument()->createElement('span', 'Appended Text');
110 |         $crawler->filter('p')->append($spanNode);
111 |         $this->assertEquals('<p>Paragraph 1<span>Appended Text</span></p><p>Paragraph 2<span>Appended Text</span></p><p>Paragraph 3<span>Appended Text</span></p>', $crawler->saveHTML());
112 |         // Case 4: appending a node to the element that already contains it leaves the markup unchanged
113 |         $crawler = new HtmlPageCrawler('<div id="content"><span>Append Self</span></div>');
114 |         $crawler->filter('#content')->append($crawler->filter('span'));
115 |         $this->assertEquals('<div id="content"><span>Append Self</span></div>', $crawler->saveHTML());
116 |     }
117 | 
118 |     /**
119 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::appendTo
120 |      */
121 |     public function testAppendTo()
122 |     {
123 |         $doc = new HtmlPageCrawler('<div id="content"><h1>Title</h1><em>Big</em></div>');
124 |         $doc->filter('em')->appendTo($doc->filter('h1')); // the existing <em> is expected to end up inside the <h1>
125 |         $this->assertEquals('<div id="content"><h1>Title<em>Big</em></h1></div>', $doc->saveHTML());
126 |         // Appending an element to itself is expected to leave the document untouched
127 |         $selfDoc = new HtmlPageCrawler('<div id="content"><h1>Self Title</h1></div>');
128 |         $selfDoc->filter('h1')->appendTo($selfDoc->filter('h1'));
129 |         $this->assertEquals('<div id="content"><h1>Self Title</h1></div>', $selfDoc->saveHTML());
130 |     }
131 | 
132 |     /**
133 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::isHtmlDocument
134 |      */
135 |     public function testIsHtmlDocument()
136 |     {
137 |         $document = new \DOMDocument('1.0', 'UTF-8');
138 |         $document->loadHTML('<!DOCTYPE html><html><body><div id="content"><h1>Title</h1></div></body></html>');
139 |         $crawler = new HtmlPageCrawler($document);
140 |         // A crawler built from a whole DOMDocument counts as an HTML document
141 |         $this->assertTrue($crawler->isHtmlDocument());
142 |         // ... whereas a sub-selection of that document does not
143 |         $body = $crawler->filter('body');
144 |         $this->assertFalse($body->isHtmlDocument());
145 |         // A markup fragment without an <html> element is not a document
146 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
147 |         $this->assertFalse($crawler->isHtmlDocument());
148 |         // A string that contains the <html> element is a document
149 |         $crawler = new HtmlPageCrawler('<html><body><div id="content"><h1>Title</h1></div></body></html>');
150 |         $this->assertTrue($crawler->isHtmlDocument());
151 |     }
152 | 
153 |     /**
154 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::saveHTML
155 |      */
156 |     public function testSaveHTML()
157 |     {
158 |         $markup = "<!DOCTYPE html><html><body><h1>Title</h1><p>Paragraph 1</p><p>Paragraph 2</p></body></html>";
159 |         $document = new \DOMDocument('1.0', 'UTF-8');
160 |         $document->loadHTML($markup);
161 |         $crawler = new HtmlPageCrawler($document);
162 |         $this->assertEquals($markup, $this->_ignoreNewlines($crawler->saveHTML()));
163 |         $paragraphs = $crawler->filter('p');
164 |         $this->assertEquals('<p>Paragraph 1</p><p>Paragraph 2</p>', $paragraphs->saveHTML());
165 |         $heading = $crawler->filter('h1');
166 |         $this->assertEquals('<h1>Title</h1>', $heading->saveHTML());
167 |         // A fragment is serialised as-is, without doctype or wrapper elements
168 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
169 |         $this->assertEquals('<div id="content"><h1>Title</h1></div>', $crawler->saveHTML());
170 |     }
171 | 
172 |     /**
173 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::css
174 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::getStyle
175 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::setStyle
176 |      */
177 |     public function testCss()
178 |     {
179 |         $document = new \DOMDocument('1.0', 'UTF-8');
180 |         $document->loadHTML('<!DOCTYPE html><html><body><div id="content"><h1 style=" margin-top:
181 |          10px;border-bottom:  1px solid red">Title</h1></div></body></html>');
182 |         $crawler = new HtmlPageCrawler($document);
183 |         $heading = $crawler->filter('h1');
184 |         $this->assertEquals('10px', $heading->css('margin-top')); // whitespace and the line break in the attribute are ignored
185 |         $this->assertEquals('1px solid red', $heading->css('border-bottom'));
186 |         $heading->css('margin-bottom', '20px'); // two arguments: css() acts as a setter
187 |         $this->assertEquals('20px', $heading->css('margin-bottom'));
188 |         $this->assertEquals('10px', $heading->getStyle('margin-top'));
189 |         $this->assertEquals('<h1 style="margin-top: 10px;border-bottom: 1px solid red;margin-bottom: 20px;">Title</h1>', $heading->saveHTML());
190 |         $heading->setStyle('border-bottom', ''); // an empty value drops the property
191 |         $this->assertEquals('<h1 style="margin-top: 10px;margin-bottom: 20px;">Title</h1>', $heading->saveHTML());
192 |         $heading->setStyle('padding-top', '0'); // '0' is a real value and must be kept
193 |         $this->assertEquals('<h1 style="margin-top: 10px;margin-bottom: 20px;padding-top: 0;">Title</h1>', $heading->saveHTML());
194 |         $this->assertEquals('0', $heading->getStyle('padding-top'));
195 |         $this->assertNull($heading->getStyle('border-bottom'));
196 |     }
197 | 
198 |     /**
199 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::addClass
200 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::removeClass
201 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::hasClass
202 |      */
203 |     public function testClasses()
204 |     {
205 |         $document = new \DOMDocument('1.0', 'UTF-8');
206 |         $document->loadHTML('<!DOCTYPE html><html><body><div id="content"><h1 class="style_class">Title</h1></div></body></html>');
207 |         $crawler = new HtmlPageCrawler($document);
208 |         $heading = $crawler->filter('h1');
209 |         $heading->addClass('ueberschrift');
210 |         $heading->addClass('nochneklasse');
211 |         $heading->addClass('style_class'); // already present, must not be duplicated
212 |         $this->assertEquals('<h1 class="style_class ueberschrift nochneklasse">Title</h1>', $heading->saveHTML());
213 |         $this->assertTrue($heading->hasClass('ueberschrift'));
214 |         $this->assertTrue($heading->hasClass('nochneklasse'));
215 |         $this->assertTrue($heading->hasClass('style_class'));
216 |         $heading->removeClass('nochneklasse');
217 |         $this->assertTrue($heading->hasClass('ueberschrift'));
218 |         $this->assertFalse($heading->hasClass('nochneklasse'));
219 |         $heading->addClass('class1 class2'); // space-separated list adds both classes
220 |         $this->assertTrue($heading->hasClass('class1'));
221 |         $this->assertTrue($heading->hasClass('class2'));
222 |         // hasClass() is true as soon as any node of the set carries the class
223 |         $paragraphs = new HtmlPageCrawler('<p class="a"></p><p class="b"></p><p class="c"></p>');
224 |         $this->assertTrue($paragraphs->hasClass('b'));
225 |     }
226 | 
227 |     /**
228 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::addContent
229 |      */
230 |     public function testAddContent()
231 |     {
232 |         $crawler = new HtmlPageCrawler();
233 |         $crawler->addContent('<html><body><div id="content"><h1>Title</h1></div></body>');
234 |         $this->assertEquals(
235 |             '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
236 |             . '<html><body><div id="content"><h1>Title</h1></div></body></html>',
237 |             $this->_ignoreNewlines($crawler->saveHTML())
238 |         );
239 |         // Unclosed tags of a fragment are closed on output
240 |         $crawler = new HtmlPageCrawler();
241 |         $crawler->addContent('<div id="content"><h1>Title');
242 |         $this->assertEquals('<div id="content"><h1>Title</h1></div>', $crawler->saveHTML());
243 |         // An unclosed <p> followed by another <p> yields two sibling paragraphs
244 |         $crawler = new HtmlPageCrawler();
245 |         $crawler->addContent('<p>asdf<p>asdfaf</p>');
246 |         $this->assertCount(2, $crawler);
247 |         $this->assertEquals('<p>asdf</p><p>asdfaf</p>', $crawler->saveHTML());
248 |     }
249 | 
250 |     /**
251 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::before
252 |      */
253 |     public function testBefore()
254 |     {
255 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
256 |         $crawler->filter('h1')->before('<p>Text before h1</p>');
257 |         $this->assertEquals('<div id="content"><p>Text before h1</p><h1>Title</h1></div>', $crawler->saveHTML());
258 |         // A crawler holding several nodes is inserted in its original order
259 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
260 |         $crawler->filter('h1')->before(new HtmlPageCrawler('<p>Text before h1</p><p>and more text before</p>'));
261 |         $this->assertEquals('<div id="content"><p>Text before h1</p><p>and more text before</p><h1>Title</h1></div>', $crawler->saveHTML());
262 |         // Inserting an element before itself must leave the markup unchanged
263 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Self Before</h1></div>');
264 |         $crawler->filter('h1')->before($crawler->filter('h1'));
265 |         $this->assertEquals('<div id="content"><h1>Self Before</h1></div>', $crawler->saveHTML());
266 |     }
267 | 
268 |     /**
269 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::insertBefore
270 |      */
271 |     public function testInsertBefore()
272 |     {
273 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1><p>Text before h1</p></div>');
274 |         $crawler->filter('p')->insertBefore($crawler->filter('h1'));
275 |         $this->assertEquals('<div id="content"><p>Text before h1</p><h1>Title</h1></div>', $crawler->saveHTML());
276 |         // Inserting an element before itself must leave the markup unchanged
277 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Self Insert Before Title</h1><p>Text after h1</p></div>');
278 |         $crawler->filter('h1')->insertBefore($crawler->filter('h1'));
279 |         $this->assertEquals('<div id="content"><h1>Self Insert Before Title</h1><p>Text after h1</p></div>', $crawler->saveHTML());
280 |     }
281 | 
282 |     /**
283 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::after
284 |      */
285 |     public function testAfter()
286 |     {
287 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
288 |         $crawler->filter('h1')->after('<p>Text after h1</p>');
289 |         $this->assertEquals('<div id="content"><h1>Title</h1><p>Text after h1</p></div>', $crawler->saveHTML());
290 |         // With several targets, the inserted nodes are repeated after each of them
291 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1><h1>Title2</h1></div>');
292 |         $crawler->filter('h1')->after(new HtmlPageCrawler('<p>Text after h1</p><p>and more text after</p>'));
293 |         $this->assertEquals('<div id="content"><h1>Title</h1><p>Text after h1</p><p>and more text after</p><h1>Title2</h1><p>Text after h1</p><p>and more text after</p></div>', $crawler->saveHTML());
294 |         // Inserting an element after itself must leave the markup unchanged
295 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Self After</h1></div>');
296 |         $crawler->filter('h1')->after($crawler->filter('h1'));
297 |         $this->assertEquals('<div id="content"><h1>Self After</h1></div>', $crawler->saveHTML());
298 |     }
299 | 
300 |     /**
301 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::insertAfter
302 |      */
303 |     public function testInsertAfter()
304 |     {
305 |         $crawler = new HtmlPageCrawler('<div id="content"><p>Text after h1</p><h1>Title</h1></div>');
306 |         $crawler->filter('p')->insertAfter($crawler->filter('h1'));
307 |         $this->assertEquals('<div id="content"><h1>Title</h1><p>Text after h1</p></div>', $crawler->saveHTML());
308 |         // Inserting an element after itself must leave the markup unchanged
309 |         $crawler = new HtmlPageCrawler('<div id="content"><p>Text before h1</p><h1>Self Insert After Title</h1></div>');
310 |         $crawler->filter('h1')->insertAfter($crawler->filter('h1'));
311 |         $this->assertEquals('<div id="content"><p>Text before h1</p><h1>Self Insert After Title</h1></div>', $crawler->saveHTML());
312 |     }
313 | 
314 |     /**
315 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prepend
316 |      */
317 |     public function testPrepend()
318 |     {
319 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
320 |         $crawler->filter('#content')->prepend('<p>Text before h1</p>');
321 |         $this->assertEquals('<div id="content"><p>Text before h1</p><h1>Title</h1></div>', $crawler->saveHTML());
322 |         // Prepending several nodes to an empty element keeps their order
323 |         $crawler = new HtmlPageCrawler('<div id="content"></div>');
324 |         $crawler->filter('#content')->prepend(new HtmlPageCrawler('<p>Text before h1</p><p>and more text before</p>'));
325 |         $this->assertEquals('<div id="content"><p>Text before h1</p><p>and more text before</p></div>', $crawler->saveHTML());
326 |         // Prepending an element to the parent it already lives in leaves the markup unchanged
327 |         $crawler = new HtmlPageCrawler('<div id="content"><span>Prepend Self</span></div>');
328 |         $crawler->filter('#content')->prepend($crawler->filter('span'));
329 |         $this->assertEquals('<div id="content"><span>Prepend Self</span></div>', $crawler->saveHTML());
330 |     }
331 | 
332 |     /**
333 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::prependTo
334 |      */
335 |     public function testPrependTo()
336 |     {
337 |         $crawler = new HtmlPageCrawler('<div id="content"><p>Text before</p></div>');
338 |         $crawler->filter('p')->prependTo('Text'); // target is unrelated plain text: source markup stays as it was
339 |         $this->assertEquals('<div id="content"><p>Text before</p></div>', $crawler->saveHTML());
340 |         // Prepending to a separately created crawler does not alter the source markup either
341 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
342 |         $crawler->filter('#content')->prependTo(new HtmlPageCrawler('<p>paragraph</p>'));
343 |         $this->assertEquals('<div id="content"><h1>Title</h1></div>', $crawler->saveHTML());
344 |         // Within one document the element is moved to the front of the target's content
345 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1><em>Big</em></div>');
346 |         $crawler->filter('em')->prependTo($crawler->filter('h1'));
347 |         $this->assertEquals('<div id="content"><h1><em>Big</em>Title</h1></div>', $crawler->saveHTML());
348 |         // Prepending an element to itself must leave the markup unchanged
349 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Self Title</h1></div>');
350 |         $crawler->filter('h1')->prependTo($crawler->filter('h1'));
351 |         $this->assertEquals('<div id="content"><h1>Self Title</h1></div>', $crawler->saveHTML());
352 |     }
353 | 
354 |     /**
355 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrap
356 |      */
357 |     public function testWrap()
358 |     {
359 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
360 |         $crawler->filter('h1')->wrap('<div class="innercontent">');
361 |         $this->assertEquals('<div id="content"><div class="innercontent"><h1>Title</h1></div></div>', $crawler->saveHTML());
362 |         // With a nested wrapper, the element ends up inside the innermost wrapper element
363 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
364 |         $crawler->filter('h1')->wrap('<div class="ic">asdf<div class="a1"><div class="a2"></div></div></div></div>');
365 |         $this->assertEquals('<div id="content"><div class="ic">asdf<div class="a1"><div class="a2"><h1>Title</h1></div></div></div></div>', $crawler->saveHTML());
366 | 
367 |         $crawler = new HtmlPageCrawler('<div id="content"><h1>Title</h1></div>');
368 |         $crawler->filter('h1')->wrap('<div class="ic">asdf</div><div>jkl</div>'); // the wrapper markup has two root elements
369 |         $this->assertEquals('<div id="content"><div class="ic">asdf<h1>Title</h1></div></div>', $crawler->saveHTML()); // only the first root is used
370 | 
371 |         // Each matched node gets its own copy of the wrapper
372 |         $crawler = new HtmlPageCrawler('<div id="content"><p>p1</p><p>p2</p></div>');
373 |         $crawler->filter('p')->wrap('<div class="p"></div>');
374 |         $this->assertEquals('<div id="content"><div class="p"><p>p1</p></div><div class="p"><p>p2</p></div></div>', $crawler->saveHTML());
375 |         // A bare text node can be wrapped as well; the wrapper is then its first ancestor
376 |         $crawler = new HtmlPageCrawler('plain text node');
377 |         $crawler->wrap('<div class="ic"></div>');
378 |         $this->assertEquals('<div class="ic">plain text node</div>', $crawler->ancestors()->eq(0)->saveHTML());
379 |         // Text nodes appended one after another are wrapped independently
380 |         $container = HtmlPageCrawler::create('<div>');
381 |         $message = HtmlPageCrawler::create('message 1')->appendTo($container);
382 |         $message->wrap('<p>');
383 |         $message = HtmlPageCrawler::create('message 2')->appendTo($container);
384 |         $message->wrap('<p>');
385 |         $this->assertEquals('<div><p>message 1</p><p>message 2</p></div>', $container->saveHTML());
386 |     }
387 | 
388 |     /**
389 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::replaceWith
390 |      */
391 |     public function testReplaceWith()
392 |     {
393 |         $crawler = HtmlPageCrawler::create('<div id="content"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div>');
394 |         $replaced = $crawler->filter('p')->replaceWith('<div>newtext 1</div><div>newtext 2</div>'); // the return value holds the removed nodes
395 |         $this->assertEquals('<div id="content"><div>newtext 1</div><div>newtext 2</div><div>newtext 1</div><div>newtext 2</div><div>newtext 1</div><div>newtext 2</div></div>', $crawler->saveHTML());
396 |         $this->assertEquals('<p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p>', $replaced->saveHTML());
397 |     }
398 | 
399 |     /**
400 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::replaceAll
401 |      */
402 |     public function testReplaceAll()
403 |     {
404 |         $crawler = HtmlPageCrawler::create('<div id="content"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div>');
405 |         $replacement = HtmlPageCrawler::create('<div>newtext 1</div><div>newtext 2</div>');
406 |         $replacement->replaceAll($crawler->filter('p')); // every paragraph is swapped for the full replacement set
407 |         $this->assertEquals('<div id="content"><div>newtext 1</div><div>newtext 2</div><div>newtext 1</div><div>newtext 2</div><div>newtext 1</div><div>newtext 2</div></div>', $crawler->saveHTML());
408 |     }
409 | 
410 |     /**
411 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrapAll
412 |      */
413 |     public function testWrapAll()
414 |     {
415 |         $crawler = HtmlPageCrawler::create('<div id="content"><div>Before</div><p>Absatz 1</p><div>Inner</div><p>Absatz 2</p><p>Absatz 3</p><div>After</div></div>');
416 |         $crawler->filter('p')->wrapAll('<div class="a">'); // all paragraphs are gathered at the position of the first one
417 |         $this->assertEquals('<div id="content"><div>Before</div><div class="a"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div><div>Inner</div><div>After</div></div>', $crawler->saveHTML());
418 | 
419 |         // With a nested wrapper, the nodes end up inside the innermost wrapper element
420 |         $crawler = HtmlPageCrawler::create('<div id="content"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div>');
421 |         $crawler->filter('p')->wrapAll('<article><section><div class="a"></div></section></article>');
422 |         $this->assertEquals('<div id="content"><article><section><div class="a"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div></section></article></div>', $crawler->saveHTML());
423 |     }
424 | 
425 |     /**
426 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::wrapInner
427 |      */
428 |     public function testWrapInner()
429 |     {
430 |         $crawler = HtmlPageCrawler::create('<div id="content"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div>');
431 |         $crawler->wrapInner('<div class="a">'); // the wrapper goes around the children, not around the element itself
432 |         $this->assertEquals('<div id="content"><div class="a"><p>Absatz 1</p><p>Absatz 2</p><p>Absatz 3</p></div></div>', $crawler->saveHTML());
433 |     }
434 | 
435 |     /**
436 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::unwrap
437 |      */
438 |     public function testUnwrap()
439 |     {
440 |         $crawler = HtmlPageCrawler::create('<div id="content"><div>Before</div><div class="a"><p>Absatz 1</p></div><div>After</div></div>');
441 |         $paragraph = $crawler->filter('p');
442 |         $paragraph->unwrap(); // removes the paragraph's parent <div class="a"> but keeps the paragraph
443 |         $this->assertEquals('<div id="content"><div>Before</div><p>Absatz 1</p><div>After</div></div>', $crawler->saveHTML());
444 |     }
445 |     
446 |     public function testUnwrapInnerOnDOMElementExeption()
447 |     {
448 |         $this->expectException(\InvalidArgumentException::class);
449 |         $this->expectExceptionMessage('DOMElement does not have a parent DOMElement node.');
450 |         // Calling unwrapInner() twice on the same node: presumably the first call detaches it, so the second finds no parent element
451 |         $crawler = HtmlPageCrawler::create('<div id="content"></div>');
452 |         $content = $crawler->filter('div#content');
453 |         $content->unwrapInner();
454 |         $content->unwrapInner();
455 |     }
456 | 
457 |     /**
458 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::unwrapInner
459 |      */
460 |     public function testUnwrapInner()
461 |     {
462 |         $crawler = HtmlPageCrawler::create('<div id="content"><div>Before</div><div class="a"><p>Absatz 1</p></div><div>After</div></div>');
463 |         $wrapper = $crawler->filter('div.a');
464 |         $wrapper->unwrapInner(); // the selected element is removed, its children take its place
465 |         $this->assertEquals('<div id="content"><div>Before</div><p>Absatz 1</p><div>After</div></div>', $crawler->saveHTML());
466 |     }
467 | 
468 |     /**
469 |      * @covers Wa72\HtmlPageDom\HtmlPageCrawler::toggleClass
470 |      */
471 |     public function testToggleClass()
472 |     {
473 |         $crawler = HtmlPageCrawler::create('<div id="1" class="a c"><div id="2" class="b c"></div></div>');
474 |         $crawler->filter('div')->toggleClass('a d')->toggleClass('b'); // present classes are removed, missing ones added, per node
475 |         $this->assertEquals('<div id="1" class="c d b"><div id="2" class="c a d"></div></div>', $crawler->saveHTML());
476 |     }
477 | 
478 |     public function testRemove()
479 |     {
480 |         // Fixture: six body rows with three cells each; the last cell of every row is removed
481 |         $html = <<<END
482 | <table>
483 |     <thead>
484 |     <tr>
485 |         <th>A</th>
486 |         <th>B</th>
487 |     </tr>
488 |     </thead>
489 |     <tbody>
490 |     <tr class="r1">
491 |         <td class="c11">16.12.2013</td>
492 |         <td class="c12">asdf asdf</td>
493 |         <td class="c13">&nbsp;</td>
494 |     </tr>
495 |     <tr class="r2">
496 |         <td class="c21">02.12.2013 16:30</td>
497 |         <td class="c22">asdf asdf</td>
498 |         <td class="c23">&nbsp;</td>
499 |     </tr>
500 |     <tr class="r3">
501 |         <td class="c31">25.11.2013 16:30</td>
502 |         <td class="c32">asdf asdf</td>
503 |         <td class="c33">&nbsp;</td>
504 |     </tr>
505 |     <tr class="r4">
506 |         <td class="c41">18.11.2013 16:30</td>
507 |         <td class="c42">asdf asdf</td>
508 |         <td class="c43">&nbsp;</td>
509 |     </tr>
510 |     <tr class="r5">
511 |         <td class="c51">24.10.2013 16:30</td>
512 |         <td class="c52">asdf asdf</td>
513 |         <td class="c53">&nbsp;</td>
514 |     </tr>
515 |     <tr class="r6">
516 |         <td class="c61">10.10.2013 16:30</td>
517 |         <td class="c62">asdf asdf</td>
518 |         <td class="c63">&nbsp;</td>
519 |     </tr>
520 | </table>
521 | END;
522 |         $crawler = HtmlPageCrawler::create($html);
523 |         $this->assertCount(1, $crawler->filter('td.c23'));
524 |         $thirdCells = $crawler->filter('table > tbody > tr > td')
525 |             ->reduce(
526 |                 function ($node, $position) {
527 |                     // Shift the position by one before the modulo so that
528 |                     // exactly every third matched cell is kept in the set
529 |                     $isThirdCell = ($position + 1) % 3 == 0;
530 |                     return $isThirdCell;
531 |                 }
532 |             );
533 |         $this->assertCount(6, $thirdCells);
534 |         $thirdCells->remove();
535 |         $this->assertCount(0, $thirdCells);
536 |         $this->assertCount(0, $crawler->filter('td.c23'));
537 |     }
538 | 
539 |     public function testUTF8Characters()
540 |     {
541 |         $markup = file_get_contents(__DIR__ . '/utf8.html');
542 |         $crawler = HtmlPageCrawler::create($markup);
543 |         // Non-ASCII characters of the fixture are expected as named HTML entities in the output
544 |         $expected = <<<END
545 | <p style="margin: 0cm 0cm 0pt;"><span>Die Burse&nbsp;wurde unmittelbar (1478 bis 1482) nach der Universit&auml;tsgr&uuml;ndung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Geb&auml;ude im Stil des Klassizismus zum ersten T&uuml;binger Klinikum umgebaut. Einer der ersten Patienten war Friedrich H&ouml;lderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.</span></p><p style="margin: 0cm 0cm 0pt;"><span>Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat &ndash; r&uuml;ckw&auml;rts gewandt &ndash; einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.</span></p>
546 | END;
547 | 
548 |         $this->assertEquals($expected, $crawler->filter('p')->saveHTML());
549 |     }
550 | 
551 |     public function testAttr()
552 |     {
553 |         // An attribute that was never set reads as null
554 |         $element = HtmlPageCrawler::create('<div>');
555 |         $this->assertNull($element->attr('data-foo'));
556 |         $element->setAttribute('data-foo', 'bar');
557 |         $this->assertEquals('bar', $element->attr('data-foo'));
558 |         $this->assertEquals('bar', $element->getAttribute('data-foo'));
559 |         $element->removeAttribute('data-foo');
560 |         $this->assertNull($element->attr('data-foo'));
561 |         $element->setAttribute('data-foo', 'bar');
562 |         $this->assertEquals('bar', $element->attr('data-foo'));
563 |         // getAttribute() is an alias of attr(), so both must return the same value
564 |         $this->assertEquals('bar', $element->getAttribute('data-foo'));
565 |         $element->removeAttr('data-foo');
566 |         $this->assertNull($element->attr('data-foo'));
567 |     }
568 |     
569 |     public function testAttrOnInvalidNodeList()
570 |     {
571 |         $this->expectException(\InvalidArgumentException::class);
572 |         $emptyCrawler = HtmlPageCrawler::create(null); // created from null, i.e. without any content
573 |         $emptyCrawler->attr('data-foo');
574 |     }
575 | 
576 |     public function testSetInnerHtml()
577 |     {
578 |         $crawler = HtmlPageCrawler::create('<h1>Title</h1>');
579 |         $this->assertInstanceOf('Wa72\HtmlPageDom\HtmlPageCrawler', $crawler->setInnerHtml('<h2>Title</h2>'));
580 |         $this->assertEquals('<h2>Title</h2>', $crawler->html());
581 |         // getInnerHtml() is an alias of html(), so both must return the same markup
582 |         $this->assertEquals('<h2>Title</h2>', $crawler->getInnerHtml());
583 |     }
584 | 
585 |     public function testToString()
586 |     {
587 |         $heading = HtmlPageCrawler::create('<h2>Title</h2>');
588 |         $this->assertEquals('<h2>Title</h2>', (string) $heading); // the string cast yields the markup
589 |     }
590 | 
591 |     public function testGetDOMDocument()
592 |     {
593 |         $heading = HtmlPageCrawler::create('<h2>Title</h2>');
594 |         $this->assertInstanceOf('\DOMDocument', $heading->getDOMDocument()); // a crawler built from a fragment still exposes a DOMDocument
595 |     }
596 | 
597 |     public function testAddOnCrawlerInstance()
598 |     {
599 |         $heading = HtmlPageCrawler::create('<h1>Title</h1>');
600 |         $heading->add($heading); // adding a crawler to itself must not duplicate its node
601 |         $this->assertEquals('<h1>Title</h1>', (string) $heading);
602 |     }
603 | 
604 |     public function testReturnValues()
605 |     {
606 |         // appendTo, insertBefore, insertAfter and replaceAll must always hand back a new Crawler object
607 |         // see http://jquery.com/upgrade-guide/1.9/#appendto-insertbefore-insertafter-and-replaceall
608 | 
609 |         $headline = HtmlPageCrawler::create('<h1>Headline</h1>');
610 |         $paragraphs = HtmlPageCrawler::create('<p>1</p><p>2</p><p>3</p>');
611 |         $span = HtmlPageCrawler::create('<span>asdf</span>');
612 |         // Single-node target: the result must not be the inserted crawler itself
613 |         $appended = $span->appendTo($headline);
614 |         $this->assertNotSame($span, $appended);
615 | 
616 |         $insertedBefore = $span->insertBefore($headline);
617 |         $this->assertNotSame($span, $insertedBefore);
618 | 
619 |         $insertedAfter = $span->insertAfter($headline);
620 |         $this->assertNotSame($span, $insertedAfter);
621 | 
622 |         $replacedAll = $span->replaceAll($headline);
623 |         $this->assertNotSame($span, $replacedAll);
624 | 
625 |         // Multi-node target: the result must not be the target crawler either
626 |         $appended = $span->appendTo($paragraphs);
627 |         $this->assertNotSame($paragraphs, $appended);
628 | 
629 |         $insertedBefore = $span->insertBefore($paragraphs);
630 |         $this->assertNotSame($paragraphs, $insertedBefore);
631 | 
632 |         $insertedAfter = $span->insertAfter($paragraphs);
633 |         $this->assertNotSame($paragraphs, $insertedAfter);
634 | 
635 |         $replacedAll = $span->replaceAll($paragraphs);
636 |         $this->assertNotSame($paragraphs, $replacedAll);
637 | 
638 |     }
639 | 
640 |     public function testDisconnectedNodes()
641 |     {
642 |         // When after(), before() or replaceWith() is called on a node that has no parent,
643 |         // the very same Crawler object has to come back unmodified
644 |         //
645 |         // see http://jquery.com/upgrade-guide/1.9/#after-before-and-replacewith-with-disconnected-nodes
646 |         $detached = HtmlPageCrawler::create('<div>abc</div>');
647 |         $other = HtmlPageCrawler::create('<div>def</div>');
648 | 
649 |         $afterResult = $detached->after($other);
650 |         $this->assertSame($detached, $afterResult);
651 |         $this->assertEquals(count($afterResult), count($detached));
652 | 
653 |         $beforeResult = $detached->before($other);
654 |         $this->assertSame($detached, $beforeResult);
655 |         $this->assertEquals(count($beforeResult), count($detached));
656 | 
657 |         $replaceResult = $detached->replaceWith($other);
658 |         $this->assertSame($detached, $replaceResult);
659 |         $this->assertEquals(count($replaceResult), count($detached));
660 |     }
661 | 
662 |     public function testClone()
663 |     {
664 |         $crawler = HtmlPageCrawler::create('<div><p class="x">asdf</p></div>');
665 |         $original = $crawler->filter('p');
666 |         // The clone is a distinct object, and changing it must not affect the original node
667 |         $copy = $original->makeClone();
668 |         $this->assertNotSame($original, $copy);
669 |         $this->assertTrue($copy->hasClass('x'));
670 |         $copy->removeClass('x');
671 |         $this->assertTrue($original->hasClass('x'));
672 |         $this->assertFalse($copy->hasClass('x'));
673 |         $original->after($copy);
674 |         $this->assertEquals('<div><p class="x">asdf</p><p class="">asdf</p></div>', $crawler->saveHTML());
675 |     }
676 | 
677 |     public function testGetCombinedText()
678 |     {
679 |         $paragraphs = HtmlPageCrawler::create('<p>abc</p><p>def</p>');
680 |         $this->assertEquals('abcdef', $paragraphs->getCombinedText()); // texts of all nodes, concatenated
681 |         $paragraphs->setText('jklo'); // sets the text on every node of the set
682 |         $this->assertEquals('jklojklo', $paragraphs->getCombinedText());
683 |     }
684 | 
685 |     public function testSetText()
686 |     {
687 |         $element = HtmlPageCrawler::create('<div>&quot;</div>');
688 |         $this->assertEquals('"', $element->text()); // entities are decoded when reading text
689 |         $element->setText('&'); // a raw ampersand must round-trip as plain text
690 |         $this->assertEquals('&', $element->text());
691 |     }
692 | 
693 |     public function testMagicGet()
694 |     {
695 |         // The magic "length" property reports the number of nodes held by the crawler
696 |         $paragraphs = HtmlPageCrawler::create('<p>abc</p><p>def</p>');
697 |         $this->assertEquals(2, $paragraphs->length);
698 | 
699 |         // Reading an unknown property has to raise an exception
700 |         try {
701 |             $paragraphs->foo;
702 |         } catch (\Exception $exception) {
703 |             $this->assertEquals('No such property foo', $exception->getMessage());
704 |             return;
705 |         }
706 |         $this->fail();
707 |     }
708 | }
709 | 


--------------------------------------------------------------------------------
/Tests/HtmlPageTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | namespace Wa72\HtmlPageDom\Tests;
  3 | 
  4 | use Wa72\HtmlPageDom\HtmlPage;
  5 | use org\bovigo\vfs\vfsStream;
  6 | use PHPUnit\Framework\TestCase;
  7 | 
  8 | class HtmlPageTest extends TestCase
  9 | {
 10 |     public function setUp(): void
 11 |     {
 12 |         $this->root = vfsStream::setup('root'); // NOTE(review): no $root property is declared in the visible part of this class; dynamic properties are deprecated since PHP 8.2 -- confirm and declare it on the class
 13 |     }
 14 | 
 15 |     public function testHtmlPage()
 16 |     {
 17 |         $page = new HtmlPage;
 18 |         $this->assertEquals("<!DOCTYPE html>\n<html><head><title></title></head><body></body></html>\n", $page->__toString());
 19 | 
 20 |         $title = 'Erste Testseite';
 21 |         $page->setTitle($title);
 22 |         $this->assertEquals($title, $page->getTitle());
 23 |         // Quotes, ampersands, angle brackets and umlauts must round-trip through the title
 24 |         $title = 'Seite "schön & gut" >> so wird\'s, süß';
 25 |         $page->setTitle($title);
 26 |         $this->assertEquals($title, $page->getTitle());
 27 | 
 28 |         $description = 'Dies ist die erste "Testseite" >> so wird\'s, süß';
 29 |         $page->setMeta('description', $description);
 30 |         $this->assertEquals($description, $page->getMeta('description'));
 31 | 
 32 |         $page->removeMeta('description');
 33 |         $this->assertNull($page->getMeta('description'));
 34 | 
 35 |         $bodyMarkup = '<div id="content">Testcontent1</div>';
 36 |         $body = $page->filter('body');
 37 |         $body->setInnerHtml($bodyMarkup);
 38 |         $this->assertEquals($bodyMarkup, $body->html());
 39 |         $this->assertEquals($bodyMarkup, $page->filter('body')->html());
 40 | 
 41 |         $content = "<h1>Überschrift</h1>\n<p>bla bla <br><b>fett</b></p>";
 42 |         $page->setHtmlById('content', $content);
 43 |         // The element addressed by id must now hold exactly the new markup
 44 |         $this->assertEquals($content, $page->getElementById('content')->html());
 45 | 
 46 |         $url = 'http://www.tuebingen.de/';
 47 |         $page->setBaseHref($url);
 48 |         $this->assertEquals($url, $page->getBaseHref());
 49 |     }
 50 | 
 51 | 
 52 |     public function testClone()
 53 |     {
 54 |         $page = new HtmlPage;
 55 |         $this->assertEquals("<!DOCTYPE html>\n<html><head><title></title></head><body></body></html>\n", $page->__toString());
 56 | 
 57 |         $title = 'Erste Testseite';
 58 |         $page->setTitle($title);
 59 |         $this->assertEquals($title, $page->getTitle());
 60 | 
 61 |         $copy = clone $page;
 62 |         // Changing the original afterwards must not show up in the clone
 63 |         $newTitle = 'Seitentitel neu';
 64 |         $page->setTitle($newTitle);
 65 | 
 66 |         $this->assertEquals($title, $copy->getTitle());
 67 |         $this->assertEquals($newTitle, $page->getTitle());
 68 |     }
 69 | 
 70 |     public function testScript()
 71 |     {
 72 |         $source = <<<END
 73 | <!DOCTYPE html>
 74 | <html>
 75 | <head>
 76 | <title></title>
 77 | <script>
 78 | // this will be awesome
 79 | alert('Hello world');
 80 | </script>
 81 | </head>
 82 | <body>
 83 | </body>
 84 | </html>
 85 | 
 86 | END;
 87 |         $page = new HtmlPage($source);
 88 |         $page->getBody()->append('<h1>Script Test</h1>');
 89 |         $saved = $page->save();
 90 |         // The inline script, including its comment line, must survive the round trip untouched
 91 |         $expected = <<<END
 92 | <!DOCTYPE html>
 93 | <html>
 94 | <head>
 95 | <title></title>
 96 | <script>
 97 | // this will be awesome
 98 | alert('Hello world');
 99 | </script>
100 | </head>
101 | <body>
102 | <h1>Script Test</h1></body>
103 | </html>
104 | 
105 | END;
106 |         $this->assertEquals($expected, $saved);
107 | 
108 |     }
109 | 
110 |     public function testMinify()
111 |     {
112 |         $source =<<<HTML
113 | <!DOCTYPE html>
114 | <html>
115 | <head>
116 | <title></title>
117 | <script>
118 | // this will be awesome
119 | alert('Hello world');
120 | </script>
121 | </head>
122 | <body>
123 |     <h1>TEST</h1>
124 |     <p class="">
125 |     asdf jksdlf ajsfk
126 |     <b>jasdf
127 |     jaksfd asdf</b>
128 |     <a>jasdf jaks</a>
129 |     </p>
130 | </body>
131 | </html>
132 | 
133 | HTML;
134 |         $page = new HtmlPage($source);
135 |         // Expected: script comment, empty class attribute and redundant whitespace are gone.
136 |         $minified = <<<HTML
137 | <!DOCTYPE html>
138 | <html><head><title></title><script>alert('Hello world');</script></head><body><h1>TEST</h1><p>asdf jksdlf ajsfk <b>jasdf jaksfd asdf</b> <a>jasdf jaks</a></p></body></html>
139 | 
140 | HTML;
141 |         $this->assertEquals($minified, $page->minify()->save());
142 | 
143 |     }
144 | 
145 |     public function testIndent()
146 |     {
147 |         $source =<<<HTML
148 | <!DOCTYPE html>
149 | <html>
150 | <head>
151 | <title></title>
152 | <script>
153 | // this will be awesome
154 | alert('Hello world');
155 | </script>
156 | </head>
157 | <body>
158 |     <h1>TEST</h1>
159 |     <p>
160 |     asdf jksdlf ajsfk
161 |     <b>jasdf
162 |     jaksfd asdf</b>
163 |     <a>jasdf jaks</a>
164 |     </p>
165 | </body>
166 | </html>
167 | 
168 | HTML;
169 |         $page = new HtmlPage($source);
170 |         // Expected: one tab per nesting level, script body left at column 0, <p> content on one line.
171 |         $indented = <<<HTML
172 | <!DOCTYPE html>
173 | <html>
174 | 	<head>
175 | 		<title></title>
176 | 		<script>
177 | // this will be awesome
178 | alert('Hello world');
179 | 		</script>
180 | 	</head>
181 | 	<body>
182 | 		<h1>TEST</h1>
183 | 		<p>asdf jksdlf ajsfk <b>jasdf jaksfd asdf</b> <a>jasdf jaks</a></p>
184 | 	</body>
185 | </html>
186 | 
187 | HTML;
188 |         $this->assertEquals($indented, $page->indent()->save());
189 | 
190 |     }
191 | 
192 |     public function testGetCrawler()
193 |     {
194 |         $source = <<<HTML
195 | <!DOCTYPE html>
196 | <html>
197 | <head>
198 | <title></title>
199 | <script>
200 | // this will be awesome
201 | alert('Hello world');
202 | </script>
203 | </head>
204 | <body>
205 |     <h1>TEST</h1>
206 |     <p class="">
207 |     asdf jksdlf ajsfk
208 |     <b>jasdf
209 |     jaksfd asdf</b>
210 |     <a>jasdf jaks</a>
211 |     </p>
212 | </body>
213 | </html>
214 | 
215 | HTML;
216 |         // The crawler returned for the page must be able to select and serialize the <h1>.
217 |         $headline = (new HtmlPage($source))->getCrawler()->filter('h1');
218 |         $this->assertEquals('<h1>TEST</h1>', $headline->saveHtml());
219 |     }
220 | 
221 |     public function testGetDOMDocument()
222 |     {
223 |         $source = <<<HTML
224 | <!DOCTYPE html>
225 | <html>
226 | <head>
227 | <title></title>
228 | <script>
229 | // this will be awesome
230 | alert('Hello world');
231 | </script>
232 | </head>
233 | <body>
234 |     <h1>TEST</h1>
235 |     <p class="">
236 |     asdf jksdlf ajsfk
237 |     <b>jasdf
238 |     jaksfd asdf</b>
239 |     <a>jasdf jaks</a>
240 |     </p>
241 | </body>
242 | </html>
243 | 
244 | HTML;
245 |         // getDOMDocument() must hand back a \DOMDocument instance for a parsed page.
246 |         $page = new HtmlPage($source);
247 |         $this->assertInstanceOf(\DOMDocument::class, $page->getDOMDocument());
248 |     }
249 | 
250 |     public function testSetTitleOnNoTitleElement()
251 |     {
252 |         $source = <<<HTML
253 | <!DOCTYPE html>
254 | <html>
255 | <head>
256 | <script>
257 | // this will be awesome
258 | alert('Hello world');
259 | </script>
260 | </head>
261 | <body>
262 |     <h1>TEST</h1>
263 |     <p class="">
264 |     asdf jksdlf ajsfk
265 |     <b>jasdf
266 |     jaksfd asdf</b>
267 |     <a>jasdf jaks</a>
268 |     </p>
269 | </body>
270 | </html>
271 | 
272 | HTML;
273 |         // The source has no <title> element, so setTitle() cannot rely on an existing one.
274 |         $page = new HtmlPage($source);
275 |         $page->setTitle('TEST');
276 |         $this->assertEquals('TEST', $page->getTitle());
277 |     }
278 | 
279 |     public function testGetTitleShouldReturnNull()
280 |     {
281 |         $source = <<<HTML
282 | <!DOCTYPE html>
283 | <html>
284 | <head>
285 | <script>
286 | // this will be awesome
287 | alert('Hello world');
288 | </script>
289 | </head>
290 | <body>
291 |     <h1>TEST</h1>
292 |     <p class="">
293 |     asdf jksdlf ajsfk
294 |     <b>jasdf
295 |     jaksfd asdf</b>
296 |     <a>jasdf jaks</a>
297 |     </p>
298 | </body>
299 | </html>
300 | 
301 | HTML;
302 |         // Without a <title> element in the source, getTitle() is expected to yield null.
303 |         $title = (new HtmlPage($source))->getTitle();
304 |         $this->assertNull($title);
305 |     }
306 | 
307 |     public function testGetBaseHrefShouldReturnNull()
308 |     {
309 |         // The document below contains no <base> element.
310 |         $this->assertNull((new HtmlPage('<!DOCTYPE html><html><head><title>TEST</title></head><body>Hello</body></html>'))->getBaseHref());
311 |     }
312 | 
313 |     public function testGetHeadNodeShouldAddTheHeadTag()
314 |     {
315 |         $page = new HtmlPage('<!DOCTYPE html><html><body>Hello</body></html>'); // input has no <head>
316 |         $this->assertInstanceOf(\DOMElement::class, $page->getHeadNode());
317 |         $this->assertEquals('<head></head>', (string) $page->getHead()); // an empty <head> is now present
318 |     }
319 | 
320 |     public function testGetBodyNodeShouldAddTheBodyTag()
321 |     {
322 |         $page = new HtmlPage('<!DOCTYPE html><html></html>'); // input has no <body>
323 |         $this->assertInstanceOf(\DOMElement::class, $page->getBodyNode());
324 |         $this->assertEquals('<body></body>', (string) $page->getBody()); // an empty <body> is now present
325 |     }
326 | 
327 |     public function testTrimNewlines()
328 |     {
329 |         $multiline = <<<HTML
330 | <!DOCTYPE html>
331 | <html>
332 |     <head>
333 |     <title>TEST</title>
334 |     </head>
335 | </html>
336 | HTML;
337 |         $flattened = (string) HtmlPage::trimNewlines($multiline); // newline/indent runs become single spaces
338 |         $this->assertEquals('<!DOCTYPE html> <html> <head> <title>TEST</title> </head> </html>', $flattened);
339 |     }
340 | 
341 |     public function testSaveOnFileName()
342 |     {
343 |         $target = vfsStream::url('root/save.html'); // path inside the virtual file system
344 |         (new HtmlPage('<!DOCTYPE html><html><head><title>TEST</title></head></html>'))->save($target);
345 |         $this->assertFileExists($target);
346 |     }
347 | 
348 |     public function testEmbeddedScriptWithHtml()
349 |     {
350 |         // DOMDocument::loadHTML() tends to swallow closing tags of HTML strings inside <script> elements,
351 |         // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
352 |         $source = <<<HTML
353 | <!DOCTYPE html>
354 | <html lang="de">
355 | <head>
356 |     <title>test</title>
357 | </head>
358 | <body>
359 | <div>
360 |     <script>
361 |         var html = '<b>Status</b><div>' + it_status_text + '</div>';
362 |     </script>
363 | </div>
364 | </body>
365 | </html>
366 | HTML;
367 |         $page = new HtmlPage($source);
368 |         $this->assertEquals($source . "\n", $page->save()); // output equals the input plus one trailing newline
369 |     }
370 | }
371 | 


--------------------------------------------------------------------------------
/Tests/phpunit_bootstrap.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | // Default: this package is checked out as a stand-alone project.
 3 | $loader = __DIR__ . '/../vendor/autoload.php';
 4 |  
 5 | // Prefer the host project's autoloader when we live inside its vendor directory.
 6 | if (file_exists(__DIR__ . '/../../../../vendor/autoload.php')) {
 7 |     $loader = __DIR__ . '/../../../../vendor/autoload.php';
 8 | }
 9 | // Test for the file explicitly so errors raised inside the autoloader are not hidden by "@".
10 | if (!is_file($loader) || !($loader = include $loader)) {
11 |     echo <<<EOM
12 | You must set up the project dependencies by running the following commands:
13 |  
14 |     curl -s https://getcomposer.org/installer | php
15 |     php composer.phar install
16 |  
17 | EOM;
18 |  
19 |     exit(1);
20 | }
21 | 


--------------------------------------------------------------------------------
/Tests/utf8.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | 
 3 | <html lang="de">
 4 | <head>
 5 | <meta charset="utf-8" />
 6 | </head>
 7 | <body>
 8 |         <p style="margin: 0cm 0cm 0pt;"><span>Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.</span></p>
 9 |         <p style="margin: 0cm 0cm 0pt;"><span>Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt – einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.</span></p>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/UPGRADE.md:
--------------------------------------------------------------------------------
 1 | Upgrade from 2.x to 3.0
 2 | -----------------------
 3 | 
 4 | Release 3.x requires Symfony 6 or newer, while older releases are compatible with Symfony up to 5.4.
 5 | Otherwise there are no changes in our API, so no changes should be required in your code using this lib. Just upgrade to version 3 when you upgrade your project to Symfony 6 (or later) and all should be well.
 6 | 
 7 | 
 8 | Upgrade from 1.x to 2.0
 9 | ------------------------
10 | 
11 | Several changes have been made to the public API in 2.0 in order to keep
12 | compatibility with Symfony 4.3:
13 | 
14 | - `HtmlPageCrawler::html()` is now just the parent `Crawler::html()` and acts as *getter* only.
15 |   Setting HTML content via `HtmlPageCrawler::html($html)` is *not possible* any more,
16 |   use `HtmlPageCrawler::setInnerHtml($html)` instead
17 | 
18 | - `HtmlPageCrawler::text()` is now just the parent `Crawler::text()` and acts as *getter* only
19 |   that returns the text content from the *first* node only. For setting text content, use
20 |   `HtmlPageCrawler::setText($text)` instead.
21 |    
22 | - new method `HtmlPageCrawler::getCombinedText()` that returns the combined text from all nodes
23 |   (as jQuery's `text()` function does and previous versions of `HtmlPageCrawler::text()` did)
24 | 
25 | - `HtmlPageCrawler::attr()` is now just the parent `Crawler::attr()` and acts as *getter* only.
26 |   For setting attributes use `HtmlPageCrawler::setAttribute($name, $value)` 
27 | 
28 | - removed method `HtmlPageCrawler::isDisconnected()`
29 | 
30 | __To update your code, you have to:__
31 | 
32 | - replace all calls to `$MyCrawlerInstance->html($html)` used as *setter* by `$MyCrawlerInstance->setInnerHtml($html)`
33 | - replace all calls to `$MyCrawlerInstance->attr($name, $value)` used as *setter* by `$MyCrawlerInstance->setAttribute($name, $value)`
34 | - replace all calls to `$MyCrawlerInstance->text($text)` used as *setter* by `$MyCrawlerInstance->setText($text)`
35 | - replace all calls to `$MyCrawlerInstance->text()` (i.e. every call to `text()` not preceded by `first()`) by `$MyCrawlerInstance->getCombinedText()`
36 | - replace all calls to `$MyCrawlerInstance->first()->text()` by `$MyCrawlerInstance->text()`
37 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name":"wa72/htmlpagedom",
 3 |     "description":"jQuery-inspired DOM manipulation extension for Symfony's Crawler",
 4 |     "keywords":["HTML", "DOM", "Crawler"],
 5 |     "homepage":"http://github.com/wasinger/htmlpagedom",
 6 |     "type":"library",
 7 |     "license":"MIT",
 8 |     "authors":[
 9 |         {
10 |             "name":"Christoph Singer",
11 |             "email":"singer@webagentur72.de",
12 |             "homepage":"http://www.webagentur72.de"
13 |         }
14 |     ],
15 |     "require":{
16 |         "php":"^8.0",
17 |         "ext-dom":"*",
18 |         "ext-libxml":"*",
19 |         "symfony/polyfill-mbstring": "~1.0",
20 |         "symfony/dom-crawler":"^6.0 || ^7.0",
21 |         "symfony/css-selector":"^6.0 || ^7.0"
22 |     },
23 |     "require-dev": {
24 |         "phpunit/phpunit": "^9",
25 |         "wa72/html-pretty-min": "~0.1",
26 |         "mikey179/vfsstream": "^1.6.10",
27 |         "scrutinizer/ocular": "^1.9",
28 |         "clean/phpdoc-md": "^0.19.3"
29 |     },
30 |     "suggest": {
31 |         "wa72/html-pretty-min": "Minify or indent HTML documents"
32 |     },
33 |     "autoload":{
34 |         "psr-4":{
35 |             "Wa72\\HtmlPageDom\\":"src/"
36 |         }
37 |     },
38 |     "extra": {
39 |         "branch-alias": {
40 |             "dev-master": "3.0-dev"
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/doc/HtmlPage.md:
--------------------------------------------------------------------------------
  1 | # Wa72\HtmlPageDom\HtmlPage  
  2 | 
  3 | This class represents a complete HTML document.
  4 | 
  5 | It offers convenience functions for getting and setting elements of the document
  6 | such as setTitle(), getTitle(), setMeta($name, $value), getBody().
  7 | 
  8 | It uses HtmlPageCrawler to navigate and manipulate the DOM tree.  
  9 | 
 10 | ## Implements:
 11 | Stringable
 12 | 
 13 | 
 14 | 
 15 | ## Methods
 16 | 
 17 | | Name | Description |
 18 | |------|-------------|
 19 | |[__clone](#htmlpage__clone)||
 20 | |[__construct](#htmlpage__construct)||
 21 | |[__toString](#htmlpage__tostring)||
 22 | |[filter](#htmlpagefilter)|Filter nodes by using a CSS selector|
 23 | |[filterXPath](#htmlpagefilterxpath)|Filter nodes by XPath expression|
 24 | |[getBaseHref](#htmlpagegetbasehref)|Get the href attribute from the base tag, null if not present in document|
 25 | |[getBody](#htmlpagegetbody)|Get the document's body wrapped in a HtmlPageCrawler instance|
 26 | |[getBodyNode](#htmlpagegetbodynode)|Get the document's body as DOMElement|
 27 | |[getCrawler](#htmlpagegetcrawler)|Get a HtmlPageCrawler object containing the root node of the HTML document|
 28 | |[getDOMDocument](#htmlpagegetdomdocument)|Get a DOMDocument object for the HTML document|
 29 | |[getElementById](#htmlpagegetelementbyid)|Get an element in the document by its id attribute|
 30 | |[getHead](#htmlpagegethead)|Get the document's HEAD section wrapped in a HtmlPageCrawler instance|
 31 | |[getHeadNode](#htmlpagegetheadnode)|Get the document's HEAD section as DOMElement|
 32 | |[getMeta](#htmlpagegetmeta)|Get the content attribute of a meta tag with the specified name attribute|
 33 | |[getTitle](#htmlpagegettitle)|Get the page title of the HTML document|
 34 | |[indent](#htmlpageindent)|indent the HTML document|
 35 | |[minify](#htmlpageminify)|minify the HTML document|
 36 | |[removeMeta](#htmlpageremovemeta)|Remove all meta tags with the specified name attribute|
 37 | |[save](#htmlpagesave)|Save this document to a HTML file or return HTML code as string|
 38 | |[setBaseHref](#htmlpagesetbasehref)|Set the base tag with href attribute set to parameter $url|
 39 | |[setHtmlById](#htmlpagesethtmlbyid)|Sets innerHTML content of an element specified by elementId|
 40 | |[setMeta](#htmlpagesetmeta)|Set a META tag with specified 'name' and 'content' attributes|
 41 | |[setTitle](#htmlpagesettitle)|Sets the page title of the HTML document|
 42 | |[trimNewlines](#htmlpagetrimnewlines)|remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space)|
 43 | 
 44 | 
 45 | 
 46 | 
 47 | ### HtmlPage::__clone  
 48 | 
 49 | **Description**
 50 | 
 51 | ```php
 52 |  __clone (void)
 53 | ```
 54 | 
 55 |  
 56 | 
 57 |  
 58 | 
 59 | **Parameters**
 60 | 
 61 | `This function has no parameters.`
 62 | 
 63 | **Return Values**
 64 | 
 65 | `void`
 66 | 
 67 | 
 68 | <hr />
 69 | 
 70 | 
 71 | ### HtmlPage::__construct  
 72 | 
 73 | **Description**
 74 | 
 75 | ```php
 76 |  __construct (void)
 77 | ```
 78 | 
 79 |  
 80 | 
 81 |  
 82 | 
 83 | **Parameters**
 84 | 
 85 | `This function has no parameters.`
 86 | 
 87 | **Return Values**
 88 | 
 89 | `void`
 90 | 
 91 | 
 92 | <hr />
 93 | 
 94 | 
 95 | ### HtmlPage::__toString  
 96 | 
 97 | **Description**
 98 | 
 99 | ```php
100 |  __toString (void)
101 | ```
102 | 
103 |  
104 | 
105 |  
106 | 
107 | **Parameters**
108 | 
109 | `This function has no parameters.`
110 | 
111 | **Return Values**
112 | 
113 | `string`
114 | 
115 | 
116 | <hr />
117 | 
118 | 
119 | ### HtmlPage::filter  
120 | 
121 | **Description**
122 | 
123 | ```php
124 | public filter (string $selector)
125 | ```
126 | 
127 | Filter nodes by using a CSS selector 
128 | 
129 |  
130 | 
131 | **Parameters**
132 | 
133 | * `(string) $selector`
134 | : CSS selector  
135 | 
136 | **Return Values**
137 | 
138 | `\HtmlPageCrawler`
139 | 
140 | 
141 | 
142 | 
143 | <hr />
144 | 
145 | 
146 | ### HtmlPage::filterXPath  
147 | 
148 | **Description**
149 | 
150 | ```php
151 | public filterXPath (string $xpath)
152 | ```
153 | 
154 | Filter nodes by XPath expression 
155 | 
156 |  
157 | 
158 | **Parameters**
159 | 
160 | * `(string) $xpath`
161 | : XPath expression  
162 | 
163 | **Return Values**
164 | 
165 | `\HtmlPageCrawler`
166 | 
167 | 
168 | 
169 | 
170 | <hr />
171 | 
172 | 
173 | ### HtmlPage::getBaseHref  
174 | 
175 | **Description**
176 | 
177 | ```php
178 | public getBaseHref (void)
179 | ```
180 | 
181 | Get the href attribute from the base tag, null if not present in document 
182 | 
183 |  
184 | 
185 | **Parameters**
186 | 
187 | `This function has no parameters.`
188 | 
189 | **Return Values**
190 | 
191 | `null|string`
192 | 
193 | 
194 | 
195 | 
196 | <hr />
197 | 
198 | 
199 | ### HtmlPage::getBody  
200 | 
201 | **Description**
202 | 
203 | ```php
204 | public getBody (void)
205 | ```
206 | 
207 | Get the document's body wrapped in a HtmlPageCrawler instance 
208 | 
209 |  
210 | 
211 | **Parameters**
212 | 
213 | `This function has no parameters.`
214 | 
215 | **Return Values**
216 | 
217 | `\HtmlPageCrawler`
218 | 
219 | 
220 | 
221 | 
222 | <hr />
223 | 
224 | 
225 | ### HtmlPage::getBodyNode  
226 | 
227 | **Description**
228 | 
229 | ```php
230 | public getBodyNode (void)
231 | ```
232 | 
233 | Get the document's body as DOMElement 
234 | 
235 |  
236 | 
237 | **Parameters**
238 | 
239 | `This function has no parameters.`
240 | 
241 | **Return Values**
242 | 
243 | `\DOMElement`
244 | 
245 | 
246 | 
247 | 
248 | <hr />
249 | 
250 | 
251 | ### HtmlPage::getCrawler  
252 | 
253 | **Description**
254 | 
255 | ```php
256 | public getCrawler (void)
257 | ```
258 | 
259 | Get a HtmlPageCrawler object containing the root node of the HTML document 
260 | 
261 |  
262 | 
263 | **Parameters**
264 | 
265 | `This function has no parameters.`
266 | 
267 | **Return Values**
268 | 
269 | `\HtmlPageCrawler`
270 | 
271 | 
272 | 
273 | 
274 | <hr />
275 | 
276 | 
277 | ### HtmlPage::getDOMDocument  
278 | 
279 | **Description**
280 | 
281 | ```php
282 | public getDOMDocument (void)
283 | ```
284 | 
285 | Get a DOMDocument object for the HTML document 
286 | 
287 |  
288 | 
289 | **Parameters**
290 | 
291 | `This function has no parameters.`
292 | 
293 | **Return Values**
294 | 
295 | `\DOMDocument`
296 | 
297 | 
298 | 
299 | 
300 | <hr />
301 | 
302 | 
303 | ### HtmlPage::getElementById  
304 | 
305 | **Description**
306 | 
307 | ```php
308 | public getElementById (string $id)
309 | ```
310 | 
311 | Get an element in the document by its id attribute 
312 | 
313 |  
314 | 
315 | **Parameters**
316 | 
317 | * `(string) $id`
318 | 
319 | **Return Values**
320 | 
321 | `\HtmlPageCrawler`
322 | 
323 | 
324 | 
325 | 
326 | <hr />
327 | 
328 | 
329 | ### HtmlPage::getHead  
330 | 
331 | **Description**
332 | 
333 | ```php
334 | public getHead (void)
335 | ```
336 | 
337 | Get the document's HEAD section wrapped in a HtmlPageCrawler instance 
338 | 
339 |  
340 | 
341 | **Parameters**
342 | 
343 | `This function has no parameters.`
344 | 
345 | **Return Values**
346 | 
347 | `\HtmlPageCrawler`
348 | 
349 | 
350 | 
351 | 
352 | <hr />
353 | 
354 | 
355 | ### HtmlPage::getHeadNode  
356 | 
357 | **Description**
358 | 
359 | ```php
360 | public getHeadNode (void)
361 | ```
362 | 
363 | Get the document's HEAD section as DOMElement 
364 | 
365 |  
366 | 
367 | **Parameters**
368 | 
369 | `This function has no parameters.`
370 | 
371 | **Return Values**
372 | 
373 | `\DOMElement`
374 | 
375 | 
376 | 
377 | 
378 | <hr />
379 | 
380 | 
381 | ### HtmlPage::getMeta  
382 | 
383 | **Description**
384 | 
385 | ```php
386 | public getMeta (string $name)
387 | ```
388 | 
389 | Get the content attribute of a meta tag with the specified name attribute 
390 | 
391 |  
392 | 
393 | **Parameters**
394 | 
395 | * `(string) $name`
396 | 
397 | **Return Values**
398 | 
399 | `null|string`
400 | 
401 | 
402 | 
403 | 
404 | <hr />
405 | 
406 | 
407 | ### HtmlPage::getTitle  
408 | 
409 | **Description**
410 | 
411 | ```php
412 | public getTitle (void)
413 | ```
414 | 
415 | Get the page title of the HTML document 
416 | 
417 |  
418 | 
419 | **Parameters**
420 | 
421 | `This function has no parameters.`
422 | 
423 | **Return Values**
424 | 
425 | `null|string`
426 | 
427 | 
428 | 
429 | 
430 | <hr />
431 | 
432 | 
433 | ### HtmlPage::indent  
434 | 
435 | **Description**
436 | 
437 | ```php
438 | public indent (array $options)
439 | ```
440 | 
441 | indent the HTML document 
442 | 
443 |  
444 | 
445 | **Parameters**
446 | 
447 | * `(array) $options`
448 | : Options passed to PrettyMin::__construct()  
449 | 
450 | **Return Values**
451 | 
452 | `\HtmlPage`
453 | 
454 | 
455 | 
456 | 
457 | **Throws Exceptions**
458 | 
459 | 
460 | `\Exception`
461 | 
462 | 
463 | <hr />
464 | 
465 | 
466 | ### HtmlPage::minify  
467 | 
468 | **Description**
469 | 
470 | ```php
471 | public minify (array $options)
472 | ```
473 | 
474 | minify the HTML document 
475 | 
476 |  
477 | 
478 | **Parameters**
479 | 
480 | * `(array) $options`
481 | : Options passed to PrettyMin::__construct()  
482 | 
483 | **Return Values**
484 | 
485 | `\HtmlPage`
486 | 
487 | 
488 | 
489 | 
490 | **Throws Exceptions**
491 | 
492 | 
493 | `\Exception`
494 | 
495 | 
496 | <hr />
497 | 
498 | 
499 | ### HtmlPage::removeMeta  
500 | 
501 | **Description**
502 | 
503 | ```php
504 | public removeMeta (string $name)
505 | ```
506 | 
507 | Remove all meta tags with the specified name attribute 
508 | 
509 |  
510 | 
511 | **Parameters**
512 | 
513 | * `(string) $name`
514 | 
515 | **Return Values**
516 | 
517 | `void`
518 | 
519 | 
520 | <hr />
521 | 
522 | 
523 | ### HtmlPage::save  
524 | 
525 | **Description**
526 | 
527 | ```php
528 | public save (string $filename)
529 | ```
530 | 
531 | Save this document to a HTML file or return HTML code as string 
532 | 
533 |  
534 | 
535 | **Parameters**
536 | 
537 | * `(string) $filename`
538 | : If provided, output will be saved to this file, otherwise returned  
539 | 
540 | **Return Values**
541 | 
542 | `string|void`
543 | 
544 | 
545 | 
546 | 
547 | <hr />
548 | 
549 | 
550 | ### HtmlPage::setBaseHref  
551 | 
552 | **Description**
553 | 
554 | ```php
555 | public setBaseHref (string $url)
556 | ```
557 | 
558 | Set the base tag with href attribute set to parameter $url 
559 | 
560 |  
561 | 
562 | **Parameters**
563 | 
564 | * `(string) $url`
565 | 
566 | **Return Values**
567 | 
568 | `void`
569 | 
570 | 
571 | <hr />
572 | 
573 | 
574 | ### HtmlPage::setHtmlById  
575 | 
576 | **Description**
577 | 
578 | ```php
579 | public setHtmlById (string $elementId, string $html)
580 | ```
581 | 
582 | Sets innerHTML content of an element specified by elementId 
583 | 
584 |  
585 | 
586 | **Parameters**
587 | 
588 | * `(string) $elementId`
589 | * `(string) $html`
590 | 
591 | **Return Values**
592 | 
593 | `void`
594 | 
595 | 
596 | <hr />
597 | 
598 | 
599 | ### HtmlPage::setMeta  
600 | 
601 | **Description**
602 | 
603 | ```php
604 | public setMeta ( $name,  $content)
605 | ```
606 | 
607 | Set a META tag with specified 'name' and 'content' attributes 
608 | 
609 |  
610 | 
611 | **Parameters**
612 | 
613 | * `() $name`
614 | * `() $content`
615 | 
616 | **Return Values**
617 | 
618 | `void`
619 | 
620 | 
621 | <hr />
622 | 
623 | 
624 | ### HtmlPage::setTitle  
625 | 
626 | **Description**
627 | 
628 | ```php
629 | public setTitle (string $title)
630 | ```
631 | 
632 | Sets the page title of the HTML document 
633 | 
634 |  
635 | 
636 | **Parameters**
637 | 
638 | * `(string) $title`
639 | 
640 | **Return Values**
641 | 
642 | `void`
643 | 
644 | 
645 | <hr />
646 | 
647 | 
648 | ### HtmlPage::trimNewlines  
649 | 
650 | **Description**
651 | 
652 | ```php
653 | public static trimNewlines (string $string)
654 | ```
655 | 
656 | remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space) 
657 | 
658 | useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode) 
659 | 
660 | **Parameters**
661 | 
662 | * `(string) $string`
663 | 
664 | **Return Values**
665 | 
666 | `string`
667 | 
668 | 
669 | 
670 | 
671 | <hr />
672 | 
673 | 


--------------------------------------------------------------------------------
/doc/HtmlPageCrawler.md:
--------------------------------------------------------------------------------
   1 | # Wa72\HtmlPageDom\HtmlPageCrawler  
   2 | 
   3 | Extends \Symfony\Component\DomCrawler\Crawler by adding tree manipulation functions
   4 | for HTML documents inspired by jQuery such as setInnerHtml(), css(), append(), prepend(), before(),
   5 | addClass(), removeClass()
   6 | 
   7 | ## Implements:
   8 | Countable, IteratorAggregate, Traversable, Stringable
   9 | 
  10 | ## Extend:
  11 | 
  12 | Symfony\Component\DomCrawler\Crawler
  13 | 
  14 | ## Methods
  15 | 
  16 | | Name | Description |
  17 | |------|-------------|
  18 | |[__clone](#htmlpagecrawler__clone)||
  19 | |[__get](#htmlpagecrawler__get)||
  20 | |[__toString](#htmlpagecrawler__tostring)||
  21 | |[addClass](#htmlpagecrawleraddclass)|Adds the specified class(es) to each element in the set of matched elements.|
  22 | |[addHtmlFragment](#htmlpagecrawleraddhtmlfragment)||
  23 | |[after](#htmlpagecrawlerafter)|Insert content, specified by the parameter, after each element in the set of matched elements.|
  24 | |[append](#htmlpagecrawlerappend)|Insert HTML content as child nodes of each element after existing children|
  25 | |[appendTo](#htmlpagecrawlerappendto)|Insert every element in the set of matched elements to the end of the target.|
  26 | |[before](#htmlpagecrawlerbefore)|Insert content, specified by the parameter, before each element in the set of matched elements.|
  27 | |[create](#htmlpagecrawlercreate)|Get an HtmlPageCrawler object from a HTML string, DOMNode, DOMNodeList or HtmlPageCrawler|
  28 | |[css](#htmlpagecrawlercss)|Get one CSS style property of the first element or set it for all elements in the list|
  29 | |[getAttribute](#htmlpagecrawlergetattribute)|Returns the attribute value of the first node of the list.|
  30 | |[getCombinedText](#htmlpagecrawlergetcombinedtext)|Get the combined text contents of each element in the set of matched elements, including their descendants.|
  31 | |[getDOMDocument](#htmlpagecrawlergetdomdocument)|get ownerDocument of the first element|
  32 | |[getInnerHtml](#htmlpagecrawlergetinnerhtml)|Alias for Crawler::html() for naming consistency with setInnerHtml()|
  33 | |[getStyle](#htmlpagecrawlergetstyle)|get one CSS style property of the first element|
  34 | |[hasClass](#htmlpagecrawlerhasclass)|Determine whether any of the matched elements are assigned the given class.|
  35 | |[insertAfter](#htmlpagecrawlerinsertafter)|Insert every element in the set of matched elements after the target.|
  36 | |[insertBefore](#htmlpagecrawlerinsertbefore)|Insert every element in the set of matched elements before the target.|
  37 | |[isHtmlDocument](#htmlpagecrawlerishtmldocument)|checks whether the first node contains a complete html document (as opposed to a document fragment)|
  38 | |[makeClone](#htmlpagecrawlermakeclone)|Create a deep copy of the set of matched elements.|
  39 | |[makeEmpty](#htmlpagecrawlermakeempty)|Removes all child nodes and text from all nodes in set|
  40 | |[prepend](#htmlpagecrawlerprepend)|Insert content, specified by the parameter, to the beginning of each element in the set of matched elements.|
  41 | |[prependTo](#htmlpagecrawlerprependto)|Insert every element in the set of matched elements to the beginning of the target.|
  42 | |[remove](#htmlpagecrawlerremove)|Remove the set of matched elements from the DOM.|
  43 | |[removeAttr](#htmlpagecrawlerremoveattr)|Remove an attribute from each element in the set of matched elements.|
  44 | |[removeAttribute](#htmlpagecrawlerremoveattribute)|Remove an attribute from each element in the set of matched elements.|
  45 | |[removeClass](#htmlpagecrawlerremoveclass)|Remove a class from each element in the list|
  46 | |[replaceAll](#htmlpagecrawlerreplaceall)|Replace each target element with the set of matched elements.|
  47 | |[replaceWith](#htmlpagecrawlerreplacewith)|Replace each element in the set of matched elements with the provided new content and return the set of elements that was removed.|
  48 | |[saveHTML](#htmlpagecrawlersavehtml)|Get the HTML code fragment of all elements and their contents.|
  49 | |[setAttribute](#htmlpagecrawlersetattribute)|Sets an attribute on each element|
  50 | |[setInnerHtml](#htmlpagecrawlersetinnerhtml)|Set the HTML contents of each element|
  51 | |[setStyle](#htmlpagecrawlersetstyle)|set one CSS style property for all elements in the list|
  52 | |[setText](#htmlpagecrawlersettext)|Set the text contents of the matched elements.|
  53 | |[toggleClass](#htmlpagecrawlertoggleclass)|Add or remove one or more classes from each element in the set of matched elements, depending on the class’s presence.|
  54 | |[unwrap](#htmlpagecrawlerunwrap)|Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place.|
  55 | |[unwrapInner](#htmlpagecrawlerunwrapinner)|Remove the matched elements, but promote the children to take their place.|
  56 | |[wrap](#htmlpagecrawlerwrap)|Wrap an HTML structure around each element in the set of matched elements|
  57 | |[wrapAll](#htmlpagecrawlerwrapall)|Wrap an HTML structure around all elements in the set of matched elements.|
  58 | |[wrapInner](#htmlpagecrawlerwrapinner)|Wrap an HTML structure around the content of each element in the set of matched elements.|
  59 | 
  60 | ## Inherited methods
  61 | 
  62 | | Name | Description |
  63 | |------|-------------|
  64 | | [__construct](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.__construct.php) | - |
  65 | |add|Adds a node to the current list of nodes.|
  66 | |addContent|Adds HTML/XML content.|
  67 | |addDocument|Adds a \DOMDocument to the list of nodes.|
  68 | |addHtmlContent|Adds an HTML content to the list of nodes.|
  69 | |addNode|Adds a \DOMNode instance to the list of nodes.|
  70 | |addNodeList|Adds a \DOMNodeList to the list of nodes.|
  71 | |addNodes|Adds an array of \DOMNode instances to the list of nodes.|
  72 | |addXmlContent|Adds an XML content to the list of nodes.|
  73 | |ancestors|Returns the ancestors of the current selection.|
  74 | |attr|Returns the attribute value of the first node of the list.|
  75 | |children|Returns the children nodes of the current selection.|
  76 | |clear|Removes all the nodes.|
  77 | |closest|Return first parents (heading toward the document root) of the Element that matches the provided selector.|
  78 | | [count](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.count.php) | - |
  79 | |each|Calls an anonymous function on each node of the list.|
  80 | |eq|Returns a node given its position in the node list.|
  81 | |evaluate|Evaluates an XPath expression.|
  82 | |extract|Extracts information from the list of nodes.|
  83 | |filter|Filters the list of nodes with a CSS selector.|
  84 | |filterXPath|Filters the list of nodes with an XPath expression.|
  85 | |first|Returns the first node of the current selection.|
  86 | |form|Returns a Form object for the first node in the list.|
  87 | |getBaseHref|Returns base href.|
  88 | | [getIterator](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.getiterator.php) | - |
  89 | | [getNode](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.getnode.php) | - |
  90 | |getUri|Returns the current URI.|
  91 | |html|Returns the first node of the list as HTML.|
  92 | |image|Returns an Image object for the first node in the list.|
  93 | |images|Returns an array of Image objects for the nodes in the list.|
  94 | |innerText|Returns only the inner text that is the direct descendent of the current node, excluding any child nodes.|
  95 | |last|Returns the last node of the current selection.|
  96 | |link|Returns a Link object for the first node in the list.|
  97 | |links|Returns an array of Link objects for the nodes in the list.|
  98 | | [matches](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.matches.php) | - |
  99 | |nextAll|Returns the next siblings nodes of the current selection.|
 100 | |nodeName|Returns the node name of the first node of the list.|
 101 | | [outerHtml](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.outerhtml.php) | - |
 102 | |previousAll|Returns the previous sibling nodes of the current selection.|
 103 | |reduce|Reduces the list of nodes by calling an anonymous function.|
 104 | | [registerNamespace](https://secure.php.net/manual/en/symfony\component\domcrawler\crawler.registernamespace.php) | - |
 105 | |selectButton|Selects a button by name or alt value for images.|
 106 | |selectImage|Selects images by alt value.|
 107 | |selectLink|Selects links by name or alt value for clickable images.|
 108 | |setDefaultNamespacePrefix|Overloads a default namespace prefix to be used with XPath and CSS expressions.|
 109 | |siblings|Returns the siblings nodes of the current selection.|
 110 | |slice|Slices the list of nodes by $offset and $length.|
 111 | |text|Returns the text of the first node of the list.|
 112 | |xpathLiteral|Converts string for XPath expressions.|
 113 | 
 114 | 
 115 | 
 116 | ### HtmlPageCrawler::__clone  
 117 | 
 118 | **Description**
 119 | 
 120 | ```php
 121 |  __clone (void)
 122 | ```
 123 | 
 124 |  
 125 | 
 126 |  
 127 | 
 128 | **Parameters**
 129 | 
 130 | `This function has no parameters.`
 131 | 
 132 | **Return Values**
 133 | 
 134 | `void`
 135 | 
 136 | 
 137 | <hr />
 138 | 
 139 | 
 140 | ### HtmlPageCrawler::__get  
 141 | 
 142 | **Description**
 143 | 
 144 | ```php
 145 |  __get (void)
 146 | ```
 147 | 
 148 |  
 149 | 
 150 |  
 151 | 
 152 | **Parameters**
 153 | 
 154 | `This function has no parameters.`
 155 | 
 156 | **Return Values**
 157 | 
 158 | `void`
 159 | 
 160 | 
 161 | <hr />
 162 | 
 163 | 
 164 | ### HtmlPageCrawler::__toString  
 165 | 
 166 | **Description**
 167 | 
 168 | ```php
 169 |  __toString (void)
 170 | ```
 171 | 
 172 |  
 173 | 
 174 |  
 175 | 
 176 | **Parameters**
 177 | 
 178 | `This function has no parameters.`
 179 | 
 180 | **Return Values**
 181 | 
 182 | `void`
 183 | 
 184 | 
 185 | <hr />
 186 | 
 187 | 
 188 | ### HtmlPageCrawler::addClass  
 189 | 
 190 | **Description**
 191 | 
 192 | ```php
 193 | public addClass (string $name)
 194 | ```
 195 | 
 196 | Adds the specified class(es) to each element in the set of matched elements. 
 197 | 
 198 |  
 199 | 
 200 | **Parameters**
 201 | 
 202 | * `(string) $name`
 203 | : One or more space-separated classes to be added to the class attribute of each matched element.  
 204 | 
 205 | **Return Values**
 206 | 
 207 | `\HtmlPageCrawler`
 208 | 
 209 | > $this for chaining
 210 | 
 211 | 
 212 | <hr />
 213 | 
 214 | 
 215 | ### HtmlPageCrawler::addHtmlFragment  
 216 | 
 217 | **Description**
 218 | 
 219 | ```php
 220 |  addHtmlFragment (void)
 221 | ```
 222 | 
 223 |  
 224 | 
 225 |  
 226 | 
 227 | **Parameters**
 228 | 
 229 | `This function has no parameters.`
 230 | 
 231 | **Return Values**
 232 | 
 233 | `void`
 234 | 
 235 | 
 236 | <hr />
 237 | 
 238 | 
 239 | ### HtmlPageCrawler::after  
 240 | 
 241 | **Description**
 242 | 
 243 | ```php
 244 | public after (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 245 | ```
 246 | 
 247 | Insert content, specified by the parameter, after each element in the set of matched elements. 
 248 | 
 249 |  
 250 | 
 251 | **Parameters**
 252 | 
 253 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 254 | 
 255 | **Return Values**
 256 | 
 257 | `\HtmlPageCrawler`
 258 | 
 259 | > $this for chaining
 260 | 
 261 | 
 262 | <hr />
 263 | 
 264 | 
 265 | ### HtmlPageCrawler::append  
 266 | 
 267 | **Description**
 268 | 
 269 | ```php
 270 | public append (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 271 | ```
 272 | 
 273 | Insert HTML content as child nodes of each element after existing children 
 274 | 
 275 |  
 276 | 
 277 | **Parameters**
 278 | 
 279 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 280 | : HTML code fragment or DOMNode to append  
 281 | 
 282 | **Return Values**
 283 | 
 284 | `\HtmlPageCrawler`
 285 | 
 286 | > $this for chaining
 287 | 
 288 | 
 289 | <hr />
 290 | 
 291 | 
 292 | ### HtmlPageCrawler::appendTo  
 293 | 
 294 | **Description**
 295 | 
 296 | ```php
 297 | public appendTo (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element)
 298 | ```
 299 | 
 300 | Insert every element in the set of matched elements to the end of the target. 
 301 | 
 302 |  
 303 | 
 304 | **Parameters**
 305 | 
 306 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element`
 307 | 
 308 | **Return Values**
 309 | 
 310 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 311 | 
 312 | > A new Crawler object containing all elements appended to the target elements
 313 | 
 314 | 
 315 | <hr />
 316 | 
 317 | 
 318 | ### HtmlPageCrawler::before  
 319 | 
 320 | **Description**
 321 | 
 322 | ```php
 323 | public before (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 324 | ```
 325 | 
 326 | Insert content, specified by the parameter, before each element in the set of matched elements. 
 327 | 
 328 |  
 329 | 
 330 | **Parameters**
 331 | 
 332 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 333 | 
 334 | **Return Values**
 335 | 
 336 | `\HtmlPageCrawler`
 337 | 
 338 | > $this for chaining
 339 | 
 340 | 
 341 | <hr />
 342 | 
 343 | 
 344 | ### HtmlPageCrawler::create  
 345 | 
 346 | **Description**
 347 | 
 348 | ```php
 349 | public static create (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList|array $content)
 350 | ```
 351 | 
 352 | Get an HtmlPageCrawler object from a HTML string, DOMNode, DOMNodeList or HtmlPageCrawler 
 353 | 
 354 | This is the equivalent to jQuery's $() function when used for wrapping DOMNodes or creating DOMElements from HTML code. 
 355 | 
 356 | **Parameters**
 357 | 
 358 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList|array) $content`
 359 | 
 360 | **Return Values**
 361 | 
 362 | `\HtmlPageCrawler`
 363 | 
 364 | 
 365 | 
 366 | 
 367 | <hr />
 368 | 
 369 | 
 370 | ### HtmlPageCrawler::css  
 371 | 
 372 | **Description**
 373 | 
 374 | ```php
 375 | public css (string $key, null|string $value)
 376 | ```
 377 | 
 378 | Get one CSS style property of the first element or set it for all elements in the list 
 379 | 
 380 | Function is here for compatibility with jQuery; it is the same as getStyle() and setStyle() 
 381 | 
 382 | **Parameters**
 383 | 
 384 | * `(string) $key`
 385 | : The name of the style property  
 386 | * `(null|string) $value`
 387 | : The CSS value to set, or NULL to get the current value  
 388 | 
 389 | **Return Values**
 390 | 
 391 | `\HtmlPageCrawler|string`
 392 | 
 393 | > If $value is NULL, returns the value of the given CSS style property of the first element
 394 | 
 395 | 
 396 | <hr />
 397 | 
 398 | 
 399 | ### HtmlPageCrawler::getAttribute  
 400 | 
 401 | **Description**
 402 | 
 403 | ```php
 404 | public getAttribute (string $name)
 405 | ```
 406 | 
 407 | Returns the attribute value of the first node of the list. 
 408 | 
 409 | This is just an alias for attr() for naming consistency with setAttribute() 
 410 | 
 411 | **Parameters**
 412 | 
 413 | * `(string) $name`
 414 | : The attribute name  
 415 | 
 416 | **Return Values**
 417 | 
 418 | `string|null`
 419 | 
 420 | > The attribute value or null if the attribute does not exist
 421 | 
 422 | 
 423 | **Throws Exceptions**
 424 | 
 425 | 
 426 | `\InvalidArgumentException`
 427 | > When current node is empty
 428 | 
 429 | <hr />
 430 | 
 431 | 
 432 | ### HtmlPageCrawler::getCombinedText  
 433 | 
 434 | **Description**
 435 | 
 436 | ```php
 437 | public getCombinedText (void)
 438 | ```
 439 | 
 440 | Get the combined text contents of each element in the set of matched elements, including their descendants. 
 441 | 
 442 | This is what the jQuery text() function does, contrary to the Crawler::text() method that returns only  
 443 | the text of the first node. 
 444 | 
 445 | **Parameters**
 446 | 
 447 | `This function has no parameters.`
 448 | 
 449 | **Return Values**
 450 | 
 451 | `string`
 452 | 
 453 | 
 454 | 
 455 | 
 456 | <hr />
 457 | 
 458 | 
 459 | ### HtmlPageCrawler::getDOMDocument  
 460 | 
 461 | **Description**
 462 | 
 463 | ```php
 464 | public getDOMDocument (void)
 465 | ```
 466 | 
 467 | get ownerDocument of the first element 
 468 | 
 469 |  
 470 | 
 471 | **Parameters**
 472 | 
 473 | `This function has no parameters.`
 474 | 
 475 | **Return Values**
 476 | 
 477 | `\DOMDocument|null`
 478 | 
 479 | 
 480 | 
 481 | 
 482 | <hr />
 483 | 
 484 | 
 485 | ### HtmlPageCrawler::getInnerHtml  
 486 | 
 487 | **Description**
 488 | 
 489 | ```php
 490 | public getInnerHtml (void)
 491 | ```
 492 | 
 493 | Alias for Crawler::html() for naming consistency with setInnerHtml() 
 494 | 
 495 |  
 496 | 
 497 | **Parameters**
 498 | 
 499 | `This function has no parameters.`
 500 | 
 501 | **Return Values**
 502 | 
 503 | `string`
 504 | 
 505 | 
 506 | 
 507 | 
 508 | <hr />
 509 | 
 510 | 
 511 | ### HtmlPageCrawler::getStyle  
 512 | 
 513 | **Description**
 514 | 
 515 | ```php
 516 | public getStyle (string $key)
 517 | ```
 518 | 
 519 | get one CSS style property of the first element 
 520 | 
 521 |  
 522 | 
 523 | **Parameters**
 524 | 
 525 | * `(string) $key`
 526 | : name of the property  
 527 | 
 528 | **Return Values**
 529 | 
 530 | `string|null`
 531 | 
 532 | > value of the property
 533 | 
 534 | 
 535 | <hr />
 536 | 
 537 | 
 538 | ### HtmlPageCrawler::hasClass  
 539 | 
 540 | **Description**
 541 | 
 542 | ```php
 543 | public hasClass (string $name)
 544 | ```
 545 | 
 546 | Determine whether any of the matched elements are assigned the given class. 
 547 | 
 548 |  
 549 | 
 550 | **Parameters**
 551 | 
 552 | * `(string) $name`
 553 | 
 554 | **Return Values**
 555 | 
 556 | `bool`
 557 | 
 558 | 
 559 | 
 560 | 
 561 | <hr />
 562 | 
 563 | 
 564 | ### HtmlPageCrawler::insertAfter  
 565 | 
 566 | **Description**
 567 | 
 568 | ```php
 569 | public insertAfter (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element)
 570 | ```
 571 | 
 572 | Insert every element in the set of matched elements after the target. 
 573 | 
 574 |  
 575 | 
 576 | **Parameters**
 577 | 
 578 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element`
 579 | 
 580 | **Return Values**
 581 | 
 582 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 583 | 
 584 | > A new Crawler object containing all elements inserted after the target elements
 585 | 
 586 | 
 587 | <hr />
 588 | 
 589 | 
 590 | ### HtmlPageCrawler::insertBefore  
 591 | 
 592 | **Description**
 593 | 
 594 | ```php
 595 | public insertBefore (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element)
 596 | ```
 597 | 
 598 | Insert every element in the set of matched elements before the target. 
 599 | 
 600 |  
 601 | 
 602 | **Parameters**
 603 | 
 604 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element`
 605 | 
 606 | **Return Values**
 607 | 
 608 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 609 | 
 610 | > A new Crawler object containing all elements inserted before the target elements
 611 | 
 612 | 
 613 | <hr />
 614 | 
 615 | 
 616 | ### HtmlPageCrawler::isHtmlDocument  
 617 | 
 618 | **Description**
 619 | 
 620 | ```php
 621 | public isHtmlDocument (void)
 622 | ```
 623 | 
 624 | checks whether the first node contains a complete html document (as opposed to a document fragment) 
 625 | 
 626 |  
 627 | 
 628 | **Parameters**
 629 | 
 630 | `This function has no parameters.`
 631 | 
 632 | **Return Values**
 633 | 
 634 | `bool`
 635 | 
 636 | 
 637 | 
 638 | 
 639 | <hr />
 640 | 
 641 | 
 642 | ### HtmlPageCrawler::makeClone  
 643 | 
 644 | **Description**
 645 | 
 646 | ```php
 647 | public makeClone (void)
 648 | ```
 649 | 
 650 | Create a deep copy of the set of matched elements. 
 651 | 
 652 | Equivalent to clone() in jQuery (clone is not a valid PHP function name) 
 653 | 
 654 | **Parameters**
 655 | 
 656 | `This function has no parameters.`
 657 | 
 658 | **Return Values**
 659 | 
 660 | `\HtmlPageCrawler`
 661 | 
 662 | 
 663 | 
 664 | 
 665 | <hr />
 666 | 
 667 | 
 668 | ### HtmlPageCrawler::makeEmpty  
 669 | 
 670 | **Description**
 671 | 
 672 | ```php
 673 | public makeEmpty (void)
 674 | ```
 675 | 
 676 | Removes all child nodes and text from all nodes in set 
 677 | 
 678 | Equivalent to jQuery's empty() function which is not a valid function name in PHP 
 679 | 
 680 | **Parameters**
 681 | 
 682 | `This function has no parameters.`
 683 | 
 684 | **Return Values**
 685 | 
 686 | `\HtmlPageCrawler`
 687 | 
 688 | > $this
 689 | 
 690 | 
 691 | <hr />
 692 | 
 693 | 
 694 | ### HtmlPageCrawler::prepend  
 695 | 
 696 | **Description**
 697 | 
 698 | ```php
 699 | public prepend (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 700 | ```
 701 | 
 702 | Insert content, specified by the parameter, to the beginning of each element in the set of matched elements. 
 703 | 
 704 |  
 705 | 
 706 | **Parameters**
 707 | 
 708 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 709 | : HTML code fragment  
 710 | 
 711 | **Return Values**
 712 | 
 713 | `\HtmlPageCrawler`
 714 | 
 715 | > $this for chaining
 716 | 
 717 | 
 718 | <hr />
 719 | 
 720 | 
 721 | ### HtmlPageCrawler::prependTo  
 722 | 
 723 | **Description**
 724 | 
 725 | ```php
 726 | public prependTo (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element)
 727 | ```
 728 | 
 729 | Insert every element in the set of matched elements to the beginning of the target. 
 730 | 
 731 |  
 732 | 
 733 | **Parameters**
 734 | 
 735 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element`
 736 | 
 737 | **Return Values**
 738 | 
 739 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 740 | 
 741 | > A new Crawler object containing all elements prepended to the target elements
 742 | 
 743 | 
 744 | <hr />
 745 | 
 746 | 
 747 | ### HtmlPageCrawler::remove  
 748 | 
 749 | **Description**
 750 | 
 751 | ```php
 752 | public remove (void)
 753 | ```
 754 | 
 755 | Remove the set of matched elements from the DOM. 
 756 | 
 757 | (as opposed to Crawler::clear() which detaches the nodes only from Crawler  
 758 | but leaves them in the DOM) 
 759 | 
 760 | **Parameters**
 761 | 
 762 | `This function has no parameters.`
 763 | 
 764 | **Return Values**
 765 | 
 766 | `void`
 767 | 
 768 | 
 769 | <hr />
 770 | 
 771 | 
 772 | ### HtmlPageCrawler::removeAttr  
 773 | 
 774 | **Description**
 775 | 
 776 | ```php
 777 | public removeAttr (string $name)
 778 | ```
 779 | 
 780 | Remove an attribute from each element in the set of matched elements. 
 781 | 
 782 | Alias for removeAttribute for compatibility with jQuery 
 783 | 
 784 | **Parameters**
 785 | 
 786 | * `(string) $name`
 787 | 
 788 | **Return Values**
 789 | 
 790 | `\HtmlPageCrawler`
 791 | 
 792 | 
 793 | 
 794 | 
 795 | <hr />
 796 | 
 797 | 
 798 | ### HtmlPageCrawler::removeAttribute  
 799 | 
 800 | **Description**
 801 | 
 802 | ```php
 803 | public removeAttribute (string $name)
 804 | ```
 805 | 
 806 | Remove an attribute from each element in the set of matched elements. 
 807 | 
 808 |  
 809 | 
 810 | **Parameters**
 811 | 
 812 | * `(string) $name`
 813 | 
 814 | **Return Values**
 815 | 
 816 | `\HtmlPageCrawler`
 817 | 
 818 | 
 819 | 
 820 | 
 821 | <hr />
 822 | 
 823 | 
 824 | ### HtmlPageCrawler::removeClass  
 825 | 
 826 | **Description**
 827 | 
 828 | ```php
 829 | public removeClass (string $name)
 830 | ```
 831 | 
 832 | Remove a class from each element in the list 
 833 | 
 834 |  
 835 | 
 836 | **Parameters**
 837 | 
 838 | * `(string) $name`
 839 | 
 840 | **Return Values**
 841 | 
 842 | `\HtmlPageCrawler`
 843 | 
 844 | > $this for chaining
 845 | 
 846 | 
 847 | <hr />
 848 | 
 849 | 
 850 | ### HtmlPageCrawler::replaceAll  
 851 | 
 852 | **Description**
 853 | 
 854 | ```php
 855 | public replaceAll (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $element)
 856 | ```
 857 | 
 858 | Replace each target element with the set of matched elements. 
 859 | 
 860 |  
 861 | 
 862 | **Parameters**
 863 | 
 864 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $element`
 865 | 
 866 | **Return Values**
 867 | 
 868 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 869 | 
 870 | > A new Crawler object containing all elements that replaced the target elements
 871 | 
 872 | 
 873 | <hr />
 874 | 
 875 | 
 876 | ### HtmlPageCrawler::replaceWith  
 877 | 
 878 | **Description**
 879 | 
 880 | ```php
 881 | public replaceWith (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 882 | ```
 883 | 
 884 | Replace each element in the set of matched elements with the provided new content and return the set of elements that was removed. 
 885 | 
 886 |  
 887 | 
 888 | **Parameters**
 889 | 
 890 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 891 | 
 892 | **Return Values**
 893 | 
 894 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
 895 | 
 896 | > $this for chaining
 897 | 
 898 | 
 899 | <hr />
 900 | 
 901 | 
 902 | ### HtmlPageCrawler::saveHTML  
 903 | 
 904 | **Description**
 905 | 
 906 | ```php
 907 | public saveHTML (void)
 908 | ```
 909 | 
 910 | Get the HTML code fragment of all elements and their contents. 
 911 | 
 912 | If the first node contains a complete HTML document return only  
 913 | the full code of this document. 
 914 | 
 915 | **Parameters**
 916 | 
 917 | `This function has no parameters.`
 918 | 
 919 | **Return Values**
 920 | 
 921 | `string`
 922 | 
 923 | > HTML code (fragment)
 924 | 
 925 | 
 926 | <hr />
 927 | 
 928 | 
 929 | ### HtmlPageCrawler::setAttribute  
 930 | 
 931 | **Description**
 932 | 
 933 | ```php
 934 | public setAttribute (string $name, string $value)
 935 | ```
 936 | 
 937 | Sets an attribute on each element 
 938 | 
 939 |  
 940 | 
 941 | **Parameters**
 942 | 
 943 | * `(string) $name`
 944 | * `(string) $value`
 945 | 
 946 | **Return Values**
 947 | 
 948 | `\HtmlPageCrawler`
 949 | 
 950 | > $this for chaining
 951 | 
 952 | 
 953 | <hr />
 954 | 
 955 | 
 956 | ### HtmlPageCrawler::setInnerHtml  
 957 | 
 958 | **Description**
 959 | 
 960 | ```php
 961 | public setInnerHtml (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
 962 | ```
 963 | 
 964 | Set the HTML contents of each element 
 965 | 
 966 |  
 967 | 
 968 | **Parameters**
 969 | 
 970 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
 971 | : HTML code fragment  
 972 | 
 973 | **Return Values**
 974 | 
 975 | `\HtmlPageCrawler`
 976 | 
 977 | > $this for chaining
 978 | 
 979 | 
 980 | <hr />
 981 | 
 982 | 
 983 | ### HtmlPageCrawler::setStyle  
 984 | 
 985 | **Description**
 986 | 
 987 | ```php
 988 | public setStyle (string $key, string $value)
 989 | ```
 990 | 
 991 | set one CSS style property for all elements in the list 
 992 | 
 993 |  
 994 | 
 995 | **Parameters**
 996 | 
 997 | * `(string) $key`
 998 | : name of the property  
 999 | * `(string) $value`
1000 | : value of the property  
1001 | 
1002 | **Return Values**
1003 | 
1004 | `\HtmlPageCrawler`
1005 | 
1006 | > $this for chaining
1007 | 
1008 | 
1009 | <hr />
1010 | 
1011 | 
1012 | ### HtmlPageCrawler::setText  
1013 | 
1014 | **Description**
1015 | 
1016 | ```php
1017 | public setText (string $text)
1018 | ```
1019 | 
1020 | Set the text contents of the matched elements. 
1021 | 
1022 |  
1023 | 
1024 | **Parameters**
1025 | 
1026 | * `(string) $text`
1027 | 
1028 | **Return Values**
1029 | 
1030 | `\HtmlPageCrawler`
1031 | 
1032 | 
1033 | 
1034 | 
1035 | <hr />
1036 | 
1037 | 
1038 | ### HtmlPageCrawler::toggleClass  
1039 | 
1040 | **Description**
1041 | 
1042 | ```php
1043 | public toggleClass (string $classname)
1044 | ```
1045 | 
1046 | Add or remove one or more classes from each element in the set of matched elements, depending on the class’s presence. 
1047 | 
1048 |  
1049 | 
1050 | **Parameters**
1051 | 
1052 | * `(string) $classname`
1053 | : One or more classnames separated by spaces  
1054 | 
1055 | **Return Values**
1056 | 
1057 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
1058 | 
1059 | > $this for chaining
1060 | 
1061 | 
1062 | <hr />
1063 | 
1064 | 
1065 | ### HtmlPageCrawler::unwrap  
1066 | 
1067 | **Description**
1068 | 
1069 | ```php
1070 | public unwrap (void)
1071 | ```
1072 | 
1073 | Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place. 
1074 | 
1075 |  
1076 | 
1077 | **Parameters**
1078 | 
1079 | `This function has no parameters.`
1080 | 
1081 | **Return Values**
1082 | 
1083 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
1084 | 
1085 | > $this for chaining
1086 | 
1087 | 
1088 | <hr />
1089 | 
1090 | 
1091 | ### HtmlPageCrawler::unwrapInner  
1092 | 
1093 | **Description**
1094 | 
1095 | ```php
1096 | public unwrapInner (void)
1097 | ```
1098 | 
1099 | Remove the matched elements, but promote the children to take their place. 
1100 | 
1101 |  
1102 | 
1103 | **Parameters**
1104 | 
1105 | `This function has no parameters.`
1106 | 
1107 | **Return Values**
1108 | 
1109 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
1110 | 
1111 | > $this for chaining
1112 | 
1113 | 
1114 | <hr />
1115 | 
1116 | 
1117 | ### HtmlPageCrawler::wrap  
1118 | 
1119 | **Description**
1120 | 
1121 | ```php
1122 | public wrap (string|\HtmlPageCrawler|\DOMNode $wrappingElement)
1123 | ```
1124 | 
1125 | Wrap an HTML structure around each element in the set of matched elements 
1126 | 
1127 | The HTML structure must contain only one root node, e.g.:  
1128 | Works: `<div><div></div></div>`  
1129 | Does not work: `<div></div><div></div>` 
1130 | 
1131 | **Parameters**
1132 | 
1133 | * `(string|\HtmlPageCrawler|\DOMNode) $wrappingElement`
1134 | 
1135 | **Return Values**
1136 | 
1137 | `\HtmlPageCrawler`
1138 | 
1139 | > $this for chaining
1140 | 
1141 | 
1142 | <hr />
1143 | 
1144 | 
1145 | ### HtmlPageCrawler::wrapAll  
1146 | 
1147 | **Description**
1148 | 
1149 | ```php
1150 | public wrapAll (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
1151 | ```
1152 | 
1153 | Wrap an HTML structure around all elements in the set of matched elements. 
1154 | 
1155 |  
1156 | 
1157 | **Parameters**
1158 | 
1159 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
1160 | 
1161 | **Return Values**
1162 | 
1163 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
1164 | 
1165 | > $this for chaining
1166 | 
1167 | 
1168 | **Throws Exceptions**
1169 | 
1170 | 
1171 | `\LogicException`
1172 | 
1173 | 
1174 | <hr />
1175 | 
1176 | 
1177 | ### HtmlPageCrawler::wrapInner  
1178 | 
1179 | **Description**
1180 | 
1181 | ```php
1182 | public wrapInner (string|\HtmlPageCrawler|\DOMNode|\DOMNodeList $content)
1183 | ```
1184 | 
1185 | Wrap an HTML structure around the content of each element in the set of matched elements. 
1186 | 
1187 |  
1188 | 
1189 | **Parameters**
1190 | 
1191 | * `(string|\HtmlPageCrawler|\DOMNode|\DOMNodeList) $content`
1192 | 
1193 | **Return Values**
1194 | 
1195 | `\Wa72\HtmlPageDom\HtmlPageCrawler`
1196 | 
1197 | > $this for chaining
1198 | 
1199 | 
1200 | <hr />
1201 | 
1202 | 


--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | # Wa72\HtmlPageDom
2 | 
3 | * [HtmlPage](HtmlPage.md) 
4 | * [HtmlPageCrawler](HtmlPageCrawler.md) 
5 | 


--------------------------------------------------------------------------------
/phpunit.xml.dist:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" colors="true" bootstrap="./Tests/phpunit_bootstrap.php" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.3/phpunit.xsd">
 3 |   <coverage>
 4 |     <include>
 5 |       <directory suffix=".php">./src/</directory>
 6 |     </include>
 7 |   </coverage>
 8 |   <testsuites>
 9 |     <testsuite name="HtmlPageDom Test Suite">
10 |       <directory suffix="Test.php">./Tests/</directory>
11 |     </testsuite>
12 |   </testsuites>
13 | </phpunit>
14 | 


--------------------------------------------------------------------------------
/src/Helpers.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | namespace Wa72\HtmlPageDom;
  3 | 
  4 | /**
  5 |  * Static helper functions for HtmlPageDom
  6 |  *
  7 |  * @package Wa72\HtmlPageDom
  8 |  */
  9 | class Helpers {
 10 |     /**
 11 |      * Collapse a string onto a single line.
 12 |      *
 13 |      * Line feeds and carriage returns become spaces, every run of whitespace is
 14 |      * reduced to one space and the result is trimmed. Handy for tidying the text
 15 |      * obtained from HtmlPageCrawler::text() (the nodeValue of a DOMNode).
 16 |      *
 17 |      * @param string $string
 18 |      * @return string
 19 |      */
 20 |     public static function trimNewlines($string)
 21 |     {
 22 |         $singleLine = str_replace(["\n", "\r"], ' ', $string);
 23 |         return trim(preg_replace('/\s+/', ' ', $singleLine));
 24 |     }
 25 | 
 26 |     /**
 27 |      * Parse a CSS declaration list into an associative array.
 28 |      *
 29 |      * Statements are split on ";"; empty statements and statements without a
 30 |      * property name before the first ":" are silently ignored.
 31 |      *
 32 |      * @param string $css list of CSS properties separated by ;
 33 |      * @return array name=>value pairs of CSS properties
 34 |      */
 35 |     public static function cssStringToArray($css)
 36 |     {
 37 |         $declarations = [];
 38 |         foreach (explode(';', preg_replace('/\s+/s', ' ', $css)) as $part) {
 39 |             $part = trim($part);
 40 |             $colonAt = strpos($part, ':');
 41 |             // false (no colon at all) and 0 (nothing before the colon) are both invalid
 42 |             if ('' === $part || false === $colonAt || 0 === $colonAt) {
 43 |                 continue;
 44 |             }
 45 |             $name = trim(substr($part, 0, $colonAt));
 46 |             $declarations[$name] = trim(substr($part, $colonAt + 1));
 47 |         }
 48 |         return $declarations;
 49 |     }
 50 | 
 51 |     /**
 52 |      * Serialize a name=>value array of CSS properties into a declaration list.
 53 |      *
 54 |      * @param array $array name=>value pairs of CSS properties
 55 |      * @return string list of CSS properties, each rendered as "name: value;"
 56 |      */
 57 |     public static function cssArrayToString($array)
 58 |     {
 59 |         $css = '';
 60 |         foreach ($array as $property => $setting) {
 61 |             $css .= "{$property}: {$setting};";
 62 |         }
 63 |         return $css;
 64 |     }
 65 | 
 66 |     /**
 67 |      * Wrap an HTML fragment in <html><body>...</body></html>, parse it and hand
 68 |      * back the resulting body element.
 69 |      *
 70 |      * @param string $html A fragment of HTML code
 71 |      * @param string $charset
 72 |      * @return \DOMNode The body node containing child nodes created from the HTML fragment
 73 |      */
 74 |     public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
 75 |     {
 76 |         $document = self::loadHtml('<html><body>' . $html . '</body></html>', $charset);
 77 |         return $document->getElementsByTagName('body')->item(0);
 78 |     }
 79 | 
 80 |     /**
 81 |      * Parse an HTML string into a DOMDocument (public entry point for parseXhtml()).
 82 |      */
 83 |     public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
 84 |     {
 85 |         return self::parseXhtml($html, $charset);
 86 |     }
 87 | 
 88 |     /**
 89 |      * Function originally taken from Symfony\Component\DomCrawler\Crawler
 90 |      * (c) Fabien Potencier <fabien@symfony.com>
 91 |      * License: MIT
 92 |      */
 93 |     private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
 94 |     {
 95 |         $encoded = self::convertToHtmlEntities($htmlContent, $charset);
 96 |         $previousErrorMode = libxml_use_internal_errors(true);
 97 | 
 98 |         $document = new \DOMDocument('1.0', $charset);
 99 |         $document->validateOnParse = true;
100 | 
101 |         if ('' !== trim($encoded)) {
102 |             // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
103 |             // Option LIBXML_SCHEMA_CREATE seems to prevent this
104 |             // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
105 |             @$document->loadHTML($encoded, \LIBXML_SCHEMA_CREATE);
106 |         }
107 | 
108 |         libxml_use_internal_errors($previousErrorMode); // put back the caller's libxml error mode
109 |         return $document;
110 |     }
111 | 
112 |     /**
113 |      * Converts charset to HTML-entities to ensure valid parsing.
114 |      * Function taken from Symfony\Component\DomCrawler\Crawler
115 |      * (c) Fabien Potencier <fabien@symfony.com>
116 |      * License: MIT
117 |      */
118 |     private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
119 |     {
120 |         $nonAscii = [0x80, 0x10FFFF, 0, 0x1FFFFF];
121 |         set_error_handler(function () { throw new \Exception(); }); // make PHP errors raised below catchable
122 | 
123 |         try {
124 |             return mb_encode_numericentity($htmlContent, $nonAscii, $charset);
125 |         } catch (\Exception|\ValueError) { // direct encoding failed: retry after converting to UTF-8
126 |             try {
127 |                 $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
128 |                 $htmlContent = mb_encode_numericentity($htmlContent, $nonAscii, 'UTF-8');
129 |             } catch (\Exception|\ValueError) {
130 |             } // on a second failure $htmlContent is returned as it stands at that point
131 |             return $htmlContent;
132 |         } finally {
133 |             restore_error_handler();
134 |         }
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/src/HtmlPage.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | namespace Wa72\HtmlPageDom;
  3 | 
  4 | use Symfony\Component\CssSelector\CssSelector;
  5 | use Wa72\HtmlPrettymin\PrettyMin;
  6 | 
  7 | /**
  8 |  * This class represents a complete HTML document.
  9 |  *
 10 |  * It offers convenience functions for getting and setting elements of the document
 11 |  * such as setTitle(), getTitle(), setMeta($name, $value), getBody().
 12 |  *
 13 |  * It uses HtmlPageCrawler to navigate and manipulate the DOM tree.
 14 |  *
 15 |  * @author Christoph Singer
 16 |  * @license MIT
 17 |  */
 18 | class HtmlPage
 19 | {
 20 |     /**
 21 |      * The DOM document all methods of this class operate on
 22 |      *
 23 |      * @var \DOMDocument
 24 |      */
 25 |     protected $dom;
 26 | 
 27 |     /**
 28 |      * Charset that was passed to the constructor
 29 |      *
 30 |      * @var string
 31 |      */
 32 |     protected $charset;
 33 | 
 34 |     /**
 35 |      * URL that was passed to the constructor (stored only, this class does not read it)
 36 |      *
 37 |      * @var string
 38 |      */
 39 |     protected $url;
 40 | 
 41 |     /**
 42 |      * Crawler wrapping the DOM document, used by filter() and filterXPath()
 43 |      *
 44 |      * @var HtmlPageCrawler
 45 |      */
 46 |     protected $crawler;
 47 | 
 48 |     /**
 49 |      * Create a page object from HTML code
 50 |      *
 51 |      * If $content is empty, a minimal empty HTML document is used instead.
 52 |      *
 53 |      * @param string $content HTML code of the document
 54 |      * @param string $url URL of the document
 55 |      * @param string $charset Charset of $content, passed on to Helpers::loadHtml()
 56 |      */
 57 |     public function __construct($content = '', $url = '', $charset = 'UTF-8')
 58 |     {
 59 |         $this->charset = $charset;
 60 |         $this->url = $url;
 61 |         if ((string) $content === '') {
 62 |             $content = '<!DOCTYPE html><html><head><title></title></head><body></body></html>';
 63 |         }
 64 |         $this->dom = Helpers::loadHtml($content, $charset);
 65 |         $this->crawler = new HtmlPageCrawler($this->dom);
 66 |     }
 67 | 
 68 |     /**
 69 |      * Get a HtmlPageCrawler object containing the root node of the HTML document
 70 |      *
 71 |      * @return HtmlPageCrawler
 72 |      */
 73 |     public function getCrawler()
 74 |     {
 75 |         return $this->crawler;
 76 |     }
 77 | 
 78 |     /**
 79 |      * Get a DOMDocument object for the HTML document
 80 |      *
 81 |      * @return \DOMDocument
 82 |      */
 83 |     public function getDOMDocument()
 84 |     {
 85 |         return $this->dom;
 86 |     }
 87 | 
 88 |     /**
 89 |      * Sets the page title of the HTML document
 90 |      *
 91 |      * A title element is appended to the head section if the document has none yet.
 92 |      *
 93 |      * @param string $title
 94 |      */
 95 |     public function setTitle($title)
 96 |     {
 97 |         $t = $this->dom->getElementsByTagName('title')->item(0);
 98 |         if ($t === null) {
 99 |             $t = $this->dom->createElement('title');
100 |             $this->getHeadNode()->appendChild($t);
101 |         }
102 |         // NOTE(review): escaped presumably because assigning nodeValue interprets entity references - confirm
103 |         $t->nodeValue = htmlspecialchars($title);
104 |     }
105 | 
106 |     /**
107 |      * Get the page title of the HTML document
108 |      *
109 |      * @return null|string null if the document has no title element
110 |      */
111 |     public function getTitle()
112 |     {
113 |         $t = $this->dom->getElementsByTagName('title')->item(0);
114 | 
115 |         return $t === null ? null : $t->nodeValue;
116 |     }
117 | 
118 |     /**
119 |      * Set a META tag with specified 'name' and 'content' attributes
120 |      *
121 |      * If no meta tag with this name exists yet, a new one is appended to the head section.
122 |      *
123 |      * @TODO: add support for multiple meta tags with the same name but different languages
124 |      *
125 |      * @param string $name
126 |      * @param string $content
127 |      */
128 |     public function setMeta($name, $content)
129 |     {
130 |         $c = $this->filterMeta($name);
131 |         if (count($c) === 0) {
132 |             $node = $this->dom->createElement('meta');
133 |             $node->setAttribute('name', $name);
134 |             $this->getHeadNode()->appendChild($node);
135 |             $c->addNode($node);
136 |         }
137 |         $c->setAttribute('content', $content);
138 |     }
139 | 
140 |     /**
141 |      * Remove all meta tags with the specified name attribute
142 |      *
143 |      * @param string $name
144 |      */
145 |     public function removeMeta($name)
146 |     {
147 |         $this->filterMeta($name)->remove();
148 |     }
149 | 
150 |     /**
151 |      * Get the content attribute of a meta tag with the specified name attribute
152 |      *
153 |      * @param string $name
154 |      * @return null|string null if no meta tag with this name exists
155 |      */
156 |     public function getMeta($name)
157 |     {
158 |         $node = $this->filterMeta($name)->getNode(0);
159 | 
160 |         return $node instanceof \DOMElement ? $node->getAttribute('content') : null;
161 |     }
162 | 
163 |     /**
164 |      * Find all meta tags whose name attribute equals $name
165 |      *
166 |      * The name is embedded as a quoted XPath string literal, so that quote
167 |      * characters in $name cannot break or alter the XPath expression.
168 |      *
169 |      * @param string $name
170 |      * @return HtmlPageCrawler
171 |      */
172 |     private function filterMeta($name)
173 |     {
174 |         $literal = HtmlPageCrawler::xpathLiteral((string) $name);
175 | 
176 |         return $this->filterXPath('descendant-or-self::meta[@name = ' . $literal . ']');
177 |     }
178 | 
179 |     /**
180 |      * Set the base tag with href attribute set to parameter $url
181 |      *
182 |      * A base element is appended to the head section if the document has none yet.
183 |      *
184 |      * @param string $url
185 |      */
186 |     public function setBaseHref($url)
187 |     {
188 |         $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
189 |         if ($node === null) {
190 |             $node = $this->dom->createElement('base');
191 |             $this->getHeadNode()->appendChild($node);
192 |         }
193 |         $node->setAttribute('href', $url);
194 |     }
195 | 
196 |     /**
197 |      * Get the href attribute from the base tag, null if not present in document
198 |      *
199 |      * @return null|string
200 |      */
201 |     public function getBaseHref()
202 |     {
203 |         $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
204 | 
205 |         return $node instanceof \DOMElement ? $node->getAttribute('href') : null;
206 |     }
207 | 
208 |     /**
209 |      * Sets innerHTML content of an element specified by elementId
210 |      *
211 |      * @param string $elementId
212 |      * @param string $html
213 |      */
214 |     public function setHtmlById($elementId, $html)
215 |     {
216 |         $this->getElementById($elementId)->setInnerHtml($html);
217 |     }
218 | 
219 |     /**
220 |      * Get the document's HEAD section as DOMElement
221 |      *
222 |      * A head element is inserted before the body if the document has none.
223 |      *
224 |      * @return \DOMElement
225 |      */
226 |     public function getHeadNode()
227 |     {
228 |         $head = $this->dom->getElementsByTagName('head')->item(0);
229 |         if ($head === null) {
230 |             $head = $this->dom->createElement('head');
231 |             $head = $this->dom->documentElement->insertBefore($head, $this->getBodyNode());
232 |         }
233 |         return $head;
234 |     }
235 | 
236 |     /**
237 |      * Get the document's body as DOMElement
238 |      *
239 |      * A body element is appended to the document element if the document has none.
240 |      *
241 |      * @return \DOMElement
242 |      */
243 |     public function getBodyNode()
244 |     {
245 |         $body = $this->dom->getElementsByTagName('body')->item(0);
246 |         if ($body === null) {
247 |             $body = $this->dom->createElement('body');
248 |             $body = $this->dom->documentElement->appendChild($body);
249 |         }
250 |         return $body;
251 |     }
252 | 
253 |     /**
254 |      * Get the document's HEAD section wrapped in a HtmlPageCrawler instance
255 |      *
256 |      * @return HtmlPageCrawler
257 |      */
258 |     public function getHead()
259 |     {
260 |         return new HtmlPageCrawler($this->getHeadNode());
261 |     }
262 | 
263 |     /**
264 |      * Get the document's body wrapped in a HtmlPageCrawler instance
265 |      *
266 |      * @return HtmlPageCrawler
267 |      */
268 |     public function getBody()
269 |     {
270 |         return new HtmlPageCrawler($this->getBodyNode());
271 |     }
272 | 
273 |     /**
274 |      * Get the HTML code of the whole document, as produced by DOMDocument::saveHTML()
275 |      *
276 |      * @return string
277 |      */
278 |     public function __toString()
279 |     {
280 |         return $this->dom->saveHTML();
281 |     }
282 | 
283 |     /**
284 |      * Save this document to a HTML file or return HTML code as string
285 |      *
286 |      * @param string $filename If provided, output will be saved to this file, otherwise returned
287 |      * @return string|void
288 |      */
289 |     public function save($filename = '')
290 |     {
291 |         if ((string) $filename !== '') {
292 |             file_put_contents($filename, (string) $this);
293 |             return;
294 |         }
295 |         return (string) $this;
296 |     }
297 | 
298 |     /**
299 |      * Get an element in the document by its id attribute
300 |      *
301 |      * The id is embedded as a quoted XPath string literal, so that quote
302 |      * characters in $id cannot break or alter the XPath expression.
303 |      *
304 |      * @param string $id
305 |      * @return HtmlPageCrawler
306 |      */
307 |     public function getElementById($id)
308 |     {
309 |         $literal = HtmlPageCrawler::xpathLiteral((string) $id);
310 | 
311 |         return $this->filterXPath('descendant-or-self::*[@id = ' . $literal . ']');
312 |     }
313 | 
314 |     /**
315 |      * Filter nodes by using a CSS selector
316 |      *
317 |      * @param string $selector CSS selector
318 |      * @return HtmlPageCrawler
319 |      */
320 |     public function filter($selector)
321 |     {
322 |         return $this->crawler->filter($selector);
323 |     }
324 | 
325 |     /**
326 |      * Filter nodes by XPath expression
327 |      *
328 |      * @param string $xpath XPath expression
329 |      * @return HtmlPageCrawler
330 |      */
331 |     public function filterXPath($xpath)
332 |     {
333 |         return $this->crawler->filterXPath($xpath);
334 |     }
335 | 
336 |     /**
337 |      * remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space)
338 |      *
339 |      * useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode)
340 |      *
341 |      * @param string $string
342 |      * @return string
343 |      */
344 |     public static function trimNewlines($string)
345 |     {
346 |         return Helpers::trimNewlines($string);
347 |     }
348 | 
349 |     /**
350 |      * Deep-copy the DOM document and wrap the copy in a new crawler when the page is cloned
351 |      */
352 |     public function __clone()
353 |     {
354 |         $this->dom = $this->dom->cloneNode(true);
355 |         $this->crawler = new HtmlPageCrawler($this->dom);
356 |     }
357 | 
358 |     /**
359 |      * minify the HTML document
360 |      *
361 |      * @param array $options Options passed to PrettyMin::__construct()
362 |      * @return HtmlPage $this for chaining
363 |      * @throws \Exception if the class Wa72\HtmlPrettymin\PrettyMin is not available
364 |      */
365 |     public function minify(array $options = [])
366 |     {
367 |         if (!class_exists(PrettyMin::class)) {
368 |             throw new \Exception('Function minify needs composer package wa72/html-pretty-min');
369 |         }
370 |         $pm = new PrettyMin($options);
371 |         $pm->load($this->dom)->minify();
372 |         return $this;
373 |     }
374 | 
375 |     /**
376 |      * indent the HTML document
377 |      *
378 |      * @param array $options Options passed to PrettyMin::__construct()
379 |      * @return HtmlPage $this for chaining
380 |      * @throws \Exception if the class Wa72\HtmlPrettymin\PrettyMin is not available
381 |      */
382 |     public function indent(array $options = [])
383 |     {
384 |         if (!class_exists(PrettyMin::class)) {
385 |             throw new \Exception('Function indent needs composer package wa72/html-pretty-min');
386 |         }
387 |         $pm = new PrettyMin($options);
388 |         $pm->load($this->dom)->indent();
389 |         return $this;
390 |     }
391 | }
392 | 


--------------------------------------------------------------------------------
/src/HtmlPageCrawler.php:
--------------------------------------------------------------------------------
   1 | <?php
   2 | namespace Wa72\HtmlPageDom;
   3 | 
   4 | use Symfony\Component\DomCrawler\Crawler;
   5 | 
   6 | /**
   7 |  * Extends \Symfony\Component\DomCrawler\Crawler by adding tree manipulation functions
   8 |  * for HTML documents inspired by jQuery such as setInnerHtml(), css(), append(), prepend(), before(),
   9 |  * addClass(), removeClass()
  10 |  *
  11 |  * @author Christoph Singer
  12 |  * @license MIT
  13 |  *
  14 |  */
  15 | class HtmlPageCrawler extends Crawler
  16 | {
  17 |     /**
  18 |      * the (internal) root element name used when importing html fragments
  19 |      * */
  20 |     const FRAGMENT_ROOT_TAGNAME = '_root';
  21 | 
  22 |     /**
  23 |      * Wrap the given content in a HtmlPageCrawler object
  24 |      *
  25 |      * Accepts a HTML string, DOMNode, DOMNodeList, array or HtmlPageCrawler. An object that
  26 |      * already is a HtmlPageCrawler is handed back as it is; everything else is passed to the
  27 |      * constructor of a new HtmlPageCrawler.
  28 |      *
  29 |      * This is the equivalent to jQuery's $() function when used for wrapping DOMNodes or creating DOMElements from HTML code.
  30 |      *
  31 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList|array $content
  32 |      * @return HtmlPageCrawler
  33 |      * @api
  34 |      */
  35 |     public static function create($content)
  36 |     {
  37 |         return $content instanceof HtmlPageCrawler ? $content : new HtmlPageCrawler($content);
  38 |     }
  39 | 
  40 |     /**
  41 |      * Adds the specified class(es) to each element in the set of matched elements.
  42 |      *
  43 |      * Classes an element already has are not added a second time, and the class
  44 |      * attribute is only rewritten for elements that actually receive a new class.
  45 |      *
  46 |      * @param string $name One or more space-separated classes to be added to the class attribute of each matched element.
  47 |      * @return HtmlPageCrawler $this for chaining
  48 |      * @api
  49 |      */
  50 |     public function addClass($name)
  51 |     {
  52 |         // split the argument into single class names, as it may contain several of them
  53 |         $wanted = array_unique(preg_split('/\s+/', (string) $name, -1, PREG_SPLIT_NO_EMPTY));
  54 |         foreach ($this as $node) {
  55 |             if (!$node instanceof \DOMElement) {
  56 |                 continue;
  57 |             }
  58 |             /** @var \DOMElement $node */
  59 |             $classes = preg_split('/\s+/', $node->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
  60 |             // array_diff compares the names as strings, so e.g. "1e1" and "10" stay distinct
  61 |             $missing = array_diff($wanted, $classes);
  62 |             if (count($missing) > 0) {
  63 |                 $node->setAttribute('class', join(' ', array_merge($classes, $missing)));
  64 |             }
  65 |         }
  66 |         return $this;
  67 |     }
  68 | 
  69 |     /**
  70 |      * Insert content, specified by the parameter, after each element in the set of matched elements.
  71 |      *
  72 |      * If $content is a HtmlPageCrawler, it afterwards contains the nodes that were inserted.
  73 |      *
  74 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
  75 |      * @return HtmlPageCrawler $this for chaining
  76 |      * @api
  77 |      */
  78 |     public function after($content)
  79 |     {
  80 |         $content = self::create($content);
  81 |         $inserted = [];
  82 |         foreach ($this as $index => $target) {
  83 |             $following = $target->nextSibling;
  84 |             foreach ($content as $item) {
  85 |                 $item = static::importNewnode($item, $target, $index);
  86 |                 if ($following !== null) {
  87 |                     $target->parentNode->insertBefore($item, $following);
  88 |                 } else {
  89 |                     $target->parentNode->appendChild($item);
  90 |                 }
  91 |                 $inserted[] = $item;
  92 |             }
  93 |         }
  94 |         $content->clear();
  95 |         $content->add($inserted);
  96 |         return $this;
  97 |     }
  98 | 
  99 |     /**
 100 |      * Insert HTML content as child nodes of each element after existing children
 101 |      *
 102 |      * If $content is a HtmlPageCrawler, it afterwards contains the nodes that were appended.
 103 |      *
 104 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment or DOMNode to append
 105 |      * @return HtmlPageCrawler $this for chaining
 106 |      * @api
 107 |      */
 108 |     public function append($content)
 109 |     {
 110 |         $content = self::create($content);
 111 |         $appended = [];
 112 |         foreach ($this as $index => $parent) {
 113 |             foreach ($content as $child) {
 114 |                 $child = static::importNewnode($child, $parent, $index);
 115 |                 $parent->appendChild($child);
 116 |                 $appended[] = $child;
 117 |             }
 118 |         }
 119 |         $content->clear();
 120 |         $content->add($appended);
 121 |         return $this;
 122 |     }
 123 | 
 124 |     /**
 125 |      * Insert every element in the set of matched elements to the end of the target.
 126 |      *
 127 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element the target element(s)
 128 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements appended to the target elements
 129 |      * @api
 130 |      */
 131 |     public function appendTo($element)
 132 |     {
 133 |         $targets = self::create($element);
 134 |         $appended = [];
 135 |         foreach ($targets as $index => $target) {
 136 |             foreach ($this as $item) {
 137 |                 /** @var \DOMNode $item */
 138 |                 if ($item !== $target) {
 139 |                     $item = static::importNewnode($item, $target, $index);
 140 |                     $target->appendChild($item);
 141 |                 }
 142 |                 // a node is never appended to itself, but it is still part of the result
 143 |                 $appended[] = $item;
 144 |             }
 145 |         }
 146 |         return self::create($appended);
 147 |     }
 148 | 
 149 |     /**
 150 |      * Sets an attribute on each element (nodes that are not elements are skipped)
 151 |      *
 152 |      * @param string $name
 153 |      * @param string $value
 154 |      * @return HtmlPageCrawler $this for chaining
 155 |      * @api
 156 |      */
 157 |     public function setAttribute($name, $value)
 158 |     {
 159 |         foreach ($this as $element) {
 160 |             if (!$element instanceof \DOMElement) {
 161 |                 continue;
 162 |             }
 163 |             $element->setAttribute($name, $value);
 164 |         }
 165 |         return $this;
 166 |     }
 167 | 
 168 |     /**
 169 |      * Get an attribute value from the first node of the list; delegates to the parent's attr()
 170 |      *
 171 |      * @param string $name The attribute name
 172 |      * @return string|null The attribute value or null if the attribute does not exist
 173 |      * @throws \InvalidArgumentException When current node is empty
 174 |      */
 175 |     public function getAttribute($name)
 176 |     {
 177 |         $value = parent::attr($name);
 178 |         return $value;
 179 |     }
 180 | 
 181 |     /**
 182 |      * Insert content, specified by the parameter, before each element in the set of matched elements.
 183 |      * If $content is a HtmlPageCrawler, it afterwards contains the nodes that were inserted.
 184 |      *
 185 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
 186 |      * @return HtmlPageCrawler $this for chaining
 187 |      * @api
 188 |      */
 189 |     public function before($content)
 190 |     {
 191 |         $content = self::create($content);
 192 |         $inserted = [];
 193 |         foreach ($this as $index => $target) {
 194 |             foreach ($content as $item) {
 195 |                 if ($item === $target) {
 196 |                     continue; // a node cannot be inserted before itself
 197 |                 }
 198 |                 $item = static::importNewnode($item, $target, $index);
 199 |                 $target->parentNode->insertBefore($item, $target);
 200 |                 $inserted[] = $item;
 201 |             }
 202 |         }
 203 |         $content->clear();
 204 |         $content->add($inserted);
 205 |         return $this;
 206 |     }
 207 | 
 208 |     /**
 209 |      * Create a deep copy of the set of matched elements.
 210 |      * Equivalent to clone() in jQuery (clone is not a valid PHP function name)
 211 |      *
 212 |      * @return HtmlPageCrawler
 213 |      * @api
 214 |      */
 215 |     public function makeClone()
 216 |     {
 217 |         $copy = clone $this;
 218 |         return $copy;
 219 |     }
 220 | 
 221 |     /** Replace the nodes held by this crawler with deep copies of them (runs when the crawler is cloned) */
 222 |     public function __clone()
 223 |     {
 224 |         $copies = [];
 225 |         foreach ($this as $original) {
 226 |             $copies[] = $original->cloneNode(true);
 227 |         }
 228 |         $this->clear();
 229 |         $this->add($copies);
 230 |     }
 231 | 
 232 |     /**
 233 |      * Get one CSS style property of the first element or set it for all elements in the list
 234 |      *
 235 |      * Function is here for compatibility with jQuery; it is the same as getStyle() and setStyle():
 236 |      * without $value (or with NULL) the call is forwarded to getStyle(), otherwise to setStyle().
 237 |      *
 238 |      * @see HtmlPageCrawler::getStyle()
 239 |      * @see HtmlPageCrawler::setStyle()
 240 |      *
 241 |      * @param string $key The name of the style property
 242 |      * @param null|string $value The CSS value to set, or NULL to get the current value
 243 |      * @return HtmlPageCrawler|string|null The property value when getting, $this when setting
 244 |      * @api
 245 |      */
 246 |     public function css($key, $value = null)
 247 |     {
 248 |         if ($value !== null) {
 249 |             return $this->setStyle($key, $value);
 250 |         }
 251 |         return $this->getStyle($key);
 252 |     }
 253 | 
 254 |     /**
 255 |      * Read one CSS property from the style attribute of the first element
 256 |      *
 257 |      * @param string $key name of the property
 258 |      * @return string|null value of the property, null if it is not set in the style attribute
 259 |      */
 260 |     public function getStyle($key)
 261 |     {
 262 |         $styles = Helpers::cssStringToArray($this->getAttribute('style'));
 263 |         return $styles[$key] ?? null;
 264 |     }
 265 | 
 266 |     /**
 267 |      * set one CSS style property for all elements in the list
 268 |      *
 269 |      * @param string $key name of the property
 270 |      * @param string $value value of the property; an empty value removes the property
 271 |      * @return HtmlPageCrawler $this for chaining
 272 |      */
 273 |     public function setStyle($key, $value)
 274 |     {
 275 |         foreach ($this as $element) {
 276 |             if (!$element instanceof \DOMElement) {
 277 |                 continue;
 278 |             }
 279 |             $styles = Helpers::cssStringToArray($element->getAttribute('style'));
 280 |             if ($value == '') {
 281 |                 unset($styles[$key]);
 282 |             } else {
 283 |                 $styles[$key] = $value;
 284 |             }
 285 |             $element->setAttribute('style', Helpers::cssArrayToString($styles));
 286 |         }
 287 |         return $this;
 288 |     }
 289 | 
 290 |     /**
 291 |      * Removes all child nodes and text from all nodes in set (jQuery's empty(), which is not a valid PHP function name)
 292 |      *
 293 |      * @return HtmlPageCrawler $this
 294 |      * @api
 295 |      */
 296 |     public function makeEmpty()
 297 |     {
 298 |         foreach ($this as $current) {
 299 |             // an empty nodeValue leaves the node without any content
 300 |             $current->nodeValue = '';
 301 |         }
 302 |         return $this;
 303 |     }
 304 | 
 305 |     /**
 306 |      * Determine whether any of the matched elements are assigned the given class.
 307 |      * Class names are compared as strings, so e.g. "1e1" does not match the class "10".
 308 |      *
 309 |      * @param string $name
 310 |      * @return bool
 311 |      * @api
 312 |      */
 313 |     public function hasClass($name)
 314 |     {
 315 |         $name = (string) $name;
 316 |         foreach ($this as $node) {
 317 |             $classes = $node instanceof \DOMElement ? preg_split('/\s+/', $node->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY) : [];
 318 |             if (in_array($name, $classes, true)) {
 319 |                 return true;
 320 |             }
 321 |         }
 322 |         return false;
 323 |     }
 324 | 
 325 |     /**
 326 |      * Set the HTML contents of each element
 327 |      *
 328 |      * The existing content of each element is removed first, then the new content is appended.
 329 |      *
 330 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment
 331 |      * @return HtmlPageCrawler $this for chaining
 332 |      * @api
 333 |      */
 334 |     public function setInnerHtml($content)
 335 |     {
 336 |         $content = self::create($content);
 337 |         foreach ($this as $target) {
 338 |             $target->nodeValue = '';
 339 |             foreach ($content as $child) {
 340 |                 $child = static::importNewnode($child, $target);
 341 |                 $target->appendChild($child);
 342 |             }
 343 |         }
 344 |         return $this;
 345 |     }
 346 | 
 347 |     /**
 348 |      * Get the result of Crawler::html(); named for consistency with setInnerHtml()
 349 |      * @return string
 350 |      * @api
 351 |      */
 352 |     public function getInnerHtml()
 353 |     {
 354 |         $html = parent::html();
 355 |         return $html;
 356 |     }
 357 | 
 358 |     /**
 359 |      * Insert every element in the set of matched elements after the target.
 360 |      *
 361 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element the target element(s)
 362 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted after the target elements
 363 |      * @api
 364 |      */
 365 |     public function insertAfter($element)
 366 |     {
 367 |         $targets = self::create($element);
 368 |         $inserted = [];
 369 |         foreach ($targets as $index => $target) {
 370 |             /** @var \DOMNode $target */
 371 |             $following = $target->nextSibling;
 372 |             foreach ($this as $item) {
 373 |                 /** @var \DOMNode $item */
 374 |                 $item = static::importNewnode($item, $target, $index);
 375 |                 if ($following !== null) {
 376 |                     $target->parentNode->insertBefore($item, $following);
 377 |                 } else {
 378 |                     $target->parentNode->appendChild($item);
 379 |                 }
 380 |                 $inserted[] = $item;
 381 |             }
 382 |         }
 383 |         return self::create($inserted);
 384 |     }
 385 | 
 386 |     /**
 387 |      * Insert every element in the set of matched elements before the target.
 388 |      *
 389 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element the target element(s)
 390 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted before the target elements
 391 |      * @api
 392 |      */
 393 |     public function insertBefore($element)
 394 |     {
 395 |         $targets = self::create($element);
 396 |         $inserted = [];
 397 |         foreach ($targets as $index => $target) {
 398 |             /** @var \DOMNode $target */
 399 |             foreach ($this as $item) {
 400 |                 /** @var \DOMNode $item */
 401 |                 $item = static::importNewnode($item, $target, $index);
 402 |                 if ($item !== $target) {
 403 |                     $target->parentNode->insertBefore($item, $target);
 404 |                 }
 405 |                 $inserted[] = $item;
 406 |             }
 407 |         }
 408 |         return self::create($inserted);
 409 |     }
 410 | 
 411 |     /**
 412 |      * Insert content, specified by the parameter, to the beginning of each element in the set of matched elements.
 413 |      *
 414 |      * If $content is a HtmlPageCrawler, it afterwards contains the nodes that were processed.
 415 |      *
 416 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment
 417 |      * @return HtmlPageCrawler $this for chaining
 418 |      * @api
 419 |      */
 420 |     public function prepend($content)
 421 |     {
 422 |         $content = self::create($content);
 423 |         $prepended = [];
 424 |         foreach ($this as $index => $parent) {
 425 |             $first = $parent->firstChild;
 426 |             foreach ($content as $child) {
 427 |                 $child = static::importNewnode($child, $parent, $index);
 428 |                 if ($first === null) {
 429 |                     $parent->appendChild($child);
 430 |                 } elseif ($first !== $child) {
 431 |                     $parent->insertBefore($child, $first);
 432 |                 }
 433 |                 $prepended[] = $child;
 434 |             }
 435 |         }
 436 |         $content->clear();
 437 |         $content->add($prepended);
 438 |         return $this;
 439 |     }
 440 | 
 441 |     /**
 442 |      * Insert every element in the set of matched elements to the beginning of the target.
 443 |      *
 444 |      * An imported node that is identical to the target itself is not inserted, but still part of the returned list.
 445 |      *
 446 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element the target element(s)
 447 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements prepended to the target elements
 448 |      * @api
 449 |      */
 450 |     public function prependTo($element)
 451 |     {
 452 |         $targets = self::create($element);
 453 |         $prepended = [];
 454 |         foreach ($targets as $index => $target) {
 455 |             /** @var \DOMNode $target */
 456 |             $first = $target->firstChild;
 457 |             foreach ($this as $item) {
 458 |                 /** @var \DOMNode $item */
 459 |                 $item = static::importNewnode($item, $target, $index);
 460 |                 if ($item !== $target && $first === null) {
 461 |                     $target->appendChild($item);
 462 |                 } elseif ($item !== $target) {
 463 |                     $target->insertBefore($item, $first);
 464 |                 }
 465 |                 $prepended[] = $item;
 466 |             }
 467 |         }
 468 |         return self::create($prepended);
 469 |     }
 470 | 
 471 |     /**
 472 |      * Remove the set of matched elements from the DOM.
 473 |      *
 474 |      * (as opposed to Crawler::clear() which detaches the nodes only from Crawler
 475 |      * but leaves them in the DOM)
 476 |      *
 477 |      * Only nodes whose parent is an element are detached; afterwards this crawler is empty.
 478 |      *
 479 |      * @api
 480 |      */
 481 |     public function remove()
 482 |     {
 483 |         foreach ($this as $node) {
 484 |             $parent = $node->parentNode;
 485 |             if ($parent instanceof \DOMElement) {
 486 |                 $parent->removeChild($node);
 487 |             }
 488 |         }
 489 |         $this->clear();
 490 |     }
 491 | 
 492 |     /**
 493 |      * Remove an attribute from each element in the set of matched elements.
 494 |      * Alias for removeAttribute for compatibility with jQuery
 495 |      *
 496 |      * @param string $name
 497 |      * @return HtmlPageCrawler
 498 |      * @api
 499 |      */
 500 |     public function removeAttr($name)
 501 |     {
 502 |         $result = $this->removeAttribute($name);
 503 |         return $result;
 504 |     }
 505 | 
 506 |     /**
 507 |      * Remove an attribute from each element in the set of matched elements.
 508 |      *
 509 |      * Nodes that are not elements or do not have the attribute are left untouched.
 510 |      *
 511 |      * @param string $name
 512 |      * @return HtmlPageCrawler $this for chaining
 513 |      */
 514 |     public function removeAttribute($name)
 515 |     {
 516 |         foreach ($this as $element) {
 517 |             $removable = $element instanceof \DOMElement && $element->hasAttribute($name);
 518 |             if ($removable) {
 519 |                 $element->removeAttribute($name);
 520 |             }
 521 |         }
 522 |         return $this;
 523 |     }
 524 | 
 525 |     /**
 526 |      * Remove one or more classes from each element in the list
 527 |      *
 528 |      * Elements without a class attribute are left untouched (no empty class attribute is
 529 |      * created); class names are compared as strings.
 530 |      *
 531 |      * @param string $name One or more space-separated classes to be removed
 532 |      * @return HtmlPageCrawler $this for chaining
 533 |      * @api
 534 |      */
 535 |     public function removeClass($name)
 536 |     {
 537 |         $unwanted = preg_split('/\s+/', (string) $name, -1, PREG_SPLIT_NO_EMPTY);
 538 |         foreach ($this as $node) {
 539 |             if (!$node instanceof \DOMElement || !$node->hasAttribute('class')) {
 540 |                 continue;
 541 |             }
 542 |             /** @var \DOMElement $node */
 543 |             $classes = preg_split('/\s+/', $node->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
 544 |             // array_diff compares as strings and keeps the order of the remaining classes
 545 |             $node->setAttribute('class', join(' ', array_diff($classes, $unwanted)));
 546 |         }
 547 |         return $this;
 548 |     }
 549 | 
 550 |     /**
 551 |      * Replace each target element with the set of matched elements.
 552 |      *
 553 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element the target element(s) to be replaced
 554 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements that replaced the target elements
 555 |      * @api
 556 |      */
 557 |     public function replaceAll($element)
 558 |     {
 559 |         $targets = self::create($element);
 560 |         $replacements = [];
 561 |         foreach ($targets as $index => $target) {
 562 |             /** @var \DOMNode $target */
 563 |             $container = $target->parentNode;
 564 |             $following = $target->nextSibling;
 565 |             foreach ($this as $position => $item) {
 566 |                 /** @var \DOMNode $item */
 567 |                 $item = static::importNewnode($item, $target, $index);
 568 |                 if ($position != 0) {
 569 |                     $container->insertBefore($item, $following);
 570 |                 } else {
 571 |                     $container->replaceChild($item, $target);
 572 |                 }
 573 |                 $replacements[] = $item;
 574 |             }
 575 |         }
 576 |         return self::create($replacements);
 577 |     }
 578 | 
 579 |     /**
 580 |      * Replace each element in the set of matched elements with the provided new content and return the set of elements that was removed.
 581 |      *
 582 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
 583 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 584 |      * @api
 585 |      */
 586 |     public function replaceWith($content)
 587 |     {
 588 |         $content = self::create($content);
 589 |         $newnodes = array();
 590 |         foreach ($this as $i => $node) {
 591 |             /** @var \DOMNode $node */
 592 |             $parent = $node->parentNode;
 593 |             $refnode  = $node->nextSibling;
 594 |             foreach ($content as $j => $newnode) {
 595 |                 /** @var \DOMNode $newnode */
 596 |                 $newnode = static::importNewnode($newnode, $node, $i);
 597 |                 if ($j == 0) {
 598 |                     $parent->replaceChild($newnode, $node);
 599 |                 } else {
 600 |                     $parent->insertBefore($newnode, $refnode);
 601 |                 }
 602 |                 $newnodes[] = $newnode;
 603 |             }
 604 |         }
 605 |         $content->clear();
 606 |         $content->add($newnodes);
 607 |         return $this;
 608 |     }
 609 | 
 610 |     /**
 611 |      * Get the combined text contents of each element in the set of matched elements, including their descendants.
 612 |      * This is what the jQuery text() function does, contrary to the Crawler::text() method that returns only
 613 |      * the text of the first node.
 614 |      *
 615 |      * @return string
 616 |      * @api
 617 |      */
 618 |     public function getCombinedText()
 619 |     {
 620 |         $text = '';
 621 |         foreach ($this as $node) {
 622 |             /** @var \DOMNode $node */
 623 |             $text .= $node->nodeValue;
 624 |         }
 625 |         return $text;
 626 |     }
 627 | 
 628 |     /**
 629 |      * Set the text contents of the matched elements.
 630 |      *
 631 |      * @param string $text
 632 |      * @return HtmlPageCrawler
 633 |      * @api
 634 |      */
 635 |     public function setText($text)
 636 |     {
 637 |         $text = htmlspecialchars($text);
 638 |         foreach ($this as $node) {
 639 |             /** @var \DOMNode $node */
 640 |             $node->nodeValue = $text;
 641 |         }
 642 |         return $this;
 643 |     }
 644 | 
 645 |     /**
 646 |      * Add or remove one or more classes from each element in the set of matched elements, depending the class’s presence.
 647 |      *
 648 |      * @param string $classname One or more classnames separated by spaces
 649 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 650 |      * @api
 651 |      */
 652 |     public function toggleClass($classname)
 653 |     {
 654 |         $classes = explode(' ', $classname);
 655 |         foreach ($this as $i => $node) {
 656 |             $c = self::create($node);
 657 |             /** @var \DOMNode $node */
 658 |             foreach ($classes as $class) {
 659 |                 if ($c->hasClass($class)) {
 660 |                     $c->removeClass($class);
 661 |                 } else {
 662 |                     $c->addClass($class);
 663 |                 }
 664 |             }
 665 |         }
 666 |         return $this;
 667 |     }
 668 | 
 669 |     /**
 670 |      * Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place.
 671 |      *
 672 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 673 |      * @api
 674 |      */
 675 |     public function unwrap()
 676 |     {
 677 |         $parents = array();
 678 |         foreach($this as $i => $node) {
 679 |             $parents[] = $node->parentNode;
 680 |         }
 681 | 
 682 |         self::create($parents)->unwrapInner();
 683 |         return $this;
 684 |     }
 685 | 
 686 |     /**
 687 |      * Remove the matched elements, but promote the children to take their place.
 688 |      *
 689 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 690 |      * @api
 691 |      */
 692 |     public function unwrapInner()
 693 |     {
 694 |         foreach($this as $i => $node) {
 695 |             if (!$node->parentNode instanceof \DOMElement) {
 696 |                 throw new \InvalidArgumentException('DOMElement does not have a parent DOMElement node.');
 697 |             }
 698 | 
 699 |             /** @var \DOMNode[] $children */
 700 |             $children = iterator_to_array($node->childNodes);
 701 |             foreach ($children as $child) {
 702 |                 $node->parentNode->insertBefore($child, $node);
 703 |             }
 704 | 
 705 |             $node->parentNode->removeChild($node);
 706 |         }
 707 |     }
 708 | 
 709 | 
 710 |     /**
 711 |      * Wrap an HTML structure around each element in the set of matched elements
 712 |      *
 713 |      * The HTML structure must contain only one root node, e.g.:
 714 |      * Works: <div><div></div></div>
 715 |      * Does not work: <div></div><div></div>
 716 |      *
 717 |      * @param string|HtmlPageCrawler|\DOMNode $wrappingElement
 718 |      * @return HtmlPageCrawler $this for chaining
 719 |      * @api
 720 |      */
 721 |     public function wrap($wrappingElement)
 722 |     {
 723 |         $content = self::create($wrappingElement);
 724 |         $newnodes = array();
 725 |         foreach ($this as $i => $node) {
 726 |             /** @var \DOMNode $node */
 727 |             $newnode = $content->getNode(0);
 728 |             /** @var \DOMNode $newnode */
 729 | //            $newnode = static::importNewnode($newnode, $node, $i);
 730 |             if ($newnode->ownerDocument !== $node->ownerDocument) {
 731 |                 $newnode = $node->ownerDocument->importNode($newnode, true);
 732 |             } else {
 733 |                 if ($i > 0) {
 734 |                     $newnode = $newnode->cloneNode(true);
 735 |                 }
 736 |             }
 737 |             $oldnode = $node->parentNode->replaceChild($newnode, $node);
 738 |             while ($newnode->hasChildNodes()) {
 739 |                 $elementFound = false;
 740 |                 foreach ($newnode->childNodes as $child) {
 741 |                     if ($child instanceof \DOMElement) {
 742 |                         $newnode = $child;
 743 |                         $elementFound = true;
 744 |                         break;
 745 |                     }
 746 |                 }
 747 |                 if (!$elementFound) {
 748 |                     break;
 749 |                 }
 750 |             }
 751 |             $newnode->appendChild($oldnode);
 752 |             $newnodes[] = $newnode;
 753 |         }
 754 |         $content->clear();
 755 |         $content->add($newnodes);
 756 |         return $this;
 757 |     }
 758 | 
 759 |     /**
 760 |      * Wrap an HTML structure around all elements in the set of matched elements.
 761 |      *
 762 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
 763 |      * @throws \LogicException
 764 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 765 |      * @api
 766 |      */
 767 |     public function wrapAll($content)
 768 |     {
 769 |         $content = self::create($content);
 770 |         $parent = $this->getNode(0)->parentNode;
 771 |         foreach ($this as $i => $node) {
 772 |             /** @var \DOMNode $node */
 773 |             if ($node->parentNode !== $parent) {
 774 |                 throw new \LogicException('Nodes to be wrapped with wrapAll() must all have the same parent');
 775 |             }
 776 |         }
 777 | 
 778 |         $newnode = $content->getNode(0);
 779 |         /** @var \DOMNode $newnode */
 780 |         $newnode = static::importNewnode($newnode, $parent);
 781 | 
 782 |         $newnode = $parent->insertBefore($newnode,$this->getNode(0));
 783 |         $content->clear();
 784 |         $content->add($newnode);
 785 | 
 786 |         while ($newnode->hasChildNodes()) {
 787 |             $elementFound = false;
 788 |             foreach ($newnode->childNodes as $child) {
 789 |                 if ($child instanceof \DOMElement) {
 790 |                     $newnode = $child;
 791 |                     $elementFound = true;
 792 |                     break;
 793 |                 }
 794 |             }
 795 |             if (!$elementFound) {
 796 |                 break;
 797 |             }
 798 |         }
 799 |         foreach ($this as $i => $node) {
 800 |             /** @var \DOMNode $node */
 801 |             $newnode->appendChild($node);
 802 |         }
 803 |         return $this;
 804 |     }
 805 | 
 806 |     /**
 807 |      * Wrap an HTML structure around the content of each element in the set of matched elements.
 808 |      *
 809 |      * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content
 810 |      * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining
 811 |      * @api
 812 |      */
 813 |     public function wrapInner($content)
 814 |     {
 815 |         foreach ($this as $i => $node) {
 816 |             /** @var \DOMNode $node */
 817 |             self::create($node->childNodes)->wrapAll($content);
 818 |         }
 819 |         return $this;
 820 |     }
 821 | 
 822 |     /**
 823 |      * Get the HTML code fragment of all elements and their contents.
 824 |      *
 825 |      * If the first node contains a complete HTML document return only
 826 |      * the full code of this document.
 827 |      *
 828 |      * @return string HTML code (fragment)
 829 |      * @api
 830 |      */
 831 |     public function saveHTML()
 832 |     {
 833 |         if ($this->isHtmlDocument()) {
 834 |             return $this->getDOMDocument()->saveHTML();
 835 |         } else {
 836 |             $doc = new \DOMDocument('1.0', 'UTF-8');
 837 |             $root = $doc->appendChild($doc->createElement('_root'));
 838 |             foreach ($this as $node) {
 839 |                 $root->appendChild($doc->importNode($node, true));
 840 |             }
 841 |             $html = trim($doc->saveHTML());
 842 |             return preg_replace('@^<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|</'.self::FRAGMENT_ROOT_TAGNAME.'>$@', '', $html);
 843 |         }
 844 |     }
 845 | 
 846 |     public function __toString()
 847 |     {
 848 |         return $this->saveHTML();
 849 |     }
 850 | 
 851 |     /**
 852 |      * checks whether the first node contains a complete html document
 853 |      * (as opposed to a document fragment)
 854 |      *
 855 |      * @return boolean
 856 |      */
 857 |     public function isHtmlDocument()
 858 |     {
 859 |         $node = $this->getNode(0);
 860 |         if ($node instanceof \DOMElement
 861 |             && $node->ownerDocument instanceof \DOMDocument
 862 |             && $node->ownerDocument->documentElement === $node
 863 |             && $node->nodeName == 'html'
 864 |         ) {
 865 |             return true;
 866 |         } else {
 867 |             return false;
 868 |         }
 869 |     }
 870 | 
 871 |     /**
 872 |      * get ownerDocument of the first element
 873 |      *
 874 |      * @return \DOMDocument|null
 875 |      */
 876 |     public function getDOMDocument()
 877 |     {
 878 |         $node = $this->getNode(0);
 879 |         $r = null;
 880 |         if ($node instanceof \DOMElement
 881 |             && $node->ownerDocument instanceof \DOMDocument
 882 |         ) {
 883 |             $r = $node->ownerDocument;
 884 |         }
 885 |         return $r;
 886 |     }
 887 | 
 888 |     /**
 889 |      * Filters the list of nodes with a CSS selector.
 890 |      *
 891 |      * @param string $selector
 892 |      * @return HtmlPageCrawler
 893 |      */
 894 |     public function filter(string $selector): static
 895 |     {
 896 |         return parent::filter($selector);
 897 |     }
 898 | 
 899 |     /**
 900 |      * Filters the list of nodes with an XPath expression.
 901 |      *
 902 |      * @param string $xpath An XPath expression
 903 |      *
 904 |      * @return HtmlPageCrawler A new instance of Crawler with the filtered list of nodes
 905 |      *
 906 |      * @api
 907 |      */
 908 |     public function filterXPath($xpath): static
 909 |     {
 910 |         return parent::filterXPath($xpath);
 911 |     }
 912 | 
 913 |     /**
 914 |      * Adds HTML/XML content to the HtmlPageCrawler object (but not to the DOM of an already attached node).
 915 |      *
 916 |      * Function overriden from Crawler because HTML fragments are always added as complete documents there
 917 |      *
 918 |      *
 919 |      * @param string      $content A string to parse as HTML/XML
 920 |      * @param null|string $type    The content type of the string
 921 |      *
 922 |      * @return null|void
 923 |      */
 924 |     public function addContent($content, $type = null): void
 925 |     {
 926 |         if (empty($type)) {
 927 |             $type = 'text/html;charset=UTF-8';
 928 |         }
 929 |         if (substr($type, 0, 9) == 'text/html' && !preg_match('/<html\b[^>]*>/i', $content)) {
 930 |             // string contains no <html> Tag => no complete document but an HTML fragment!
 931 |             $this->addHtmlFragment($content);
 932 |         } else {
 933 |             parent::addContent($content, $type);
 934 |         }
 935 |     }
 936 | 
 937 |     public function addHtmlFragment($content, $charset = 'UTF-8')
 938 |     {
 939 |         $d = new \DOMDocument('1.0', $charset);
 940 |         $d->preserveWhiteSpace = false;
 941 |         $root = $d->appendChild($d->createElement(self::FRAGMENT_ROOT_TAGNAME));
 942 |         $bodynode = Helpers::getBodyNodeFromHtmlFragment($content, $charset);
 943 |         foreach ($bodynode->childNodes as $child) {
 944 |             $inode = $root->appendChild($d->importNode($child, true));
 945 |             if ($inode) {
 946 |                 $this->addNode($inode);
 947 |             }
 948 |         }
 949 |     }
 950 | 
 951 |     /**
 952 |      * Adds a node to the current list of nodes.
 953 |      *
 954 |      * This method uses the appropriate specialized add*() method based
 955 |      * on the type of the argument.
 956 |      *
 957 |      * Overwritten from parent to allow Crawler to be added
 958 |      *
 959 |      * @param \DOMNodeList|\DOMNode|array|string|Crawler|null $node A node
 960 |      *
 961 |      * @api
 962 |      */
 963 |     public function add(\DOMNodeList|\DOMNode|array|string|Crawler|null $node): void
 964 |     {
 965 |         if ($node instanceof Crawler) {
 966 |             foreach ($node as $childnode) {
 967 |                 $this->addNode($childnode);
 968 |             }
 969 |         } else {
 970 |             parent::add($node);
 971 |         }
 972 |     }
 973 | 
 974 |     /**
 975 |      * @param \DOMNode $newnode
 976 |      * @param \DOMNode $referencenode
 977 |      * @param int $clone
 978 |      * @return \DOMNode
 979 |      */
 980 |     protected static function importNewnode(\DOMNode $newnode, \DOMNode $referencenode, $clone = 0) {
 981 |         if ($newnode->ownerDocument !== $referencenode->ownerDocument) {
 982 |             $referencenode->ownerDocument->preserveWhiteSpace = false;
 983 |             $newnode = $referencenode->ownerDocument->importNode($newnode, true);
 984 |         } else {
 985 |             if ($clone > 0) {
 986 |                 $newnode = $newnode->cloneNode(true);
 987 |             }
 988 |         }
 989 |         return $newnode;
 990 |     }
 991 | 
 992 | //    /**
 993 | //     * Checks whether the first node in the set is disconnected (has no parent node)
 994 | //     *
 995 | //     * @return bool
 996 | //     */
 997 | //    public function isDisconnected()
 998 | //    {
 999 | //        $parent = $this->getNode(0)->parentNode;
1000 | //        return ($parent == null || $parent->tagName == self::FRAGMENT_ROOT_TAGNAME);
1001 | //    }
1002 | 
1003 |     public function __get($name)
1004 |     {
1005 |         switch ($name) {
1006 |             case 'count':
1007 |             case 'length':
1008 |                 return count($this);
1009 |         }
1010 |         throw new \Exception('No such property ' . $name);
1011 |     }
1012 | }
1013 | 


--------------------------------------------------------------------------------