├── README.md
├── composer.json
├── docs
│   ├── accessing.md
│   └── traversing.md
└── src
    ├── Collectors
    │   └── NodesCollector.php
    ├── Contracts
    │   ├── DomContract.php
    │   ├── NodeContract.php
    │   └── NodesCollectorContract.php
    ├── Dom.php
    ├── DomFactory.php
    ├── Node.php
    ├── NodeFactory.php
    └── Sources
        └── simple_html_dom.php
/README.md:
--------------------------------------------------------------------------------
1 | # HTML DOM parser for PHP
2 |
3 | [](https://travis-ci.com/zenthangplus/HTMLDomParser)
4 | [](https://app.codeship.com/projects/339848)
5 | [](https://circleci.com/gh/zenthangplus/HTMLDomParser)
6 |
7 | A simple HTML DOM parser written in PHP that lets you manipulate HTML in an easy way with selectors, just like CSS or jQuery.
8 |
9 | > This is a modern version of [Simple HTML DOM](https://simplehtmldom.sourceforge.io/).
10 | You can install it with [Composer](https://getcomposer.org/) and import it into your project as a package.
11 |
12 | ### Features
13 |
14 | - Parse and modify HTML document.
15 | - Find tags (elements) on HTML with selectors just like jQuery.
16 | - Extract contents from HTML in a single line.
17 | - Export elements or a special node to a single file.
18 | - Supports HTML document with invalid structure.
19 |
20 | ## Installation
21 |
22 | You can use [Composer](https://getcomposer.org/) to install this package into your project by running the following command:
23 |
24 | ```bash
25 | composer require zenthangplus/html-dom-parser
26 | ```
27 |
28 | **The minimum PHP version requirement is 5.6**. If you are using PHP < 5.6, please use [the original version](https://simplehtmldom.sourceforge.io/).
29 |
30 | ## Usage
31 | The following example shows a simple usage of this package:
32 |
33 | ```php
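34 | <?php
35 | $dom = \HTMLDomParser\DomFactory::load('<div class="container"><a href="#">Test</a></div>');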
36 | $a = $dom->findOne('.container a');
37 | echo $a->text();
38 | // Output: Test
39 | ```
40 |
41 | ### DOM
42 | Dom is the root [Node](#node) of the HTML document.
43 |
44 | You can load DOM from `string` or `file`.
45 |
46 | ```php
47 | <?php
48 | $dom = \HTMLDomParser\DomFactory::load('<div>Test</div>');
49 | ```
50 |
51 | ```php
52 | Test');
64 | ```
65 |
66 | ```php
67 | find('div');
79 | $dom->find('#container');
80 | $dom->find('#container .content ul>li a.external-link');
81 | $dom->find('#container .content ul>li a[data-id=link-1]');
82 | ```
83 |
84 | Similarly to Dom, a Node is also traversable:
85 | ```php
86 | findOne('#container .content ul>li');
89 | $anchorNode = $node->findOne('a.external-link');
90 |
91 | // Even traverse in a separate Node
92 | $node = \HTMLDomParser\NodeFactory::load('<ul class="list"><li>Item 1</li><li>Item 2</li></ul>');
93 | $node->find('ul.list li');
94 | ```
95 |
96 | ##### List of supported selectors:
97 |
98 | | Selector example | Description |
99 | | --- | --- |
100 | | `div` | Find elements with the `div` tag |
101 | | `#container` | Find elements with the `container` id |
102 | | `.wrapper` | Find elements with the `wrapper` class |
103 | | `[data-id]` | Find elements with the `data-id` attribute |
104 | | `[data-id=12]` | Find elements with the attribute `data-id=12` |
105 | | `a[data-id=12]` | Find anchor tags with the attribute `data-id=12` |
106 | | `*[class]` | Find all elements with `class` attribute |
107 | | `a, img` | Find all anchors and images |
108 | | `a[title], img[title]` | Find all anchors and images with the `title` attribute |
109 | | `#container ul` | By using `space` between selectors, you can find nested elements |
110 | | `#container>ul` | By using `>` between selectors, you can find the closest children |
111 | | `#container, #side` | By using `,` between selectors, you can find elements by multiple selectors in one query |
112 | | `#container div.content ul>li, #side div[role=main] ul li` | You can combine selectors in one query |
113 |
114 | ##### List of functions you can use with the above selectors:
115 |
116 | - [`find()` Find elements](docs/traversing.md#find-elements)
117 | - [`findOne()` Find one element](docs/traversing.md#find-one-element)
118 |
119 | ##### Specific find functions:
120 |
121 | - [`getElementById()` Get a element by ID](docs/traversing.md#get-element-by-id)
122 | - [`getElementByTagName()` Get a element by tag name](docs/traversing.md#get-a-element-by-tag-name)
123 | - [`getElementsByTagName()` Get elements by tag name](docs/traversing.md#get-elements-by-tag-name)
124 |
125 | ##### Accessing the node's data:
126 |
127 | - [`text()` Get the text contents](docs/accessing.md#get-the-text-contents)
128 | - [`getAttributes()` Get attributes](docs/accessing.md#get-all-attributes)
129 | - [`getAttribute()` Get a attribute](docs/accessing.md#get-a-attribute)
130 | - [`hasAttribute()` Check element has a attribute](docs/accessing.md#check-element-has-a-attribute)
131 | - [`hasChild()` Check element has child](docs/accessing.md#check-element-has-child)
132 | - [`innerHtml()` Get inner HTML](docs/accessing.md#get-inner-html)
133 | - [`outerHtml()` Get outer HTML](docs/accessing.md#get-outer-html)
134 | - [`innerXml()` Get inner XML](docs/accessing.md#get-inner-xml)
135 | - [Get node's HTML](docs/accessing.md#get-nodes-html)
136 |
137 | ##### Modifying the Node's data
138 | - [`setAttribute()` Set a attribute](docs/accessing.md#set-a-attribute)
139 | - [`removeAttribute()` Remove a attribute](docs/accessing.md#remove-a-attribute)
140 | - [`appendChild()` Append child](docs/accessing.md#append-child)
141 | - [`save()` Save DOM or even a node](docs/accessing.md#save-dom-or-even-a-node)
142 |
143 | ##### Traversing the Node tree
144 |
145 | - [`getChild()` Get child element](docs/accessing.md#get-child-element)
146 | - [`getChildren()` Get all children](docs/accessing.md#get-all-children)
147 | - [`getFirstChild()` Get first child](docs/accessing.md#get-first-child)
148 | - [`getLastChild()` Get last child](docs/accessing.md#get-last-child)
149 | - [`getNextSibling()` Get next sibling](docs/accessing.md#get-next-sibling)
150 | - [`getPrevSibling()` Get previous sibling](docs/accessing.md#get-previous-sibling)
151 | - [`findAncestorTag()` Find ancestor tag](docs/accessing.md#find-ancestor-tag)
152 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "zenthangplus/html-dom-parser",
3 | "type": "library",
4 | "description": "A Simple HTML DOM parser written in PHP let you manipulate HTML in a easy way with CSS Selector.",
5 | "keywords": ["parser","dom", "html", "selector"],
6 | "license": "MIT",
7 | "authors": [
8 | {
9 | "name": "zenthangplus",
10 | "email": "zenthangplus@gmail.com"
11 | }
12 | ],
13 | "autoload": {
14 | "files": ["src/Sources/simple_html_dom.php"],
15 | "psr-4": {"HTMLDomParser\\": "src/"}
16 | },
17 | "autoload-dev": {
18 | "psr-4": {
19 | "HTMLDomParserTests\\": "tests/"
20 | }
21 | },
22 | "require": {
23 | "php": ">=5.6"
24 | },
25 | "require-dev": {
26 | "phpunit/phpunit": "^5"
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/docs/accessing.md:
--------------------------------------------------------------------------------
1 | # Accessing the Node
2 | You can access the Node's data, such as its contents and attributes, by using the functions below.
3 |
4 | ##### Accessing the node's data:
5 |
6 | - [`text()` Get the text contents](#get-the-text-contents)
7 | - [`getAttributes()` Get attributes](#get-all-attributes)
8 | - [`getAttribute()` Get a attribute](#get-a-attribute)
9 | - [`hasAttribute()` Check element has a attribute](#check-element-has-a-attribute)
10 | - [`hasChild()` Check element has child](#check-element-has-child)
11 | - [`innerHtml()` Get inner HTML](#get-inner-html)
12 | - [`outerHtml()` Get outer HTML](#get-outer-html)
13 | - [`innerXml()` Get inner XML](#get-inner-xml)
14 | - [Get node's HTML](#get-nodes-html)
15 |
16 | ##### Modifying the Node's data
17 | - [`setAttribute()` Set a attribute](#set-a-attribute)
18 | - [`removeAttribute()` Remove a attribute](#remove-a-attribute)
19 | - [`appendChild()` Append child](#append-child)
20 | - [`save()` Save DOM or even a node](#save-dom-or-even-a-node)
21 |
22 | ##### Traversing the Node tree
23 |
24 | - [`getChild()` Get child element](#get-child-element)
25 | - [`getChildren()` Get all children](#get-all-children)
26 | - [`getFirstChild()` Get first child](#get-first-child)
27 | - [`getLastChild()` Get last child](#get-last-child)
28 | - [`getNextSibling()` Get next sibling](#get-next-sibling)
29 | - [`getPrevSibling()` Get previous sibling](#get-previous-sibling)
30 | - [`findAncestorTag()` Find ancestor tag](#find-ancestor-tag)
31 |
32 | ## Accessing the Node's data
33 | ### Get the text contents
34 |
35 | The `text()` method returns the element's text contents.
36 |
37 | ```php
38 | function text(): string
39 | ```
40 |
41 | ### Get all attributes
42 |
43 | The `getAttributes()` method returns all element's attributes.
44 |
45 | ```php
46 | function getAttributes(): array
47 | ```
48 |
49 | ### Get a attribute
50 |
51 | The `getAttribute()` method returns the element's attribute by name.
52 |
53 | ```php
54 | function getAttribute(string $name): string
55 | ```
56 |
57 | ### Check element has a attribute
58 |
59 | The `hasAttribute()` method checks whether the current element has the given attribute.
60 |
61 | ```php
62 | function hasAttribute(string $name): bool
63 | ```
64 |
65 | ### Check element has child
66 |
67 | The `hasChild()` method checks whether the current element has a child.
68 |
69 | ```php
70 | function hasChild(): bool
71 | ```
72 |
73 | ### Get inner HTML
74 |
75 | The `innerHtml()` method returns the element's inner HTML.
76 |
77 | ```php
78 | function innerHtml(): string
79 | ```
80 |
81 | ### Get outer HTML
82 |
83 | The `outerHtml()` method returns element's outer HTML.
84 |
85 | ```php
86 | function outerHtml(): string
87 | ```
88 |
89 | ### Get inner XML
90 |
91 | The `innerXml()` method returns the element's inner XML.
92 |
93 | ```php
94 | function innerXml(): string
95 | ```
96 |
97 | ### Get node's HTML
98 |
99 | A Node can be converted to HTML by using casting functions or print it directly.
100 |
101 | Example:
102 | ```php
103 | echo $node;
104 | $html = (string)$node;
105 | ```
106 |
107 |
108 | ## Modifying the Node's data
109 | ### Set a attribute
110 |
111 | The `setAttribute()` method sets the value of an attribute by name.
112 |
113 | ```php
114 | function setAttribute(string $name, mixed $value): void
115 | ```
116 |
117 | ### Remove a attribute
118 |
119 | The `removeAttribute()` method removes an attribute from the current element by name.
120 |
121 | ```php
122 | function removeAttribute(string $name): void
123 | ```
124 |
125 | ### Set parent
126 |
127 | The `setParent()` method will set parent for current element.
128 |
129 | ```php
130 | function setParent(NodeContract $node): void
131 | ```
132 |
133 | ### Append child
134 |
135 | The `appendChild()` method will append a Node as a child of current element.
136 |
137 | ```php
138 | function appendChild(NodeContract $node): void
139 | ```
140 |
141 | ### Save DOM or even a node
142 |
143 | The `save()` method returns the current DOM (or element) as HTML and, if you provide a file path, also saves it to that file.
144 |
145 | ```php
146 | function save(string $filePath = ''): string
147 | ```
148 |
149 |
150 | ## Traversing the Node tree
151 | ### Get child element
152 | The `getChild()` method returns the Nth child element.
153 |
154 | ```php
155 | function getChild(int $idx): NodeContract|null
156 | ```
157 |
158 | ### Get all children
159 | The `getChildren()` method returns a list of children elements.
160 |
161 | ```php
162 | function getChildren(): NodeContract[]
163 | ```
164 |
165 | ### Get first child
166 | The `getFirstChild()` method returns the first child element.
167 |
168 | ```php
169 | function getFirstChild(): NodeContract|null
170 | ```
171 |
172 | ### Get last child
173 | The `getLastChild()` method returns the last child element.
174 |
175 | ```php
176 | function getLastChild(): NodeContract|null
177 | ```
178 |
179 | ### Get next sibling
180 | The `getNextSibling()` method returns the next sibling element.
181 |
182 | ```php
183 | function getNextSibling(): NodeContract|null
184 | ```
185 |
186 | ### Get previous sibling
187 | The `getPrevSibling()` method returns the previous sibling element.
188 |
189 | ```php
190 | function getPrevSibling(): NodeContract|null
191 | ```
192 |
193 | ### Find ancestor tag
194 | The `findAncestorTag()` method returns the first ancestor tag.
195 |
196 | ```php
197 | function findAncestorTag(string $tag): NodeContract|null
198 | ```
199 |
--------------------------------------------------------------------------------
/docs/traversing.md:
--------------------------------------------------------------------------------
1 | # Traversing the DOM
2 | By using selectors like those of jQuery or CSS, you can easily traverse the Dom or even a single Node.
3 |
4 | ##### List of supported selectors:
5 |
6 | | Selector example | Description |
7 | | --- | --- |
8 | | `div` | Find elements with the `div` tag |
9 | | `#container` | Find elements with the `container` id |
10 | | `.wrapper` | Find elements with the `wrapper` class |
11 | | `[data-id]` | Find elements with the `data-id` attribute |
12 | | `[data-id=12]` | Find elements with the attribute `data-id=12` |
13 | | `a[data-id=12]` | Find anchor tags with the attribute `data-id=12` |
14 | | `*[class]` | Find all elements with `class` attribute |
15 | | `a, img` | Find all anchors and images |
16 | | `a[title], img[title]` | Find all anchors and images with the `title` attribute |
17 | | `#container ul` | By using `space` between selectors, you can find nested elements |
18 | | `#container>ul` | By using `>` between selectors, you can find the closest children |
19 | | `#container, #side` | By using `,` between selectors, you can find elements by multiple selectors in one query |
20 | | `#container div.content ul>li, #side div[role=main] ul li` | You can combine selectors in one query |
21 |
22 | ##### List of functions you can use with the above selectors:
23 |
24 | - [`find()` Find elements](#find-elements)
25 | - [`findOne()` Find one element](#find-one-element)
26 |
27 | ##### Specific find functions:
28 |
29 | - [`getElementById()` Get a element by ID](#get-element-by-id)
30 | - [`getElementByTagName()` Get a element by tag name](#get-a-element-by-tag-name)
31 | - [`getElementsByTagName()` Get elements by tag name](#get-elements-by-tag-name)
32 |
33 | ##### Traversing the NODE tree
34 |
35 | - [`getChild()` Get child element](accessing.md#get-child-element)
36 | - [`getChildren()` Get all children](accessing.md#get-all-children)
37 | - [`getFirstChild()` Get first child](accessing.md#get-first-child)
38 | - [`getLastChild()` Get last child](accessing.md#get-last-child)
39 | - [`getNextSibling()` Get next sibling](accessing.md#get-next-sibling)
40 | - [`getPrevSibling()` Get previous sibling](accessing.md#get-previous-sibling)
41 | - [`findAncestorTag()` Find ancestor tag](accessing.md#find-ancestor-tag)
42 |
43 | ## Find elements
44 | The `find()` method returns a collection of an element's child elements by [selectors](#list-of-supported-selectors).
45 |
46 | ```php
47 | function find(string $selector): NodeContract[]
48 | ```
49 |
50 | Example:
51 | ```php
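52 | <?php
53 | $dom = \HTMLDomParser\DomFactory::load('<div id="container"><a href="#">First</a><a href="#">Second</a></div>');
54 | $elements = $dom->find('#container a');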
55 | foreach ($elements as $element) {
56 | echo $element->text();
57 | }
58 | ```
59 |
60 | ## Find one element
61 | The `findOne()` method returns only 1 child element by selectors and an index.
62 |
63 | ```php
64 | function findOne(string $selector, int $index = 0): NodeContract|null
65 | ```
66 |
67 | Example:
68 | ```php
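69 | <?php
70 | $dom = \HTMLDomParser\DomFactory::load('<div id="container"><a href="#">First</a><a href="#">Second</a><a href="#">Third</a></div>');
71 | $anchor = $dom->findOne('#container a');// Return the first anchor tag (with index=0) inside #container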
72 | $anchor = $dom->findOne('#container a', 1);// Return the anchor tag with index=1 inside #container
73 | $anchor = $dom->findOne('#container a', 2);// Return the anchor tag with index=2 inside #container
74 | $anchor = $dom->findOne('#container a', -1);// Reverse search, return the last anchor tag inside #container
75 | ```
76 |
77 | ## Get element by ID
78 | The `getElementById()` method returns an element's child element by ID.
79 |
80 | ```php
81 | function getElementById(string $id): NodeContract|null
82 | ```
83 |
84 | Example:
85 | ```php
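86 | <?php
87 | $dom = \HTMLDomParser\DomFactory::load('<div id="container"><p>Content</p></div>');
88 | $container = $dom->getElementById('container');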
89 | echo $container->innerHtml();
90 | ```
91 |
92 | ## Get a element by tag name
93 | The `getElementByTagName()` method returns only 1 child element with the specified tag name and an index.
94 |
95 | ```php
96 | function getElementByTagName(string $tagName, int $index = 0): NodeContract|null
97 | ```
98 |
99 | Example:
100 | ```php
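101 | <?php
102 | $dom = \HTMLDomParser\DomFactory::load('<div><p>First</p><p>Second</p><p>Third</p></div>');
103 | $paragraph = $dom->getElementByTagName('p');// Return the first paragraph (with index=0)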
104 | $paragraph = $dom->getElementByTagName('p', 1);// Return the paragraph with index=1
105 | $paragraph = $dom->getElementByTagName('p', 2);// Return the paragraph with index=2
106 | $paragraph = $dom->getElementByTagName('p', -1);// Reverse search, return the last paragraph
107 | ```
108 |
109 | ## Get elements by tag name
110 | The `getElementsByTagName()` method returns a collection of an element's child elements with the specified tag name.
111 |
112 | ```php
113 | function getElementsByTagName(string $tagName): NodeContract[]
114 | ```
115 |
116 | Example:
117 | ```php
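118 | <?php
119 | $dom = \HTMLDomParser\DomFactory::load('<div><p>First</p><p>Second</p></div>');
120 | $paragraphs = $dom->getElementsByTagName('p');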
121 | foreach ($paragraphs as $paragraph) {
122 | echo $paragraph->text();
123 | }
124 | ```
125 |
--------------------------------------------------------------------------------
/src/Collectors/NodesCollector.php:
--------------------------------------------------------------------------------
1 | rawNodes = $rawNodes;
43 | }
44 |
45 | /**
46 | * Report how many raw nodes this collector holds.
47 | * @return int
48 | */
49 | public function count()
50 | {
51 | $total = count($this->rawNodes);
52 | return $total;
53 | }
54 |
55 | /**
56 | * Return the element at the current position, wrapping it lazily.
57 | *
58 | * @return \HTMLDomParser\Contracts\NodeContract|null Null when the position is out of range
59 | */
60 | public function current()
61 | {
62 | // Wrap the raw node on first access and cache it for later iterations
63 | if (!isset($this->nodes[$this->currentIndex]) && isset($this->rawNodes[$this->currentIndex])) {
64 | $this->nodes[$this->currentIndex] = new Node($this->rawNodes[$this->currentIndex]);
65 | }
66 | return isset($this->nodes[$this->currentIndex]) ? $this->nodes[$this->currentIndex] : null;
67 | }
68 |
69 | /**
70 | * Advance the cursor to the following element
71 | */
72 | public function next()
73 | {
74 | ++$this->currentIndex;
75 | }
76 |
77 | /**
78 | * Report the position the cursor currently points at.
79 | * @return int
80 | */
81 | public function key()
82 | {
83 | $position = $this->currentIndex;
84 | return $position;
85 | }
86 |
87 | /**
88 | * Tell whether a raw node exists at the cursor position.
89 | * @return bool
90 | */
91 | public function valid()
92 | {
93 | $hasRawNode = isset($this->rawNodes[$this->currentIndex]);
94 | return $hasRawNode;
95 | }
96 |
97 | /**
98 | * Reset the cursor to index 0 so that iteration starts over
99 | */
100 | public function rewind()
101 | {
102 | $this->currentIndex = 0;
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/Contracts/DomContract.php:
--------------------------------------------------------------------------------
1 | load($html);
31 | } else {
32 | $this->loadObject($html);
33 | }
34 | }
35 | parent::__construct(null);
36 | }
37 |
38 | /**
39 | * Parse an HTML string and make the result this DOM's content.
40 | *
41 | * @param string $html Markup to parse
42 | */
43 | public function load($html)
44 | {
45 | $parser = parent::newSimpleDom();
46 | $parser->load($html);
47 | $this->loadObject($parser);
48 | }
49 |
50 | /**
51 | * Parse the HTML found at a file path and make the result this DOM's content.
52 | *
53 | * @param string $htmlFile Location handed to the parser's load_file()
54 | */
55 | public function loadFile($htmlFile)
56 | {
57 | $parser = parent::newSimpleDom();
58 | $parser->load_file($htmlFile);
59 | $this->loadObject($parser);
60 | }
61 |
62 | /**
63 | * Adopt a parsed simple_html_dom: its root goes to the parent loader, the parser is kept.
64 | *
65 | * @param simple_html_dom|object $dom Parser object exposing a `root` property
66 | */
67 | protected function loadObject($dom)
68 | {
69 | parent::loadObject($dom->root);
70 | $this->dom = $dom;
71 | }
72 |
73 | /**
74 | * Register a function that is invoked while dumping.
75 | *
76 | * @param callable $callback Receives each element wrapped as a Node
77 | */
78 | public function setCallback($callback)
79 | {
80 | $adapter = function ($rawElement) use ($callback) {
81 | /** @var simple_html_dom_node $rawElement */
82 | $wrapped = new Node;
83 | $wrapped->loadObject($rawElement);
84 | $callback($wrapped);
85 | };
86 | $this->dom->set_callback($adapter);
87 | }
88 |
89 | /**
90 | * Unregister the dump callback by delegating to the parser's remove_callback()
91 | */
92 | public function removeCallback()
93 | {
94 | $this->dom->remove_callback();
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/DomFactory.php:
--------------------------------------------------------------------------------
1 | load($html);
24 | return $node;
25 | }
26 |
27 | /**
28 | * Build a Dom and populate it from an HTML file
29 | *
30 | * @see Dom::loadFile()
31 | * @param string $htmlFile
32 | * @return DomContract
33 | */
34 | public static function loadFile($htmlFile)
35 | {
36 | $dom = new Dom();
37 | $dom->loadFile($htmlFile);
38 | return $dom;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/Node.php:
--------------------------------------------------------------------------------
1 | load($html);
33 | return;
34 | }
35 | $this->loadObject($html);
36 | }
37 | }
38 |
39 | /**
40 | * Parse an HTML string and wrap the root of the parsed tree.
41 | *
42 | * @param string $html Markup to parse
43 | */
44 | public function load($html)
45 | {
46 | $parser = self::newSimpleDom();
47 | $parser->load($html);
48 | $this->loadObject($parser->root);
49 | }
50 |
51 | /**
52 | * Parse the HTML found at a file path and wrap the root of the parsed tree.
53 | *
54 | * @param string $htmlFile Location handed to the parser's load_file()
55 | */
56 | public function loadFile($htmlFile)
57 | {
58 | $parser = self::newSimpleDom();
59 | $parser->load_file($htmlFile);
60 | $this->loadObject($parser->root);
61 | }
62 |
63 | /**
64 | * Store the raw node that the other methods of this wrapper delegate to
65 | *
66 | * @param simple_html_dom_node|object $node
67 | */
68 | protected function loadObject($node)
69 | {
70 | $this->node = $node;
71 | }
72 |
73 | /**
74 | * Get the node's name as reported by the wrapped node's nodeName().
75 | * @return string
76 | */
77 | public function getName()
78 | {
79 | $name = $this->node->nodeName();
80 | return $name;
81 | }
82 |
83 | /**
84 | * Get the parent node, or null when the raw parent() call yields null.
85 | * @return NodeContract|null
86 | */
87 | public function getParent()
88 | {
89 | $rawParent = $this->node->parent();
90 | return self::nullOrNode($rawParent);
91 | }
92 |
93 | /**
94 | * Set the parent node, handing the parent's raw node to the wrapped node.
95 | * @param NodeContract $parent
96 | */
97 | public function setParent($parent)
98 | {
99 | $rawParent = $parent->getSimpleNode();
100 | $this->node->parent($rawParent);
101 | }
102 |
103 | /**
104 | * Tell whether the wrapped node reports having a child (has_child()).
105 | * @return bool
106 | */
107 | public function hasChild()
108 | {
109 | $hasChild = $this->node->has_child();
110 | return $hasChild;
111 | }
112 |
113 | /**
114 | * Get child node by index
115 | *
116 | * A negative index yields null without consulting the raw node; note that
117 | * getChildren() passes -1 to the same raw call to fetch the whole list.
118 | *
119 | * @param int $idx
120 | * @return NodeContract|null
121 | */
122 | public function getChild($idx)
123 | {
124 | return $idx < 0 ? null : self::nullOrNode($this->node->children($idx));
125 | }
126 |
127 | /**
128 | * Collect every child of this node (raw call: children(-1)).
129 | * @return NodesCollectorContract|NodeContract[]
130 | */
131 | public function getChildren()
132 | {
133 | $rawChildren = $this->node->children(-1);
134 | return new NodesCollector($rawChildren);
135 | }
136 |
137 | /**
138 | * Get the first child element, or null when the raw call yields null.
139 | * @return NodeContract|null
140 | */
141 | public function getFirstChild()
142 | {
143 | $rawChild = $this->node->first_child();
144 | return self::nullOrNode($rawChild);
145 | }
146 |
147 | /**
148 | * Get the last child element, or null when the raw call yields null.
149 | * @return NodeContract|null
150 | */
151 | public function getLastChild()
152 | {
153 | $rawChild = $this->node->last_child();
154 | return self::nullOrNode($rawChild);
155 | }
156 |
157 | /**
158 | * Get the next sibling node, or null when the raw call yields null.
159 | * @return NodeContract|null
160 | */
161 | public function getNextSibling()
162 | {
163 | $rawSibling = $this->node->next_sibling();
164 | return self::nullOrNode($rawSibling);
165 | }
166 |
167 | /**
168 | * Get the previous sibling node, or null when the raw call yields null.
169 | * @return NodeContract|null
170 | */
171 | public function getPrevSibling()
172 | {
173 | $rawSibling = $this->node->prev_sibling();
174 | return self::nullOrNode($rawSibling);
175 | }
176 |
177 | /**
178 | * Traverse the ancestors and return the first one matching the tag.
179 | * @param string $tag
180 | * @return NodeContract|null
181 | */
182 | public function findAncestorTag($tag)
183 | {
184 | $rawAncestor = $this->node->find_ancestor_tag($tag);
185 | return self::nullOrNode($rawAncestor);
186 | }
187 |
188 | /**
189 | * Find all elements matching a CSS selector.
190 | * @param string $selector
191 | * @param bool $lowercase Forwarded unchanged to the raw node's find()
192 | * @return NodesCollectorContract|NodeContract[]
193 | */
194 | public function find($selector, $lowercase = false)
195 | {
196 | $rawMatches = $this->node->find($selector, null, $lowercase);
197 | return new NodesCollector($rawMatches);
198 | }
199 |
200 | /**
201 | * Find a single element by CSS selector; with several matches,
202 | * $index picks which one is returned (the first by default).
203 | * @param string $selector
204 | * @param int $index
205 | * @param bool $lowercase
206 | * @return NodeContract|null
207 | */
208 | public function findOne($selector, $index = 0, $lowercase = false)
209 | {
210 | $rawMatch = $this->node->find($selector, $index, $lowercase);
211 | return self::nullOrNode($rawMatch);
212 | }
213 |
214 | /**
215 | * Get an element by its ID
216 | * @param string $id
217 | * @return NodeContract|null
218 | */
219 | public function getElementById($id)
220 | {
221 | $rawMatch = $this->node->getElementById($id);
222 | return self::nullOrNode($rawMatch);
223 | }
224 |
225 | /**
226 | * Get a single element by tag name; $index selects among
227 | * several matching tags (the first one by default).
228 | * @param string $tag
229 | * @param int $index
230 | * @return NodeContract|null
231 | */
232 | public function getElementByTagName($tag, $index = 0)
233 | {
234 | $match = $this->findOne($tag, $index);
235 | return $match;
236 | }
237 |
238 | /**
239 | * Get every element with the given tag name (delegates to find()).
240 | * @param string $tag
241 | * @return NodesCollectorContract|NodeContract[]
242 | */
243 | public function getElementsByTagName($tag)
244 | {
245 | $matches = $this->find($tag);
246 | return $matches;
247 | }
248 |
249 | /**
250 | * Get the node's inner text (everything inside the opening and closing tags).
251 | * @return string
252 | */
253 | public function innerHtml()
254 | {
255 | $inner = $this->node->innertext();
256 | return $inner;
257 | }
258 |
259 | /**
260 | * Get the node's XML text (inner text as a CDATA section).
261 | * @return string
262 | */
263 | public function innerXml()
264 | {
265 | $xml = $this->node->xmltext();
266 | return $xml;
267 | }
268 |
269 | /**
270 | * Get the node's outer text (opening and closing tags included).
271 | * @return string
272 | */
273 | public function outerHtml()
274 | {
275 | $outer = $this->node->outertext();
276 | return $outer;
277 | }
278 |
279 | /**
280 | * Get the node's plain text (everything excluding all tags).
281 | * @return string
282 | */
283 | public function text()
284 | {
285 | $plain = $this->node->text();
286 | return $plain;
287 | }
288 |
289 | /**
290 | * Read one attribute; boolean results from the raw node become ''.
291 | *
292 | * @param string $name
293 | * @return string
294 | */
295 | public function getAttribute($name)
296 | {
297 | $raw = $this->node->getAttribute($name);
298 | return !is_bool($raw) ? $raw : '';
299 | }
300 |
301 | /**
302 | * Set an attribute on the current node
303 | *
304 | * @param string $name
305 | * @param mixed $value Forwarded unchanged to the wrapped node
306 | */
307 | public function setAttribute($name, $value)
308 | {
309 | $this->node->setAttribute($name, $value);
310 | }
311 |
312 | /**
313 | * Get every attribute of the current node.
314 | * @return array
315 | */
316 | public function getAttributes()
317 | {
318 | $attributes = $this->node->getAllAttributes();
319 | return $attributes;
320 | }
321 |
322 | /**
323 | * Check whether an attribute exists on the current node.
324 | * @param string $name
325 | * @return bool
326 | */
327 | public function hasAttribute($name)
328 | {
329 | $exists = $this->node->hasAttribute($name);
330 | return $exists;
331 | }
332 |
333 | /**
334 | * Remove an attribute from the current node by delegating to the wrapped node
335 | *
336 | * @param string $name Name of the attribute to remove
337 | */
338 | public function removeAttribute($name)
339 | {
340 | $this->node->removeAttribute($name);
341 | }
342 |
343 | /**
344 | * Append a node to the current node (its raw node is handed to the wrapped node).
345 | * @param NodeContract $node
346 | */
347 | public function appendChild($node)
348 | {
349 | $rawChild = $node->getSimpleNode();
350 | $this->node->appendChild($rawChild);
351 | }
352 |
353 | /**
354 | * Render the node as a string: the same markup outerHtml() returns.
355 | * @return string
356 | */
357 | public function __toString()
358 | {
359 | $markup = $this->outerHtml();
360 | return $markup;
361 | }
362 |
363 | /**
364 | * Render this node as HTML and, when a path is given, also write it there.
365 | * @param string $filePath Target file; '' (default) skips writing. The write result is not checked.
366 | * @return string The rendered HTML
367 | */
368 | public function save($filePath = '')
369 | {
370 | $markup = (string)$this;
371 | $wantsFile = ($filePath !== '');
372 | if ($wantsFile) {
373 | file_put_contents($filePath, $markup, LOCK_EX);
374 | }
375 | return $markup;
376 | }
377 |
378 | /**
379 | * Expose the raw simple_html_dom node wrapped by this object.
380 | * @return simple_html_dom_node
381 | */
382 | public function getSimpleNode()
383 | {
384 | $rawNode = $this->node;
385 | return $rawNode;
386 | }
387 |
388 | /**
389 | * Wrap a raw element in a Node, passing null through untouched.
390 | *
391 | * Only null short-circuits; every other value (false or an empty
392 | * array included) is handed to the Node constructor as-is, because
393 | * the check is a strict comparison against null.
394 | *
395 | * @param mixed $element Raw value obtained from the wrapped node
396 | * @return NodeContract|null
397 | */
398 | protected static function nullOrNode($element)
399 | {
400 | return $element === null ? null : new Node($element);
401 | }
402 |
403 | /**
404 | * Create a fresh simple_html_dom parser instance.
405 | * @return simple_html_dom
406 | */
407 | protected static function newSimpleDom()
408 | {
409 | $parser = new simple_html_dom();
410 | return $parser;
411 | }
412 | }
413 |
--------------------------------------------------------------------------------
/src/NodeFactory.php:
--------------------------------------------------------------------------------
1 | load($html);
24 | return $node;
25 | }
26 |
27 | /**
28 | * Build a Node and populate it from an HTML file
29 | *
30 | * @see Node::loadFile()
31 | * @param string $htmlFile
32 | * @return NodeContract
33 | */
34 | public static function loadFile($htmlFile)
35 | {
36 | $instance = new Node();
37 | $instance->loadFile($htmlFile);
38 | return $instance;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/Sources/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real"
24 | * number of bytes the dom was created from. But for most purposes, it's a
25 | * really good estimation.
26 | *
27 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
28 | * closed is great for malformed html, but it CAN lead to parsing errors.
29 | *
30 | * Allow the user to tell us how much they trust the html.
31 | *
32 | * Paperg add the text and plaintext to the selectors for the find syntax.
33 | * plaintext implies text in the innertext of a node. text implies that the
34 | * tag is a text node. This allows for us to find tags based on the text they
35 | * contain.
36 | *
37 | * Create find_ancestor_tag to see if a tag is - at any level - inside of
38 | * another specific tag.
39 | *
40 | * Paperg: added parse_charset so that we know about the character set of
41 | * the source document. NOTE: If the user's system has a routine called
42 | * get_last_retrieve_url_contents_content_type available, we will assume it's
43 | * returning the content-type header from the last transfer or curl_exec, and
44 | * we will parse that and use it in preference to any other method of charset
45 | * detection.
46 | *
47 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to
48 | * protect from that.
49 | *
50 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
51 | *
52 | * Licensed under The MIT License
53 | * Redistributions of files must retain the above copyright notice.
54 | *
55 | * @author S.C. Chen
56 | * @author John Schlick
57 | * @author Rus Carroll
58 | * @version Rev. 1.8.1 (247)
59 | * @package PlaceLocalInclude
60 | * @subpackage simple_html_dom
61 | */
62 |
63 | /**
64 | * All of the Defines for the classes below.
65 | * @author S.C. Chen
66 | */
67 | define('HDOM_TYPE_ELEMENT', 1); // HDOM_TYPE_*: node type codes, as stored in simple_html_dom_node::$nodetype
68 | define('HDOM_TYPE_COMMENT', 2);
69 | define('HDOM_TYPE_TEXT', 3);
70 | define('HDOM_TYPE_ENDTAG', 4);
71 | define('HDOM_TYPE_ROOT', 5);
72 | define('HDOM_TYPE_UNKNOWN', 6);
73 | define('HDOM_QUOTE_DOUBLE', 0); // HDOM_QUOTE_*: presumably attribute quoting styles - TODO confirm against the parser
74 | define('HDOM_QUOTE_SINGLE', 1);
75 | define('HDOM_QUOTE_NO', 3);
76 | define('HDOM_INFO_BEGIN', 0); // HDOM_INFO_*: usage not visible in this excerpt - presumably per-node bookkeeping keys
77 | define('HDOM_INFO_END', 1);
78 | define('HDOM_INFO_QUOTE', 2);
79 | define('HDOM_INFO_SPACE', 3);
80 | define('HDOM_INFO_TEXT', 4);
81 | define('HDOM_INFO_INNER', 5);
82 | define('HDOM_INFO_OUTER', 6);
83 | define('HDOM_INFO_ENDSPACE', 7);
84 |
85 | /** The default target charset */
86 | defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); // only defined if not already set elsewhere
87 |
88 | /** The default <br> text used instead of <br> tags when returning text */
89 | defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
90 |
91 | /** The default <span> text used instead of <span> tags when returning text */
92 | defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
93 |
94 | /** The maximum file size the parser should load */
95 | defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
96 |
97 | /** Contents between curly braces "{" and "}" are interpreted as text */
98 | define('HDOM_SMARTY_AS_TEXT', 1);
99 |
100 | /**
101 | * simple html dom node
102 | * PaperG - added ability for "find" routine to lowercase the value of the
103 | * selector.
104 | *
105 | * PaperG - added $tag_start to track the start position of the tag in the total
106 | * byte index
107 | *
108 | * @package PlaceLocalInclude
109 | */
110 | class simple_html_dom_node
111 | {
112 | /**
113 | * Node type
114 | *
115 | * Default is {@see HDOM_TYPE_TEXT}
116 | *
117 | * @var int
118 | */
119 | public $nodetype = HDOM_TYPE_TEXT;
120 |
121 | /**
122 | * Tag name
123 | *
124 | * Default is 'text'
125 | *
126 | * @var string
127 | */
128 | public $tag = 'text';
129 |
130 | /**
131 | * List of attributes
132 | *
133 | * @var array
134 | */
135 | public $attr = array();
136 |
137 | /**
138 | * List of child node objects
139 | *
140 | * @var array
141 | */
142 | public $children = array();
143 | public $nodes = array();
144 |
145 | /**
146 | * The parent node object
147 | *
148 | * @var object|null
149 | */
150 | public $parent = null;
151 |
152 | // The "info" array - see HDOM_INFO_... for what each element contains.
153 | public $_ = array();
154 |
155 | /**
156 | * Start position of the tag in the document
157 | *
158 | * @var int
159 | */
160 | public $tag_start = 0;
161 |
162 | /**
163 | * The DOM object
164 | *
165 | * @var object|null
166 | */
167 | private $dom = null;
168 |
/**
 * Create a node and register it with its owning DOM.
 *
 * The new node is appended to the DOM's list of nodes
 * {@see simple_html_dom::$nodes}.
 *
 * @param object $dom The DOM object this node belongs to
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
179 |
/**
 * Release the references held by this node when it is destroyed.
 *
 * @see simple_html_dom_node::clear()
 */
function __destruct()
{
    $this->clear();
}
184 |
/**
 * Render the node as a string.
 *
 * @return string The node's outer text, {@see simple_html_dom_node::outertext()}
 */
function __toString()
{
    return $this->outertext();
}
189 |
/**
 * Drop every object reference held by this node.
 *
 * Works around the PHP 5 memory leak caused by circular references between
 * nodes and the DOM. Afterwards `$nodes` and `$children` are `null`, not
 * empty arrays.
 *
 * @return void
 */
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
198 |
/**
 * Print the node's tree, one node per line.
 *
 * Each line is the tag name prefixed by one space per nesting level and,
 * optionally, followed by the list of attributes.
 *
 * @param bool $show_attr Print the attributes of each node (default: `true`)
 * @param int $deep Nesting level of this node, used for the line prefix
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach (array_keys($this->attr) as $name) {
            // Values are read through __get() so they are charset-converted
            $line .= '[' . $name . ']=>"' . $this->$name . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes) {
        return;
    }

    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
222 |
223 |
/**
 * Debugging helper: describe this single node with a bunch of information.
 *
 * The description contains the tag, the attributes, the "info" array
 * {@see simple_html_dom_node::$_}, the "text" attribute (if any), the stored
 * inner text and the number of children and nodes.
 *
 * @param bool $echo Print the description (default: `true`) instead of
 * returning it
 * @return string|void The description if $echo is false, nothing otherwise
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ')';
    }

    // "text" is not a declared property, this goes through __isset()/__get()
    if (isset($this->text)) {
        $string .= ' text: (' . $this->text . ')';
    }

    $string .= " HDOM_INNER_INFO: '";

    // Read the info array of THIS node. The previous code used an undefined
    // variable ($node), so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= ' children: ' . count($this->children);
    $string .= ' nodes: ' . count($this->nodes);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    } else {
        return $string;
    }
}
277 |
/**
 * Return or set the parent node
 *
 * When a parent is given, this node is appended to the new parent's `nodes`
 * and `children` lists.
 *
 * Known limitation: the node is NOT removed from the `nodes`/`children`
 * lists of its previous parent.
 *
 * @param object|null $parent (optional) The new parent node, `null` to only
 * return the current parent node.
 * @return object|null The parent node
 */
function parent($parent = null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $parent->nodes[] = $this;
    $parent->children[] = $this;
    $this->parent = $parent;

    return $parent;
}
298 |
/**
 * Check whether the node has child nodes
 *
 * @return bool True if the node has at least one child node
 */
function has_child()
{
    return (bool)$this->children;
}
306 |
/**
 * Get one child node by index, or all child nodes
 *
 * @param int $idx The index of the child node to return, `-1` (default) to
 * return all child nodes.
 * @return object|array|null The child node at the specified index, the list
 * of all child nodes, or null if there is no child at that index.
 */
function children($idx = -1)
{
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
327 |
/**
 * Get first child node
 *
 * Uses `empty()` rather than `count()`, so a node whose child list was set
 * to `null` by {@see simple_html_dom_node::clear()} yields null instead of
 * raising a warning (PHP 7.2+) or a TypeError (PHP 8).
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes.
 */
function first_child()
{
    if (empty($this->children)) {
        return null;
    }

    return $this->children[0];
}
344 |
/**
 * Get last child node
 *
 * The emptiness check comes first, so a node whose child list was set to
 * `null` by {@see simple_html_dom_node::clear()} yields null instead of
 * raising a warning (PHP 7.2+) or a TypeError (PHP 8) in `count()`.
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes.
 */
function last_child()
{
    if (empty($this->children)) {
        return null;
    }

    return $this->children[count($this->children) - 1];
}
360 |
/**
 * Get next sibling node
 *
 * Siblings are looked up in the parent's list of child nodes.
 *
 * @return object|null The node following this one in the parent's children,
 * or null if there is no parent, this node is the last child, or this node
 * is not part of the parent's children.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if ($position === false || !isset($siblings[$position + 1])) {
        return null;
    }

    return $siblings[$position + 1];
}
386 |
/**
 * Get previous sibling node
 *
 * Siblings are looked up in the parent's list of child nodes.
 *
 * A node that is not part of its parent's children (i.e. a text node, which
 * only lives in `nodes`) has no previous sibling. The former implementation
 * returned the parent's last child in that case.
 *
 * @return object|null The node preceding this one in the parent's children,
 * or null if there is no parent, this node is the first child, or this node
 * is not part of the parent's children.
 */
function prev_sibling()
{
    if ($this->parent === null) { return null; }

    $siblings = $this->parent->children;

    // Strict search: compare object identity, never object contents
    $position = array_search($this, $siblings, true);

    if ($position === false || $position < 1) { return null; }

    return $siblings[$position - 1];
}
408 |
/**
 * Walk up the tree to the first node with a matching tag.
 *
 * The search starts with the current node itself and follows the `parent`
 * property until a node matches or no parent is left.
 *
 * @param string $tag Tag to find
 * @return object|null First matching node (possibly this node) or null if
 * no match was found.
 *
 * @todo The walk ends because ->parent of the root node is null. If that
 * ever changes, this function breaks.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;

    $debugging = is_object($debug_object);
    if ($debugging) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; $candidate !== null; $candidate = $candidate->parent) {
        if ($debugging) {
            $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
441 |
/**
 * Get node's inner text (everything inside the opening and closing tags)
 *
 * Order of precedence: an explicitly stored inner text, the node's own text
 * (with noise restored by the DOM), then the concatenated outer text of all
 * nodes below this one.
 *
 * @return string
 */
function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }

    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $html = '';

    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }

    return $html;
}
465 |
/**
 * Get node's outer text (everything including the opening and closing tags)
 *
 * - The root node returns its inner text.
 * - The DOM callback (if any) is invoked with this node before rendering.
 * - An explicitly stored outer text or the node's own text take precedence
 *   over rendering the begin tag, the contents and the end tag.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;

    if (is_object($debug_object)) {
        $text = '';

        if ($this->tag === 'text') {
            if (!empty($this->text)) {
                $text = ' with text: ' . $this->text;
            }
        }

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag (isset() avoids notices for nodes without a begin
    // index or with an index unknown to the DOM)
    $ret = '';

    if ($this->dom
        && isset($this->_[HDOM_INFO_BEGIN])
        && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER])) {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we
        // may or may not have added.
        if ($this->tag !== 'br') {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag: "</tag>". The "</" prefix was missing, which produced
    // broken markup such as "<div>text" followed by "div>".
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $ret .= '</' . $this->tag . '>';
    }

    return $ret;
}
531 |
/**
 * Get node's plain text (everything excluding all tags)
 *
 * - An explicitly stored inner text is returned as is.
 * - Text nodes return their text with noise restored by the DOM.
 * - Comments, unknown nodes, script and style elements return ''.
 * - For all other nodes the plain text of the nodes below is concatenated;
 *   a blank line is inserted before each paragraph, the DOM's default span
 *   text after each span, and the result is trimmed.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;

    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0
        || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // In rare cases (observed for some span and p elements) $this->nodes is
    // NULL; the reason is not known. Such nodes have no plain text.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';

    foreach ($this->nodes as $child) {
        // Start paragraph after a blank line
        if ($child->tag === 'p') {
            $plain .= "\n\n";
        }

        $plain .= $this->convert_text($child->text());

        // Separate consecutive spans so their texts don't run into each
        // other. This is plaintext after all.
        if ($child->tag === 'span') {
            $plain .= $this->dom->default_span_text;
        }
    }

    return trim($plain);
}
578 |
/**
 * Get node's xml text (inner text with CDATA markers removed)
 *
 * Removes every CDATA opening marker (matched case-insensitively) and every
 * CDATA closing marker from the inner text; the enclosed contents are kept.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
591 |
/**
 * Build the node's opening tag, including all attributes
 *
 * Text, comment and unknown nodes return their stored text instead (with
 * noise restored by the DOM).
 *
 * Whitespace and quoting of each attribute are taken from the "info" array
 * (HDOM_INFO_SPACE, HDOM_INFO_QUOTE), addressed by the position of the
 * attribute in the attribute list.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<' . $this->tag;
    $next = 0;

    foreach ($this->attr as $name => $value) {
        // Removed attributes still occupy a position in the info arrays
        $i = $next++;

        // skip removed attribute
        if ($value === null || $value === false) { continue; }

        $html .= $this->_[HDOM_INFO_SPACE][$i][0];

        // no value attr: nowrap, checked selected...
        if ($value === true) {
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$i];

        if ($style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $html .= $name
            . $this->_[HDOM_INFO_SPACE][$i][1]
            . '='
            . $this->_[HDOM_INFO_SPACE][$i][2]
            . $quote
            . $value
            . $quote;
    }

    return $this->dom->restore_noise($html) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
635 |
/**
 * Find elements by CSS selector
 *
 * The selector may be a list of selectors separated by commas. Each selector
 * is resolved iteratively, one compound selector after the other, starting
 * at the current node. The matches of all selectors are merged and returned
 * in the order of their index in the DOM's list of nodes.
 *
 * @param string $selector The CSS selector
 * @param int|null $idx Index of element to return from the list of matching
 * elements (default: `null` = return all). Negative values count from the
 * end of the list.
 * @param bool $lowercase Passed on to
 * {@see simple_html_dom_node::seek()} (default: `false`)
 * @return array|object|null A list of elements matching the specified CSS
 * selector, or a single element if $idx is specified, or null if there is
 * no element at $idx.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $selectors = $this->parse_selector($selector);
    if (count($selectors) === 0) { return array(); }

    $found_keys = array();

    // find each selector
    foreach ($selectors as $chain) {
        // See sourceforge code tracker id 2788009: every selector of the
        // list has to be checked, not only the first one.
        if (count($chain) === 0) { return array(); }
        if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

        $head = array($this->_[HDOM_INFO_BEGIN] => 1);
        $cmd = ' '; // Combinator

        // handle descendant selectors, no recursive!
        foreach ($chain as $step) {
            $ret = array();

            foreach ($head as $k => $v) {
                // Index -1 denotes the root node
                $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                // PaperG - Pass this optional parameter on to the seek function.
                $n->seek($step, $ret, $cmd, $lowercase);
            }

            $head = $ret;
            $cmd = $step[4]; // Next Combinator
        }

        // Array union keeps keys found earlier and adds the new ones
        $found_keys += $head;
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $k => $v) {
        $found[] = $this->dom->nodes[$k];
    }

    // return nth-element or array
    if ($idx === null) { return $found; }

    if ($idx < 0) { $idx += count($found); }

    return isset($found[$idx]) ? $found[$idx] : null;
}
699 |
/**
 * Seek DOM elements by selector
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * The candidate nodes depend on the combinator ($parent_cmd):
 *
 * | Combinator | Candidates
 * | ---------- | ----------
 * | `' '`      | All DOM nodes between this node's begin and end index
 * | `'>'`      | The child nodes of this node
 * | `'+'`      | The sibling directly following this node (if any)
 * | `'~'`      | All siblings following this node (excluding this node)
 *
 * Changes compared to the former implementation:
 * - Siblings are located by object identity (strict search). Loose
 *   comparison of nodes compares their (circular) properties.
 * - `'+'` on the last child yields no candidate instead of reading past the
 *   end of the list.
 * - `'~'` no longer treats the node itself as its own sibling.
 * - A missing parent in the descendant branch is no longer dereferenced.
 * - Class names are compared strictly (as strings).
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches (passed by reference). The index of
 * each matching node is added as key, with value 1.
 * @param string $parent_cmd The combinator relating the matches to this node
 * @param bool $lowercase If enabled, the node's class names and both sides
 * of attribute value comparisons are lowercased before comparing (default:
 * `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // The fifth element (next combinator) is handled by the caller
    list($tag, $id, $class, $attributes) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
        $siblings = $this->parent->children;
        $index = array_search($this, $siblings, true);

        if ($index !== false) {
            if ($parent_cmd === '+') { // Next-Sibling Combinator
                if (isset($siblings[$index + 1])) {
                    $nodes[] = $siblings[$index + 1];
                }
            } else { // Subsequent Sibling Combinator
                $nodes = array_slice($siblings, $index + 1);
            }
        }
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        // Skip root nodes
        if (!$node->parent) { continue; }

        // Skip if node isn't a child node (i.e. text nodes)
        if (!in_array($node, $node->parent->children, true)) { continue; }

        // Skip if tag doesn't match
        if ($tag !== '' && $tag !== '*' && $tag !== $node->tag) { continue; }

        // Check ID
        if ($id !== '') {
            if (!isset($node->attr['id'])) { continue; }

            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) { continue; }
        }

        // Check if all class(es) exist
        if (is_array($class) && !empty($class)) {
            if (!isset($node->attr['class'])) { continue; }

            $node_classes = explode(' ', $node->attr['class']);

            if ($lowercase) {
                $node_classes = array_map('strtolower', $node_classes);
            }

            foreach ($class as $c) {
                if (!in_array($c, $node_classes, true)) { continue 2; }
            }
        }

        // Check attributes
        if (is_array($attributes) && !empty($attributes)) {
            foreach ($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within it's parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent (counting
                    // only elements with the same tag)
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) ++$count;
                        if ($c === $node) break;
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) continue;
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) { continue 2; }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        continue 2;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') continue;

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) { continue 2; }
            }
        }

        // Found a match. Add to list
        $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
    }

    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
926 |
/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=`        | $value and $pattern must be equal
 * | `!=`       | $value and $pattern must not be equal
 * | `^=`       | $value must start with $pattern
 * | `$=`       | $value must end with $pattern
 * | `*=`       | $value must contain $pattern
 * | `|=`       | $value must be equal to $pattern or start with $pattern
 * |            | immediately followed by "-"
 * | `~=`       | $value is a whitespace-separated list of words, one of
 * |            | which must be equal to $pattern
 *
 * Any other expression never matches.
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' to compare case-insensitively (both
 * $pattern and $value are lowercased), anything else compares as is.
 * @return bool True if $value matches $pattern
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             *
             * A plain prefix test is not sufficient: "en" must match "en"
             * and "en-US" but not "english".
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            if ($pattern === '') {
                return false;
            }

            // Split on any run of whitespace (spaces, tabs, line breaks)
            $words = preg_split('/\s+/', trim($value));

            return is_array($words) && in_array($pattern, $words, true);
    }
    return false;
}
989 |
/**
 * Parse CSS selector
 *
 * @param string $selector_string CSS selector string
 * @return array List of CSS selectors, structured as follows:
 *
 * ```php
 *
 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
 *   array( // list of combinator selectors, i.e. 'img > p > div'
 *     array( // selector element
 *       [0], // (string) The element tag
 *       [1], // (string) The element id
 *       [2], // (array|string) The element classes ('' if none)
 *       [3], // (array|string) The list of attributes ('' if none), each
 *            // an array with five elements: name, expression, value,
 *            // inverted flag, case sensitivity
 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~'),
 *            // '' for the last element of a selector followed by a comma
 *     )
 *   )
 * )
 * ```
 *
 * @link https://www.w3.org/TR/selectors/#compound Compound selector
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Pattern of CSS selectors, modified from mootools (https://mootools.net/)
    //
    // Paperg: Add the colon to the attribute, so that it properly finds
    // like google does.
    //
    // Note: if you try to look at this attribute, you MUST use getAttribute
    // since $dom->x:y will fail the php syntax check.
    //
    // An attribute name may be prefixed by an "@" which is not captured.
    // Further study is required to determine if this should be documented
    // or removed.
    //
    // Capturing groups:
    //   [0] - full match
    //   [1] - tag name: zero or more word characters, colons, asterisks
    //         and hyphens
    //   [2] - id name (optional): "#" followed by one or more word
    //         characters and hyphens
    //   [3] - class names (optional): "." followed by the class names, where
    //         multiple classes are chained with dots (i.e. ".foo.bar.baz")
    //   [4] - attributes (optional): one or more "[...]" groups
    //   [5] - separator: one or more of "/", ",", " ", ">", "+", "~"
    //
    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    // Add final ' ' as pseudo separator
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $matches);
    }

    $selectors = array();
    $chain = array();

    foreach ($matches as $m) {
        $full_match = trim($m[0]);

        // Skip NoOps
        if ($full_match === '' || $full_match === '/' || $full_match === '//') {
            continue;
        }

        // Tag name, converted to lowercase if the DOM is lowercase
        $tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];

        // Extract classes
        $classes = ($m[3] !== '') ? explode('.', $m[3]) : $m[3];

        // Extract attributes (pattern based on the pattern above!)
        //
        //   [0] - full match
        //   [1] - attribute name
        //   [2] - attribute expression
        //   [3] - attribute value
        //   [4] - case sensitivity
        //
        // Note: Attributes can be negated with a "!" prefix to their name
        $attribute_list = $m[4];

        if ($attribute_list !== '') {
            preg_match_all(
                "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
                trim($m[4]),
                $attributes,
                PREG_SET_ORDER
            );

            $attribute_list = array();

            foreach ($attributes as $att) {
                // Skip empty matches
                if (trim($att[0]) === '') { continue; }

                $inverted = (isset($att[1][0]) && $att[1][0] === '!');

                // Trailing groups are missing from $att if they didn't match
                $attribute_list[] = array(
                    $inverted ? substr($att[1], 1) : $att[1], // Name
                    isset($att[2]) ? $att[2] : '', // Expression
                    isset($att[3]) ? $att[3] : '', // Value
                    $inverted, // Inverted Flag
                    isset($att[4]) ? strtolower($att[4]) : '', // Case-Sensitivity
                );
            }
        }

        // Sanitize Separator: whitespace only means "descendant"
        $separator = trim($m[5]);
        if ($separator === '' && $m[5] !== '') {
            $separator = ' ';
        }

        // Clear Separator if it's a Selector List
        $is_list = ($separator === ',');
        if ($is_list) { $separator = ''; }

        $chain[] = array($tag, $m[2], $classes, $attribute_list, $separator);

        if ($is_list) { // Selector List
            $selectors[] = $chain;
            $chain = array();
        }
    }

    if (count($chain) > 0) { $selectors[] = $chain; }
    return $selectors;
}
1153 |
/**
 * Read an attribute or one of the virtual text properties
 *
 * An attribute with a non-null value takes precedence and is returned
 * charset-converted. Otherwise `outertext`, `innertext`, `plaintext` and
 * `xmltext` return the corresponding text.
 *
 * @param string $name Name of the attribute or virtual property
 * @return mixed The attribute value or text; for any other name a bool
 * telling whether an attribute of that name exists.
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name === 'outertext') { return $this->outertext(); }
    if ($name === 'innertext') { return $this->innertext(); }
    if ($name === 'plaintext') { return $this->text(); }
    if ($name === 'xmltext') { return $this->xmltext(); }

    return array_key_exists($name, $this->attr);
}
1167 |
/**
 * Write an attribute or one of the virtual text properties
 *
 * - `outertext` stores the value as the node's outer text.
 * - `innertext` replaces the node's own text if it has one, otherwise the
 *   value is stored as the node's inner text.
 * - Any other name sets the attribute of that name. Attributes without a
 *   current value get default spacing (single leading space) and double
 *   quotes.
 *
 * @param string $name Name of the attribute or virtual property
 * @param mixed $value The new value
 * @return mixed The assigned value for `outertext` and `innertext`, nothing
 * otherwise
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    if ($name === 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name === 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
1189 |
/**
 * Check whether an attribute or virtual text property exists
 *
 * `outertext`, `innertext` and `plaintext` always exist. Attributes exist
 * as soon as their name is part of the attribute list, regardless of their
 * value (no value attr: nowrap, checked, selected...).
 *
 * @param string $name Name of the attribute or virtual property
 * @return bool
 */
function __isset($name)
{
    if (in_array($name, array('outertext', 'innertext', 'plaintext'), true)) {
        return true;
    }

    return array_key_exists($name, $this->attr);
}
1200 |
/**
 * Magic unset handler: drops the attribute $name from the attribute list.
 *
 * Only attributes that are set (non-null) are removed; anything else is left
 * untouched.
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1205 |
/**
 * Converts text from the document's charset to the target charset.
 *
 * (PaperG) The charsets are read from the owning DOM
 * ($this->dom->_charset and $this->dom->_target_charset). No conversion
 * takes place when either charset is empty, when both are equal
 * (case-insensitive), or when the target is UTF-8 and the text already
 * passes is_utf8(). For a UTF-8 target, a byte order mark at the start or
 * the end of the result is stripped.
 *
 * @param string $text Text to convert
 * @return string The converted text. When iconv() cannot perform the
 * conversion the text is returned unconverted instead of `false`.
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(3,
            'source charset: '
            . $sourceCharset
            . ' target charaset: '
            . $targetCharset
        );
    }

    if (!empty($sourceCharset)
        && !empty($targetCharset)
        && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // The reported encoding may be wrong: text that already is valid
        // UTF-8 must not be converted to UTF-8 a second time.
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
            && $this->is_utf8($text);

        if (!$alreadyUtf8) {
            $attempt = iconv($sourceCharset, $targetCharset, $text);

            // iconv() returns false on failure (unknown charset, illegal
            // byte sequence). Keep the unconverted text in that case rather
            // than handing `false` to the caller as if it were text.
            if ($attempt !== false) {
                $converted_text = $attempt;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset === 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }

        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
1257 |
/**
 * Returns true if $str is valid UTF-8 and false otherwise.
 *
 * Validation is delegated to PCRE: matching the empty pattern with the `u`
 * modifier only succeeds when the whole subject is valid UTF-8. Unlike a
 * hand-rolled lead-byte check this also rejects a lone continuation byte
 * (0x80), overlong encodings and the obsolete 5- and 6-byte sequences.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    // preg_match() yields 1 for a valid subject (the empty pattern always
    // matches) and false when the subject is not valid UTF-8.
    return preg_match('//u', $str) === 1;
}
1290 |
/**
 * Function to try a few tricks to determine the displayed size of an img on
 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
 * other tag types.
 *
 * The `width`/`height` attributes of the tag are used when they are set.
 * A dimension that is still unknown afterwards is taken from the inline
 * `style` attribute, provided it is an integer pixel value (e.g. "120px").
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the
 * image on the page (-1 for a dimension we can't figure out), or false when
 * this node is not an img tag.
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    // -1 marks a dimension that could not be determined. 'height' comes
    // first so the returned array keeps its established key order.
    $size = array('height' => -1, 'width' => -1);

    // See if there is a height or width attribute in the tag itself.
    foreach (array('width', 'height') as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group is greedy and swallows whitespace in front of
            // the ';', which would defeat the 'px' suffix check below.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach (array('width', 'height') as $dimension) {
            // The style only fills in what the tag attributes left open.
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }

            $value = $declarations[$dimension];

            // check that the last two characters are px (pixels)
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            $pixels = trim(substr($value, 0, -2));

            // Make sure it's an integer. Compare against false explicitly:
            // FILTER_VALIDATE_INT returns int 0 for "0", which is falsy but
            // still a valid size.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has
    // a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a
    // class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector
    // for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on
    // the page, go get it and do what we were just doing for the ones on
    // the page.

    return $size;
}
1385 |
1386 | // camel naming conventions
/**
 * Returns the attribute list of this node.
 *
 * @return array The node's attribute array ($this->attr)
 */
function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1391 |
/**
 * Camel-case alias for reading a property: forwards to __get().
 *
 * @param string $name Attribute or virtual property name
 * @return mixed Whatever __get() returns for $name
 */
function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
1396 |
/**
 * Camel-case alias for writing a property: forwards to __set().
 *
 * The return value of __set() is discarded.
 *
 * @param string $name Attribute or virtual property name
 * @param mixed $value New value
 * @return void
 */
function setAttribute($name, $value)
{
    $this->__set(
        $name,
        $value
    );
}
1401 |
/**
 * Camel-case alias for the isset check: forwards to __isset().
 *
 * @param string $name Attribute or virtual property name
 * @return bool Whatever __isset() reports for $name
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);

    return $exists;
}
1406 |
/**
 * Removes an attribute from this node.
 *
 * The attribute's value is replaced with null, which is how this method has
 * always marked an attribute as removed; the key itself stays in the
 * attribute list. Removing an attribute that is not set is a no-op, so it no
 * longer creates a phantom null-valued key (which made hasAttribute() report
 * true afterwards) or appends stray spacing/quote entries via __set().
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
    // Assign directly instead of going through __set(): that method would
    // redirect the names 'outertext' and 'innertext' to the node's text
    // and register a brand-new attribute when none is set.
    if (isset($this->attr[$name])) {
        $this->attr[$name] = null;
    }
}
1411 |
/**
 * Looks up the first element matching the id selector "#<id>".
 *
 * Forwards to find() with index 0.
 *
 * @param string $id Element id (without the leading '#')
 * @return mixed Whatever find() returns for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
1416 |
/**
 * Looks up elements matching the id selector "#<id>".
 *
 * Forwards to find(), passing $idx through unchanged.
 *
 * @param string $id Element id (without the leading '#')
 * @param int|null $idx Index handed to find(); null by default
 * @return mixed Whatever find() returns
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
1421 |
/**
 * Looks up the first element matching $name used as a selector.
 *
 * Forwards to find() with index 0.
 *
 * @param string $name Tag name, passed to find() as the selector
 * @return mixed Whatever find() returns for index 0
 */
function getElementByTagName($name)
{
    $firstMatch = $this->find($name, 0);

    return $firstMatch;
}
1426 |
/**
 * Looks up elements matching $name used as a selector.
 *
 * Forwards to find(), passing $idx through unchanged.
 *
 * @param string $name Tag name, passed to find() as the selector
 * @param int|null $idx Index handed to find(); null by default
 * @return mixed Whatever find() returns
 */
function getElementsByTagName($name, $idx = null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
1431 |
/**
 * Camel-case alias of parent(), called without arguments.
 *
 * @return mixed Whatever parent() returns
 */
function parentNode()
{
    $parent = $this->parent();

    return $parent;
}
1436 |
/**
 * Camel-case alias of children().
 *
 * @param int $idx Index handed to children(); -1 by default
 * @return mixed Whatever children() returns for $idx
 */
function childNodes($idx = -1)
{
    $children = $this->children($idx);

    return $children;
}
1441 |
/**
 * Camel-case alias of first_child().
 *
 * @return mixed Whatever first_child() returns
 */
function firstChild()
{
    $child = $this->first_child();

    return $child;
}
1446 |
/**
 * Camel-case alias of last_child().
 *
 * @return mixed Whatever last_child() returns
 */
function lastChild()
{
    $child = $this->last_child();

    return $child;
}
1451 |
/**
 * Camel-case alias of next_sibling().
 *
 * @return mixed Whatever next_sibling() returns
 */
function nextSibling()
{
    $sibling = $this->next_sibling();

    return $sibling;
}
1456 |
/**
 * Camel-case alias of prev_sibling().
 *
 * @return mixed Whatever prev_sibling() returns
 */
function previousSibling()
{
    $sibling = $this->prev_sibling();

    return $sibling;
}
1461 |
/**
 * Camel-case alias of has_child().
 *
 * @return mixed Whatever has_child() returns
 */
function hasChildNodes()
{
    $hasChildren = $this->has_child();

    return $hasChildren;
}
1466 |
/**
 * Returns the tag name of this node.
 *
 * @return mixed The value of $this->tag
 */
function nodeName()
{
    $tagName = $this->tag;

    return $tagName;
}
1471 |
/**
 * Makes this node the parent of $node.
 *
 * Calls $node->parent($this) and hands the same node back; the result of
 * parent() is discarded. NOTE(review): presumably parent() also registers
 * $node among this node's children - confirm against parent().
 *
 * @param object $node Node to attach
 * @return object The node that was passed in
 */
function appendChild($node)
{
    $newParent = $this;
    $node->parent($newParent);

    return $node;
}
1477 |
1478 | }
1479 |
1480 | /**
1481 | * simple html dom parser
1482 | *
1483 | * Paperg - in the find routine: allow us to specify that we want case
1484 | * insensitive testing of the value of the selector.
1485 | *
1486 | * Paperg - change $size from protected to public so we can easily access it
1487 | *
1488 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1489 | * trust the html or not. Default is to NOT trust it.
1490 | *
1491 | * @package PlaceLocalInclude
1492 | */
1493 | class simple_html_dom
1494 | {
1495 | /**
1496 | * The root node of the document
1497 | *
1498 | * @var object
1499 | */
1500 | public $root = null;
1501 |
1502 | /**
1503 | * List of nodes in the current DOM
1504 | *
1505 | * @var array
1506 | */
1507 | public $nodes = array();
1508 |
1509 | /**
1510 | * Callback function to run for each element in the DOM.
1511 | *
1512 | * @var callable|null
1513 | */
1514 | public $callback = null;
1515 |
1516 | /**
1517 | * Indicates how tags and attributes are matched
1518 | *
1519 | * @var bool When set to **true** tags and attributes will be converted to
1520 | * lowercase before matching.
1521 | */
1522 | public $lowercase = false;
1523 |
1524 | /**
1525 | * Original document size
1526 | *
1527 | * Holds the original document size.
1528 | *
1529 | * @var int
1530 | */
1531 | public $original_size;
1532 |
1533 | /**
1534 | * Current document size
1535 | *
1536 | * Holds the current document size. The document size is determined by the
1537 | * string length of ({@see simple_html_dom::$doc}).
1538 | *
1539 | * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1540 | *
1541 | * @var int
1542 | * */
1543 | public $size;
1544 |
1545 | /**
1546 | * Current position in the document
1547 | *
1548 | * @var int
1549 | */
1550 | protected $pos;
1551 |
1552 | /**
1553 | * The document
1554 | *
1555 | * @var string
1556 | */
1557 | protected $doc;
1558 |
1559 | /**
1560 | * Current character
1561 | *
1562 | * Holds the current character at position {@see simple_html_dom::$pos} in
1563 | * the document {@see simple_html_dom::$doc}
1564 | *
1565 | * _Note_: Using this variable is more efficient than calling
1566 | * `substr($doc, $pos, 1)`
1567 | *
1568 | * @var string
1569 | */
1570 | protected $char;
1571 |
1572 | protected $cursor;
1573 |
1574 | /**
1575 | * Parent node of the next node detected by the parser
1576 | *
1577 | * @var object
1578 | */
1579 | protected $parent;
1580 | protected $noise = array();
1581 |
1582 | /**
1583 | * Tokens considered blank in HTML
1584 | *
1585 | * @var string
1586 | */
1587 | protected $token_blank = " \t\r\n";
1588 |
1589 | /**
1590 | * Tokens to identify the equal sign for attributes, stopping either at the
1591 | * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1592 | * "<html>")
1593 | *
1594 | * @var string
1595 | */
1596 | protected $token_equal = ' =/>';
1597 |
1598 | /**
1599 | * Tokens to identify the end of a tag name. A tag name either ends on the
1600 | * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1601 | *
1602 | * @var string
1603 | */
1604 | protected $token_slash = " />\r\n\t";
1605 |
1606 | /**
1607 | * Tokens to identify the end of an attribute
1608 | *
1609 | * @var string
1610 | */
1611 | protected $token_attr = ' >';
1612 |
1613 | // Note that this is referenced by a child node, and so it needs to be
1614 | // public for that node to see this information.
1615 | public $_charset = '';
1616 | public $_target_charset = '';
1617 |
1618 | /**
1619 | * Innertext for <br> elements
1620 | *
1621 | * @var string
1622 | */
1623 | protected $default_br_text = '';
1624 |
1625 | /**
1626 | * Suffix for <span> elements
1627 | *
1628 | * @var string
1629 | */
1630 | public $default_span_text = '';
1631 |
1632 | /**
1633 | * Defines a list of self-closing tags (Void elements) according to the HTML
1634 | * Specification
1635 | *
1636 | * _Remarks_:
1637 | * - Use `isset()` instead of `in_array()` on array elements to boost
1638 | * performance about 30%
1639 | * - Sort elements by name for better readability!
1640 | *
1641 | * @link https://www.w3.org/TR/html HTML Specification
1642 | * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1643 | */
1644 | protected $self_closing_tags = array(
1645 | 'area' => 1,
1646 | 'base' => 1,
1647 | 'br' => 1,
1648 | 'col' => 1,
1649 | 'embed' => 1,
1650 | 'hr' => 1,
1651 | 'img' => 1,
1652 | 'input' => 1,
1653 | 'link' => 1,
1654 | 'meta' => 1,
1655 | 'param' => 1,
1656 | 'source' => 1,
1657 | 'track' => 1,
1658 | 'wbr' => 1
1659 | );
1660 |
1661 | /**
1662 | * Defines a list of tags which - if closed - close all optional closing
1663 | * elements within if they haven't been closed yet. (So, an element where
1664 | * neither opening nor closing tag is omissible consistently closes every
1665 | * optional closing element within)
1666 | *
1667 | * _Remarks_:
1668 | * - Use `isset()` instead of `in_array()` on array elements to boost
1669 | * performance about 30%
1670 | * - Sort elements by name for better readability!
1671 | */
1672 | protected $block_tags = array(
1673 | 'body' => 1,
1674 | 'div' => 1,
1675 | 'form' => 1,
1676 | 'root' => 1,
1677 | 'span' => 1,
1678 | 'table' => 1
1679 | );
1680 |
1681 | /**
1682 | * Defines elements whose end tag is omissible.
1683 | *
1684 | * * key = Name of an element whose end tag is omissible.
1685 | * * value = Names of elements whose end tag is omissible, that are closed
1686 | * by the current element.
1687 | *
1688 | * _Remarks_:
1689 | * - Use `isset()` instead of `in_array()` on array elements to boost
1690 | * performance about 30%
1691 | * - Sort elements by name for better readability!
1692 | *
1693 | * **Example**
1694 | *
1695 | * An `li` element’s end tag may be omitted if the `li` element is immediately
1696 | * followed by another `li` element. To do that, add following element to the
1697 | * array:
1698 | *
1699 | * ```php
1700 | * 'li' => array('li'),
1701 | * ```
1702 | *
1703 | * With this, the following two examples are considered equal. Note that the
1704 | * second example is missing the closing tags on `li` elements.
1705 | *
1706 | * ```html
1707 | * <ul><li>Hello</li><li>World!</li></ul>
1708 | * ```
1709 | *
1710 | *
1711 | *
1712 | * ```html
1713 | * <ul><li>Hello<li>World!</ul>
1714 | * ```
1715 | *
1716 | *
1717 | *
1718 | * @var array A two-dimensional array where the key is the name of an
1719 | * element whose end tag is omissible and the value is an array of elements
1720 | * whose end tag is omissible, that are closed by the current element.
1721 | *
1722 | * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1723 | *
1724 | * @todo The implementation of optional closing tags doesn't work in all cases
1725 | * because it only considers elements who close other optional closing
1726 | * tags, not taking into account that some (non-blocking) tags should close
1727 | * these optional closing tags. For example, the end tag for "p" is omissible
1728 | * and can be closed by an "address" element, whose end tag is NOT omissible.
1729 | * Currently a "p" element without closing tag stops at the next "p" element
1730 | * or blocking tag, even if it contains other elements.
1731 | *
1732 | * @todo Known sourceforge issue #2977341
1733 | * B tags that are not closed cause us to return everything to the end of
1734 | * the document.
1735 | */
1736 | protected $optional_closing_tags = array(
1737 | // Not optional, see
1738 | // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1739 | 'b' => array('b' => 1),
1740 | 'dd' => array('dd' => 1, 'dt' => 1),
1741 | // Not optional, see
1742 | // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1743 | 'dl' => array('dd' => 1, 'dt' => 1),
1744 | 'dt' => array('dd' => 1, 'dt' => 1),
1745 | 'li' => array('li' => 1),
1746 | 'optgroup' => array('optgroup' => 1, 'option' => 1),
1747 | 'option' => array('optgroup' => 1, 'option' => 1),
1748 | 'p' => array('p' => 1),
1749 | 'rp' => array('rp' => 1, 'rt' => 1),
1750 | 'rt' => array('rp' => 1, 'rt' => 1),
1751 | 'td' => array('td' => 1, 'th' => 1),
1752 | 'th' => array('td' => 1, 'th' => 1),
1753 | 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1754 | );
1755 |
/**
 * Creates the DOM and optionally loads a document right away.
 *
 * When $str is non-empty it is loaded through load_file() if it starts with
 * "http://" or names an existing file, and through load() otherwise. The
 * arguments $lowercase, $stripRN, $defaultBRText, $defaultSpanText and
 * $options are only forwarded to load().
 *
 * @param string|null $str HTML string, "http://" URL or file name
 * @param bool $lowercase Forwarded to load()
 * @param bool $forceTagsClosed When false, the table of optional closing
 * tags is emptied (the HTML is trusted to close its tags)
 * @param string $target_charset Stored in $_target_charset
 * @param bool $stripRN Forwarded to load()
 * @param string $defaultBRText Forwarded to load()
 * @param string $defaultSpanText Forwarded to load()
 * @param int $options Forwarded to load()
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    //
    // The table is the declared $optional_closing_tags property; writing to
    // any other name only creates an unused dynamic property and leaves the
    // flag without effect. It is emptied before loading so that it applies
    // to $str as well. NOTE(review): assumes load()/load_file() parse
    // immediately and do not restore the table - confirm.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1788 |
/**
 * Destructor: hands over to clear() when the object is destroyed.
 *
 * @return void
 */
function __destruct()
{
    $this->clear();
}
1793 |
1794 | // load html from string
1795 | function load(
1796 | $str,
1797 | $lowercase = true,
1798 | $stripRN = true,
1799 | $defaultBRText = DEFAULT_BR_TEXT,
1800 | $defaultSpanText = DEFAULT_SPAN_TEXT,
1801 | $options = 0)
1802 | {
1803 | global $debug_object;
1804 |
1805 | // prepare
1806 | $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1807 |
1808 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1809 | // Script tags removal now preceeds style tag removal.
1810 | // strip out