├── .gitattributes
├── .gitignore
├── .scrutinizer.yml
├── composer.json
├── phpunit.xml.dist
└── src
└── PHPHtmlParser
├── Content.php
├── Curl.php
├── CurlInterface.php
├── Dom.php
├── Dom
├── AbstractNode.php
├── ArrayNode.php
├── Collection.php
├── HtmlNode.php
├── InnerNode.php
├── LeafNode.php
├── MockNode.php
├── Tag.php
└── TextNode.php
├── Exceptions
├── ChildNotFoundException.php
├── CircularException.php
├── CurlException.php
├── EmptyCollectionException.php
├── NotLoadedException.php
├── ParentNotFoundException.php
├── StrictException.php
└── UnknownChildTypeException.php
├── Options.php
├── Selector.php
└── StaticDom.php
/.gitattributes:
--------------------------------------------------------------------------------
1 | /tests export-ignore
2 | /.scrutinizer.yml export-ignore
3 | /.travis.yml export-ignore
4 | /CHANGELOG.md export-ignore
5 | /CONTRIBUTING.md export-ignore
6 | /LICENSE.md export-ignore
7 | /README.md export-ignore
8 | /phpunit.php export-ignore
9 | /phpunit.xml export-ignore
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | composer.lock
2 | vendor/
3 | phpunit.xml
4 |
--------------------------------------------------------------------------------
/.scrutinizer.yml:
--------------------------------------------------------------------------------
1 | filter:
2 | paths: [src/*]
3 | excluded_paths: [tests/*]
4 | checks:
5 | php:
6 | code_rating: true
7 | remove_extra_empty_lines: true
8 | remove_php_closing_tag: true
9 | remove_trailing_whitespace: true
10 | fix_use_statements:
11 | remove_unused: true
12 | preserve_multiple: false
13 | preserve_blanklines: true
14 | order_alphabetically: true
15 | fix_php_opening_tag: true
16 | fix_linefeed: true
17 | fix_line_ending: true
18 | fix_identation_4spaces: true
19 | fix_doc_comments: true
20 | tools:
21 | external_code_coverage:
22 | timeout: 600
23 | runs: 3
24 | php_code_coverage: false
25 | php_code_sniffer:
26 | config:
27 | standard: PSR2
28 | filter:
29 | paths: ['src']
30 | php_loc:
31 | enabled: true
32 | excluded_dirs: [vendor, test]
33 | php_cpd:
34 | enabled: true
35 | excluded_dirs: [vendor, test]
36 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "thesoftwarefanatics/php-html-parser",
3 | "type": "library",
4 | "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.",
5 | "keywords": ["html", "dom", "parser"],
6 | "homepage": "https://github.com/thesoftwarefanatics/php-html-parser",
7 | "license": "MIT",
8 | "authors": [
9 | {
10 | "name": "Gilles Paquette",
11 | "email": "paquettg@gmail.com",
12 | "homepage": "http://gillespaquette.ca"
13 | },
14 | {
15 | "name": "The Software Fanatics GmbH",
16 | "email": "dev@thesoftwarefanatics.com",
17 | "homepage": "https://thesoftwarefanatics.com"
18 | }
19 | ],
20 | "require": {
21 | "php": "^7.1",
22 | "paquettg/string-encode": "^0.1.1"
23 | },
24 | "require-dev": {
25 | "phpunit/phpunit": "^6.5",
26 | "php-coveralls/php-coveralls": "^2.0",
27 | "mockery/mockery": "^1.0"
28 | },
29 | "replace": {
30 | "paquettg/php-html-parser": "self.version"
31 | },
32 | "autoload": {
33 | "psr-0": {
34 | "PHPHtmlParser": "src/"
35 | }
36 | },
37 | "extra": {
38 | "branch-alias": {
39 | "dev-master": "1.8.x-dev"
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/phpunit.xml.dist:
--------------------------------------------------------------------------------
1 |
2 |
13 |
14 |
15 | ./tests/
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/PHPHtmlParser/Content.php:
--------------------------------------------------------------------------------
1 | ';
40 | protected $slash = " />\r\n\t";
41 | protected $attr = ' >';
42 |
43 | /**
44 | * Content constructor.
45 | *
46 | * @param $content
47 | */
48 | public function __construct($content)
49 | {
50 |     $this->pos = 0; // reading always starts at the first offset
51 |     $this->size = strlen($content);
52 |     $this->content = $content;
53 | }
54 |
55 | /**
56 | * Returns the current position of the content.
57 | *
58 | * @return int
59 | */
60 | public function getPosition()
61 | {
62 | return $this->pos; // current cursor offset into the content string
63 | }
64 |
65 | /**
66 | * Gets the current character we are at.
67 | *
68 | * @param int $char
69 | * @return string
70 | */
71 | public function char($char = null)
72 | {
73 |     // an explicitly requested offset wins; otherwise the cursor
74 |     // position is inspected
75 |     $offset = is_null($char) ? $this->pos : $char;
76 |
77 |     if (isset($this->content[$offset])) {
78 |         return $this->content[$offset];
79 |     }
80 |
81 |     // nothing exists at that offset
82 |     return '';
83 | }
84 |
85 | /**
86 | * Moves the current position forward.
87 | *
88 | * @param int $count
89 | * @return $this
90 | */
91 | public function fastForward($count)
92 | {
93 | $this->pos += $count; // no upper bound is enforced here
94 |
95 | return $this; // same instance, so calls can be chained
96 | }
97 |
98 | /**
99 | * Moves the current position backward.
100 | *
101 | * @param int $count
102 | * @return $this
103 | */
104 | public function rewind($count)
105 | {
106 |     $target = $this->pos - $count;
107 |
108 |     // never move in front of the start of the content
109 |     $this->pos = ($target < 0) ? 0 : $target;
110 |
111 |     return $this;
112 | }
113 |
114 | /**
115 | * Copy the content until we find the given string.
116 | *
117 | * @param string $string
118 | * @param bool $char
119 | * @param bool $escape
120 | * @return string
121 | */
122 | public function copyUntil($string, $char = false, $escape = false)
123 | {
124 |     if ($this->pos >= $this->size) {
125 |         // nothing left
126 |         return '';
127 |     }
128 |
129 |     if ($escape) {
130 |         $position = $this->pos;
131 |         while (true) {
132 |             $position = strpos($this->content, $string, $position);
133 |             if ($position === false) {
134 |                 // reached the end without an unescaped occurrence
135 |                 break;
136 |             }
137 |
138 |             // offset 0 has no predecessor; without this guard char(-1) would
139 |             // read the LAST byte of the content (negative string offset)
140 |             if ($position > 0 && $this->char($position - 1) == '\\') {
141 |                 // this occurrence is escaped, resume the search right after it
142 |                 ++$position;
143 |                 continue;
144 |             }
145 |
146 |             break;
147 |         }
148 |     } elseif ($char) {
149 |         $position = strcspn($this->content, $string, $this->pos); // $string is a set of stop characters
150 |         $position += $this->pos;
151 |     } else {
152 |         $position = strpos($this->content, $string, $this->pos);
153 |     }
154 |
155 |     if ($position === false) {
156 |         // could not find the string, just return the remainder of the content
157 |         $return = substr($this->content, $this->pos, $this->size - $this->pos);
158 |         $this->pos = $this->size;
159 |
160 |         return $return;
161 |     }
162 |
163 |     if ($position == $this->pos) {
164 |         // we are at the right place
165 |         return '';
166 |     }
167 |
168 |     $return = substr($this->content, $this->pos, $position - $this->pos);
169 |     // set the new position
170 |     $this->pos = $position;
171 |
172 |     return $return;
173 | }
174 |
175 | /**
176 | * Copies the content until the string is found and return it
177 | * unless the 'unless' is found in the substring.
178 | *
179 | * @param string $string
180 | * @param string $unless
181 | * @return string
182 | */
183 | public function copyUntilUnless($string, $unless)
184 | {
185 |     // remember where we started so the cursor can be restored
186 |     $start = $this->pos;
187 |     $this->fastForward(1);
188 |     $copied = $this->copyUntil($string, true, true);
189 |
190 |     // any character from $unless inside the copy invalidates it
191 |     if (strcspn($copied, $unless) != strlen($copied)) {
192 |         $this->pos = $start;
193 |         return '';
194 |     }
195 |
196 |     return $string.$copied;
197 | }
198 |
199 | /**
200 | * Copies the content until it reaches the token string.
201 | *
202 | * @param string $token
203 | * @param bool $char
204 | * @param bool $escape
205 | * @return string
206 | * @uses $this->copyUntil()
207 | */
208 | public function copyByToken($token, $char = false, $escape = false)
209 | {
210 | $string = $this->$token; // $token names a property of this object that holds the stop string
211 |
212 | return $this->copyUntil($string, $char, $escape);
213 | }
214 |
215 | /**
216 | * Skip a given set of characters.
217 | *
218 | * @param string $string
219 | * @param bool $copy
220 | * @return $this|string
221 | */
222 | public function skip($string, $copy = false)
223 | {
224 |     $start = $this->pos;
225 |     // length of the run made up solely of characters from $string
226 |     $length = strspn($this->content, $string, $start);
227 |
228 |     $this->pos = $start + $length;
229 |
230 |     if ($copy) {
231 |         return substr($this->content, $start, $length);
232 |     }
233 |
234 |     // stay chainable when no copy was requested
235 |     return $this;
236 | }
237 |
238 | /**
239 | * Skip a given token of pre-defined characters.
240 | *
241 | * @param string $token
242 | * @param bool $copy
243 | * @return $this|string
244 | * @uses $this->skip()
245 | */
246 | public function skipByToken($token, $copy = false)
247 | {
248 | $string = $this->$token; // $token names a property of this object that holds the characters to skip
249 |
250 | return $this->skip($string, $copy); // skip() yields $this, or the skipped text when $copy is set
251 | }
252 | }
253 |
--------------------------------------------------------------------------------
/src/PHPHtmlParser/Curl.php:
--------------------------------------------------------------------------------
1 | root->innerHtml();
101 | }
102 |
103 | /**
104 | * A simple wrapper around the root node.
105 | *
106 | * @param string $name
107 | * @return mixed
108 | */
109 | public function __get($name)
110 | {
111 | return $this->root->$name; // forwards the property read to the root node
112 | }
113 |
114 | /**
115 | * Attempts to load the dom from any resource, string, file, or URL.
116 | *
117 | * @param string $str
118 | * @param array $options
119 | * @return $this
120 | */
121 | public function load($str, $options = [])
122 | {
123 |     // a string without line breaks that names an existing file is read from disk
124 |     $isFile = strpos($str, "\n") === false && is_file($str);
125 |     if ($isFile) {
126 |         return $this->loadFromFile($str, $options);
127 |     }
128 |
129 |     // http(s) addresses are fetched, everything else is treated as markup
130 |     return preg_match("/^https?:\/\//i", $str)
131 |         ? $this->loadFromUrl($str, $options)
132 |         : $this->loadStr($str, $options);
133 | }
134 |
135 | /**
136 | * Loads the dom from a document file/url
137 | *
138 | * @param string $file
139 | * @param array $options
140 | * @return $this
141 | */
142 | public function loadFromFile($file, $options = [])
143 | {
144 | return $this->loadStr(file_get_contents($file), $options); // NOTE(review): a failed read (false) is passed on unchecked
145 | }
146 |
147 | /**
148 | * Use a curl interface implementation to attempt to load
149 | * the content from a url.
150 | *
151 | * @param string $url
152 | * @param array $options
153 | * @param CurlInterface $curl
154 | * @return $this
155 | */
156 | public function loadFromUrl($url, $options = [], ?CurlInterface $curl = null)
157 | {
158 |     // nullable is declared explicitly; relying on the "= null" default alone is deprecated as of PHP 8.4
159 |     if ($curl === null) {
160 |         $curl = new Curl; // use the default curl interface
161 |     }
162 |     $content = $curl->get($url);
163 |
164 |     return $this->loadStr($content, $options);
165 | }
166 |
167 | /**
168 | * Parses the html of the given string. Used by load(), loadFromFile(),
169 | * and loadFromUrl().
170 | *
171 | * @param string $str
172 | * @param array $option
173 | * @return $this
174 | */
175 | public function loadStr($str, $option)
176 | {
177 |     $this->options = new Options;
178 |     $this->options->setOptions($this->globalOptions) // global options are applied first,
179 |                   ->setOptions($option);             // then the per-call ones
180 |     // keep the untouched input and its length
181 |     $this->rawSize = strlen($str);
182 |     $this->raw = $str;
183 |
184 |     $html = $this->clean($str);
185 |     // measure the cleaned markup; the length of the raw input is already in rawSize
186 |     $this->size = strlen($html);
187 |     $this->content = new Content($html);
188 |
189 |     $this->parse();
190 |     $this->detectCharset();
191 |
192 |     return $this;
193 | }
194 |
195 | /**
196 | * Sets a global options array to be used by all load calls.
197 | *
198 | * @param array $options
199 | * @return $this
200 | */
201 | public function setOptions(array $options)
202 | {
203 | $this->globalOptions = $options; // replaces the stored array; loadStr() feeds it to Options on every load
204 |
205 | return $this;
206 | }
207 |
208 | /**
209 | * Find elements by css selector on the root node.
210 | *
211 | * @param string $selector
212 | * @param int $nth
213 | * @return array
214 | */
215 | public function find($selector, $nth = null)
216 | {
217 | $this->isLoaded(); // throws NotLoadedException when no content is loaded
218 |
219 | return $this->root->find($selector, $nth); // the lookup itself is done by the root node
220 | }
221 |
222 | /**
223 | * Adds the tag (or tags in an array) to the list of tags that will always
224 | * be self closing.
225 | *
226 | * @param string|array $tag
227 | * @return $this
228 | */
229 | public function addSelfClosingTag($tag)
230 | {
231 |     // normalise a single tag name into a one element list
232 |     $tags = is_array($tag) ? $tag : [$tag];
233 |
234 |     foreach ($tags as $name) {
235 |         $this->selfClosing[] = $name;
236 |     }
237 |
238 |     return $this;
239 | }
240 |
241 | /**
242 | * Removes the tag (or tags in an array) from the list of tags that will
243 | * always be self closing.
244 | *
245 | * @param string|array $tag
246 | * @return $this
247 | */
248 | public function removeSelfClosingTag($tag)
249 | {
250 |     // a single tag name is treated as a one element list
251 |     $unwanted = is_array($tag) ? $tag : [$tag];
252 |
253 |     $this->selfClosing = array_diff($this->selfClosing, $unwanted);
254 |
255 |     return $this;
256 | }
257 |
258 | /**
259 | * Sets the list of self closing tags to empty.
260 | *
261 | * @return $this
262 | */
263 | public function clearSelfClosingTags()
264 | {
265 | $this->selfClosing = []; // drops every entry, including any that were present before
266 |
267 | return $this;
268 | }
269 |
270 | /**
271 | * Simple wrapper function that returns the first child.
272 | *
273 | * @return \PHPHtmlParser\Dom\AbstractNode
274 | */
275 | public function firstChild()
276 | {
277 | $this->isLoaded(); // throws NotLoadedException when no content is loaded
278 |
279 | return $this->root->firstChild(); // delegated to the root node
280 | }
281 |
282 | /**
283 | * Simple wrapper function that returns the last child.
284 | *
285 | * @return \PHPHtmlParser\Dom\AbstractNode
286 | */
287 | public function lastChild()
288 | {
289 | $this->isLoaded(); // throws NotLoadedException when no content is loaded
290 |
291 | return $this->root->lastChild(); // delegated to the root node
292 | }
293 |
294 | /**
295 | * Simple wrapper function that returns an element by the
296 | * id.
297 | *
298 | * @param string $id
299 | * @return \PHPHtmlParser\Dom\AbstractNode
300 | */
301 | public function getElementById($id)
302 | {
303 | $this->isLoaded(); // find() below performs the same check again
304 |
305 | return $this->find('#'.$id, 0); // id selector, asking only for the match at index 0
306 | }
307 |
308 | /**
309 | * Simple wrapper function that returns all elements by
310 | * tag name.
311 | *
312 | * @param string $name
313 | * @return array
314 | */
315 | public function getElementsByTag($name)
316 | {
317 | $this->isLoaded(); // find() below performs the same check again
318 |
319 | return $this->find($name); // the bare tag name is used as the selector
320 | }
321 |
322 | /**
323 | * Simple wrapper function that returns all elements by
324 | * class name.
325 | *
326 | * @param string $class
327 | * @return array
328 | */
329 | public function getElementsByClass($class)
330 | {
331 | $this->isLoaded(); // find() below performs the same check again
332 |
333 | return $this->find('.'.$class); // class selector built by prefixing a dot
334 | }
335 |
336 | /**
337 | * Checks if the load methods have been called.
338 | *
339 | * @throws NotLoadedException
340 | */
341 | protected function isLoaded()
342 | {
343 | if (is_null($this->content)) { // content is assigned in loadStr(); null means nothing was loaded
344 | throw new NotLoadedException('Content is not loaded!');
345 | }
346 | }
347 |
348 | /**
349 | * Cleans the html of any non-html information.
350 | *
351 | * @param string $str
352 | * @return string
353 | */
354 | protected function clean($str)
355 | {
356 | if ($this->options->get('cleanupInput') != true) {
357 | // skip entire cleanup step
358 | return $str;
359 | }
360 |
361 | // remove white space before closing tags
362 | $str = preg_replace("#'\s+>#i", "'>", $str);
363 | $str = preg_replace('#"\s+>#i', '">', $str);
364 |
365 | // clean out the \n\r
366 | $replace = ' ';
367 | if ($this->options->get('preserveLineBreaks')) {
368 | $replace = '
';
369 | }
370 | $str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
371 |
372 | // strip the doctype
373 | $str = preg_replace("##i", '', $str);
374 |
375 | // strip out comments
376 | $str = preg_replace("##i", '', $str);
377 |
378 | // strip out cdata
379 | $str = preg_replace("##i", '', $str);
380 |
381 | // strip out