├── LICENSE.md
├── README.md
├── simple_html_dom.php
└── sitemap-generator.php
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Hemn Chawroka
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PHP XML Sitemap Generator
2 |
3 | This is a simple and small PHP script that I wrote quickly for myself to create an XML sitemap of my page for Google and other search engines. Maybe others can use the script too.
4 |
5 | Sitemap format: [http://www.sitemaps.org/protocol.html](http://www.sitemaps.org/protocol.html)
6 |
7 | ## Features
8 | - Actually crawls webpages like Google would
9 | - Generates a separate XML file which gets updated every time the script gets executed (Runnable via CRON)
10 | - Awesome for SEO
11 | - Crawls faster than online services
12 | - Adaptable
13 |
14 | ## Usage
15 | Usage is pretty straightforward:
16 | - Configure the crawler by modifying the `sitemap-generator.php` file
17 | - Select URL to crawl
18 | - Select the file to which the sitemap will be saved
19 | - Select accepted extensions ("/" is mandatory for proper functionality)
20 | - Select change frequency (always, daily, weekly, monthly, never, etc...)
21 | - Choose priority (It is all relative so it may as well be 1)
22 | - Generate sitemap
23 | - Either send a GET request to this script or simply point your browser
24 | - A sitemap will be generated and displayed
25 | - Submit sitemap.xml to Google
26 | - Setup a CRON Job to send web requests to this script every so often, this will keep the sitemap.xml file up to date
27 |
28 | The script can be started as a CLI script or as a website. CLI is the preferred way to start this script.
29 |
30 | CLI scripts are started from the command line, can be used with CRON and so on. You start it with the php program.
31 |
32 | CLI command to create the XML file: `php sitemap-generator.php`
33 |
34 | To run the script through your web server as a website, change line 22 of the script from
35 | ```php
36 | define ('CLI', true);
37 | ```
38 | to
39 | ```php
40 | define ('CLI', false);
41 | ```
42 |
43 |
44 | ## sitemap.xml
45 | Add the XML file to your `/robots.txt`.
46 |
47 | Example line for the robots.txt:
48 |
49 | ```
50 | Sitemap: http://www.iprodev.com/sitemap.xml
51 | ```
52 |
53 |
54 | ## Credits
55 |
56 | PHP XML Sitemap Generator was created by [Hemn Chawroka](http://iprodev.com) from [iProDev](http://iprodev.com). Released under the MIT license.
57 |
58 | Included scripts:
59 |
60 | - [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/) - An HTML DOM parser written in PHP5+ that lets you manipulate HTML in a very easy way.
61 |
--------------------------------------------------------------------------------
/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real" number of bytes the dom was created from.
17 | * but for most purposes, it's a really good estimation.
18 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
19 | * Allow the user to tell us how much they trust the html.
20 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
21 | * This allows for us to find tags based on the text they contain.
22 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
23 | * Paperg: added parse_charset so that we know about the character set of the source document.
24 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
25 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
26 | *
27 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
28 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
29 | *
30 | * Licensed under The MIT License
31 | * Redistributions of files must retain the above copyright notice.
32 | *
33 | * @author S.C. Chen
34 | * @author John Schlick
35 | * @author Rus Carroll
36 | * @version 1.5 ($Rev: 196 $)
37 | * @package PlaceLocalInclude
38 | * @subpackage simple_html_dom
39 | */
40 |
41 | /**
42 | * All of the Defines for the classes below.
43 | * @author S.C. Chen
44 | */
// Node type codes (simple_html_dom_node::$nodetype defaults to HDOM_TYPE_TEXT).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles, stored per attribute under $_[HDOM_INFO_QUOTE] and read back by makeup().
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys of the per-node "info" array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
// Defaults used by file_get_html() / str_get_html() parameters.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Inputs whose strlen() exceeds this many bytes are rejected by file_get_html() / str_get_html().
define('MAX_FILE_SIZE', 600000);
66 | // helper functions
67 | // -----------------------------------------------------------------------------
68 | // get html dom from file
69 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Fetch a URL or file and parse it into a simple_html_dom tree.
 *
 * @param string   $url              URL or path handed to file_get_contents().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Optional stream context (or null).
 * @param int      $offset           Byte offset to start reading at. Any negative value
 *                                   (including the legacy default of -1) means "from the start".
 * @param int      $maxLen           Maximum number of bytes to read; negative means "everything".
 * @param bool     $lowercase        Passed to the dom constructor and to load().
 * @param bool     $forceTagsClosed  Passed to the dom constructor.
 * @param string   $target_charset   Passed to the dom constructor.
 * @param bool     $stripRN          Passed to the dom constructor and to load().
 * @param string   $defaultBRText    Passed to the dom constructor.
 * @param string   $defaultSpanText  Passed to the dom constructor.
 * @return simple_html_dom|false The loaded dom, or false when nothing could be read or the
 *                               content is larger than MAX_FILE_SIZE bytes.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Since PHP 7.1 a negative offset makes file_get_contents() seek from the END of the
    // stream, which fails on non-seekable (e.g. HTTP) streams. The historical default of -1
    // was always meant as "read from the beginning", so normalise it.
    if ($offset < 0)
    {
        $offset = 0;
    }

    // Only pass a length when the caller asked for one: a negative length is rejected by
    // PHP 8, and an explicit null is read as 0 by older versions.
    if ($maxLen >= 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        return false;
    }

    // We DO force the tags to be terminated (unless the caller overrides $forceTagsClosed).
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
86 |
87 | // get html dom from string
/**
 * Parse an HTML string into a simple_html_dom tree.
 *
 * All optional arguments are forwarded to the simple_html_dom constructor;
 * $lowercase and $stripRN are additionally forwarded to load().
 *
 * @param string $str HTML source.
 * @return simple_html_dom|false The loaded dom, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $parsable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if (!$parsable)
    {
        // Release the unused dom before reporting failure.
        $dom->clear();
        return false;
    }

    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}
99 |
100 | // dump html dom tree
/**
 * Print the tree below $node by delegating to the node's dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Starting indentation depth.
 * @return void
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; previously the node itself was passed as $show_attr
    // and $deep was dropped.
    $node->dump($show_attr, $deep);
}
105 |
106 |
107 | /**
108 | * simple html dom node
109 | * PaperG - added ability for "find" routine to lowercase the value of the selector.
110 | * PaperG - added $tag_start to track the start position of the tag in the total byte index
111 | *
112 | * @package PlaceLocalInclude
113 | */
114 | class simple_html_dom_node
115 | {
116 | public $nodetype = HDOM_TYPE_TEXT;
117 | public $tag = 'text';
118 | public $attr = array();
119 | public $children = array();
120 | public $nodes = array();
121 | public $parent = null;
122 | // The "info" array - see HDOM_INFO_... for what each element contains.
123 | public $_ = array();
124 | public $tag_start = 0;
125 | private $dom = null;
126 |
/**
 * Create a node and register it with its owning dom.
 *
 * @param object $dom The owning dom; the new node is appended to its $nodes list.
 */
function __construct($dom)
{
    // Append to the document-wide node list, then remember the owner.
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
132 |
/**
 * Drop all references held by this node when it is destroyed.
 */
function __destruct()
{
    // Same cleanup as an explicit clear() call.
    $this->clear();
}
137 |
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    $html = $this->outertext();
    return $html;
}
142 |
143 | // clean up memory due to php5 circular references memory leak...
/**
 * Break the references to the owning dom, the parent and the child lists
 * (works around the php5 circular-reference memory leak).
 *
 * @return void
 */
function clear()
{
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
151 |
152 | // dump node's tree
/**
 * Echo this node's tag, optionally its attributes, then every entry of
 * $this->nodes one level deeper.
 *
 * @param bool $show_attr Print "[name]=>"value", " pairs after the tag.
 * @param int  $deep      Nesting depth; one space of indent is printed per level.
 * @return void
 */
function dump($show_attr=true, $deep=0)
{
    echo str_repeat(' ', $deep) . $this->tag;

    $printAttrs = $show_attr && count($this->attr) > 0;
    if ($printAttrs)
    {
        echo '(';
        foreach (array_keys($this->attr) as $name)
        {
            // Goes through property access so the value passes through __get().
            echo "[$name]=>\"" . $this->$name . '", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes)
    {
        return;
    }
    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $deep + 1);
    }
}
175 |
176 |
177 | // Debugging function to dump a single dom node with a bunch of information about it.
/**
 * Debugging helper: describe this single node (tag, attributes, the $_ info
 * array, text, inner info, child/node counts and tag_start) on one line.
 *
 * @param bool $echo When true the description is echoed and nothing is returned;
 *                   when false the description is returned as a string.
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                // Nested info entries (e.g. per-attribute arrays) are expanded one level.
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read the info from $this; the code used to look at an undefined $node variable,
    // so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
241 |
242 | // returns the parent of node
243 | // If a node is passed in, it will reset the parent of the current node to that one.
/**
 * Get the parent node, or re-parent this node when $parent is given.
 *
 * NOTE: re-parenting only appends this node to the new parent's $nodes and
 * $children lists; it does NOT remove the node from its previous parent.
 *
 * @param object|null $parent New parent node, or null to just read the current one.
 * @return object|null The (possibly new) parent.
 */
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;
    return $parent;
}
257 |
258 | // verify that node has children
/**
 * Tell whether this node has at least one entry in $children.
 *
 * @return bool
 */
function has_child()
{
    $kids = $this->children;
    return !empty($kids);
}
263 |
264 | // returns children of node
/**
 * Return all children, or a single child by index.
 *
 * @param int $idx Index into $children; -1 (the default) returns the whole list.
 * @return mixed The children list, the child at $idx, or null when $idx is not set.
 */
function children($idx=-1)
{
    if ($idx !== -1)
    {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
274 |
275 | // returns the first child of node
/**
 * Return the first entry of $children, or null when there are none.
 *
 * @return object|null
 */
function first_child()
{
    $total = count($this->children);
    return ($total > 0) ? $this->children[0] : null;
}
284 |
285 | // returns the last child of node
/**
 * Return the last entry of $children, or null when there are none.
 *
 * @return object|null
 */
function last_child()
{
    $total = count($this->children);
    if ($total > 0)
    {
        return $this->children[$total - 1];
    }
    return null;
}
294 |
295 | // returns the next sibling of node
/**
 * Return the entry that follows this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the last
 *                     child, or when it is not found among the parent's children.
 */
function next_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate our own position; $pos ends at $total when we are not listed.
    for ($pos = 0; $pos < $total; ++$pos)
    {
        if ($siblings[$pos] === $this)
        {
            break;
        }
    }

    $next = $pos + 1;
    return ($next < $total) ? $siblings[$next] : null;
}
315 |
316 | // returns the previous sibling of node
/**
 * Return the entry that precedes this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the first
 *                     child, or when it is not listed among the parent's children.
 */
function prev_sibling()
{
    if ($this->parent === null) return null;

    $siblings = $this->parent->children;
    $total = count($siblings);

    $pos = 0;
    while ($pos < $total && $this !== $siblings[$pos])
    {
        ++$pos;
    }

    // Not found (e.g. a node that only lives in the parent's $nodes list): there is no
    // previous sibling. Previously this fell through and returned the parent's LAST child.
    if ($pos >= $total) return null;

    if ($pos === 0) return null;
    return $siblings[$pos - 1];
}
327 |
328 | // function to locate a specific ancestor tag in the path to the root.
/**
 * Walk from this node up through its parents and return the first node whose
 * tag equals $tag. The node itself is included in the comparison.
 *
 * @param string $tag Tag name to look for (compared with ==).
 * @return object|null The matching node, or null when the root is passed without a match.
 */
function find_ancestor_tag($tag)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }
    return null;
}
349 |
350 | // get dom node's inner html
/**
 * Return the node's inner HTML.
 *
 * Precedence: an explicit HDOM_INFO_INNER override, then the noise-restored
 * HDOM_INFO_TEXT, otherwise the concatenated outer text of every entry in $nodes.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child)
    {
        $html .= $child->outertext();
    }
    return $html;
}
361 |
362 | // get dom node's outer text (with tag)
/**
 * Return the node's outer HTML: begin tag, inner content and end tag.
 *
 * Precedence: the root node returns its inner text; an explicit HDOM_INFO_OUTER
 * override wins next; then the noise-restored HDOM_INFO_TEXT; otherwise the markup
 * is rebuilt from the begin tag, the inner content and the closing tag.
 * If the dom has a callback set, it is invoked with this node before rendering.
 *
 * @return string
 */
function outertext()
{
    global $debugObject;
    if (is_object($debugObject))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag: must be a real closing tag, "</tag>". The opening "</" had been
    // lost, which produced "tag>" and broke every element that has an end tag.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
421 |
422 | // get dom node's plain text
/**
 * Return the node's plain text.
 *
 * An explicit HDOM_INFO_INNER override is returned as is. Text nodes return their
 * noise-restored text; comment and unknown nodes, as well as script and style
 * elements, return an empty string. Any other node returns the converted text of
 * every entry in $nodes; span elements additionally get the dom's default_span_text
 * appended so adjacent spans do not run together.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // $this->nodes has been observed to be NULL for some element nodes
    // (cause unknown), so it must be checked before iterating.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child)
    {
        $plain .= $this->convert_text($child->text());
    }

    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }
    return $plain;
}
456 |
/**
 * Return the inner text with the CDATA wrapper markers removed.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    // Strip the "<![CDATA[" opener (case-insensitively) and the "]]>" closer. The search
    // strings had been lost, which turned the replacement into a no-op on an empty needle.
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
464 |
465 | // build node's text with tag
/**
 * Build the node's opening tag, including its attributes.
 *
 * Text, comment and unknown nodes (anything carrying HDOM_INFO_TEXT) return their
 * noise-restored text instead. Attributes whose value is null or false are skipped;
 * a value of true renders the bare attribute name; everything else is rendered
 * with the spacing and quoting recorded in $_[HDOM_INFO_SPACE] / $_[HDOM_INFO_QUOTE].
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<'.$this->tag;
    $pos = 0;

    foreach ($this->attr as $name=>$value)
    {
        // The slot advances for every attribute, including skipped ones, so it
        // stays aligned with the recorded spacing/quoting entries.
        $slot = $pos++;

        // skip removed attribute
        if ($value===null || $value===false)
        {
            continue;
        }

        $html .= $this->_[HDOM_INFO_SPACE][$slot][0];

        //no value attr: nowrap, checked selected...
        if ($value===true)
        {
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($style == HDOM_QUOTE_DOUBLE)
        {
            $quote = '"';
        }
        elseif ($style == HDOM_QUOTE_SINGLE)
        {
            $quote = '\'';
        }
        else
        {
            $quote = '';
        }
        $html .= $name.$this->_[HDOM_INFO_SPACE][$slot][1].'='.$this->_[HDOM_INFO_SPACE][$slot][2].$quote.$value.$quote;
    }

    $html = $this->dom->restore_noise($html);
    return $html . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
499 |
500 | // find elements by css selector
501 | //PaperG - added ability for find to lowercase the value of the selector.
/**
 * Find descendant nodes by CSS-like selector.
 *
 * The selector string is parsed into comma-separated groups; each group is a chain
 * of descendant steps that is resolved iteratively (no recursion) via seek().
 * Matches of all groups are merged and returned ordered by their index in the
 * dom's node list.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null returns every match; otherwise the match at that
 *                            position (negative values count from the end).
 * @param bool     $lowercase Passed on to seek() for case-insensitive value tests.
 * @return mixed Array of nodes, a single node, or null when $idx is out of range.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups)===0) return array();

    $matched = array();

    foreach ($groups as $chain)
    {
        // See sourceforge code tracker id 2788009: each group is checked on its own.
        if (count($chain)===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        // Start from this node and narrow down one descendant step at a time.
        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);
        foreach ($chain as $step)
        {
            $reached = array();
            foreach ($frontier as $nodeIndex=>$unused)
            {
                $start = ($nodeIndex===-1) ? $this->dom->root : $this->dom->nodes[$nodeIndex];
                $start->seek($step, $reached, $lowercase);
            }
            $frontier = $reached;
        }

        foreach ($frontier as $nodeIndex=>$unused)
        {
            $matched[$nodeIndex] = 1;
        }
    }

    // Document order.
    ksort($matched);

    $found = array();
    foreach ($matched as $nodeIndex=>$unused)
    {
        $found[] = $this->dom->nodes[$nodeIndex];
    }

    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
550 |
551 | // seek for given conditions
552 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
/**
 * Collect the nodes below this node that satisfy one parsed selector step.
 *
 * @param array $selector  One step as produced by parse_selector():
 *                         array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Output, passed by reference: matching nodes are recorded as
 *                         $ret[<index into dom->nodes>] = 1.
 * @param bool  $lowercase When true, attribute values are compared case-insensitively.
 * @return void
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: a numeric key selects the n-th direct child with that tag.
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Determine the exclusive upper bound of the scan over dom->nodes.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        // No end recorded for this node: borrow the nearest ancestor's end,
        // reduced by one for every ancestor that has none either.
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk can run off the top of the tree; only dereference a real ancestor.
        // $end then stays <= 0 and the scan below is simply empty, as before.
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare '*' only matches direct children.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
}
649 |
/**
 * Test an attribute value against a selector pattern.
 *
 * @param string $exp     Operator: '=', '!=', '^=' (prefix), '$=' (suffix) or '*='.
 * @param string $pattern Selector value. For '*=' it is used as a regular expression:
 *                        verbatim when it starts with '/', otherwise wrapped as /pattern/i.
 * @param string $value   The node's value.
 * @return bool|int Comparison result, the preg_match() result, or false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
671 |
/**
 * Parse a CSS-like selector string into groups of selector steps.
 *
 * @param string $selector_string e.g. "div.item a[href^=http], #main p".
 * @return array List of comma-separated groups; each group is a list of steps of the
 *               form array($tag, $key, $val, $exp, $no_key) as consumed by seek().
 */
protected function parse_selector($selector_string) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
    // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // Notice the \[ starting the attbute? and the @? following? This implies that an html attribute specifier
    // may start with an @ sign that is NOT captured by the expression.
    // farther study is required to determine of this should be documented or removed.
    //
    // The hyphens inside the character classes are escaped: PCRE2 (PHP >= 7.3) rejects an
    // unescaped "\w-:" as an invalid range, which made every find() call fail.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase; the cast keeps the old result ('' for a missing key)
        // without passing null to strtolower(), which is deprecated since PHP 8.1.
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current group.
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
719 |
/**
 * Magic read access: attributes first, then the virtual text properties.
 *
 * @param string $name Attribute name, or one of outertext / innertext / plaintext / xmltext.
 * @return mixed The charset-converted attribute value, the requested text, or for any
 *               other name a bool telling whether the attribute key exists.
 */
function __get($name) {
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') return $this->outertext();
    if ($name == 'innertext') return $this->innertext();
    if ($name == 'plaintext') return $this->text();
    if ($name == 'xmltext') return $this->xmltext();

    return array_key_exists($name, $this->attr);
}
733 |
/**
 * Magic write access.
 *
 * 'outertext' stores an outer override; 'innertext' overwrites the text of nodes that
 * carry HDOM_INFO_TEXT and otherwise stores an inner override. Any other name sets an
 * attribute, registering default spacing and double quotes for attributes not yet set.
 *
 * @param string $name
 * @param mixed  $value
 */
function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNew = !isset($this->attr[$name]);
    if ($isNew) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
747 |
/**
 * Magic isset(): the virtual text properties always exist; anything else
 * exists when it is a key of $attr (this includes value-less attributes).
 *
 * @param string $name
 * @return bool
 */
function __isset($name) {
    if (in_array($name, array('outertext', 'innertext', 'plaintext'))) {
        return true;
    }
    //no value attr: nowrap, checked selected...
    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
757 |
/**
 * Magic unset(): remove the attribute when it is currently set.
 *
 * @param string $name
 */
function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }
    unset($this->attr[$name]);
}
762 |
763 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
/**
 * Convert text from the dom's source charset to its target charset when they differ.
 *
 * When the target is UTF-8 and the text already validates as UTF-8, it is left
 * untouched (the reported source charset is assumed to be wrong). For a UTF-8
 * target a byte-order mark at either end of the result is removed.
 *
 * @param string $text
 * @return string|false The converted text (whatever iconv() returned when a conversion ran).
 */
function convert_text($text)
{
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    $from = "";
    $to = "";
    if ($this->dom)
    {
        $from = strtoupper($this->dom->_charset);
        $to = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $from . " target charaset: " . $to);}

    $result = $text;

    $charsetsDiffer = !empty($from) && !empty($to) && (strcasecmp($from, $to) != 0);
    if ($charsetsDiffer)
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        $alreadyUtf8 = (strcasecmp($to, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8)
        {
            $result = iconv($from, $to, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($to == 'UTF-8')
    {
        $bom = "\xef\xbb\xbf";
        if (substr($result, 0, 3) == $bom)
        {
            $result = substr($result, 3);
        }
        if (substr($result, -3) == $bom)
        {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
809 |
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte followed by the right number of
 * continuation bytes (0x80-0xBF). The check is purely structural: like the
 * original it still accepts the legacy 5- and 6-byte forms and does not
 * reject overlong encodings.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++)
    {
        $c = ord($str[$i]);
        // 0x80 itself is NOT ASCII: it is a continuation byte and is invalid as the
        // first byte of a sequence. (The test used to be "> 128", letting a lone
        // 0x80 byte pass as valid.)
        if ($c >= 128)
        {
            if ($c >= 254) return false;
            elseif ($c >= 252) $bits = 6;
            elseif ($c >= 248) $bits = 5;
            elseif ($c >= 240) $bits = 4;
            elseif ($c >= 224) $bits = 3;
            elseif ($c >= 192) $bits = 2;
            else return false; // stray continuation byte

            // The whole sequence must fit inside the string.
            if (($i + $bits) > $len) return false;

            while ($bits > 1)
            {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
845 | /*
846 | function is_utf8($string)
847 | {
848 | //this is buggy
849 | return (utf8_encode(utf8_decode($string)) == $string);
850 | }
851 | */
852 |
/**
 * Try to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes of the tag win; otherwise an inline style
 * declaration in whole pixels (e.g. "width: 120px") is used.
 *
 * Future enhancements: look at classes/ids on the tag and its parents, and at
 * external style sheets.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false array('height' => ..., 'width' => ...) where a value is -1 if it
 *                     could not be determined; false when the node is not an img tag.
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    $size = array('height' => -1, 'width' => -1);
    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declared = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // The value group is greedy and swallows whitespace before the ';',
            // which used to defeat the "px" suffix test below.
            $declared[$match[1]] = trim($match[2]);
        }

        foreach ($dimensions as $dimension)
        {
            // Only fill in what the tag attributes did not already provide.
            if (!isset($declared[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // check that the last two characters are px (pixels)
            if (strtolower(substr($declared[$dimension], -2)) != 'px')
            {
                continue;
            }
            $proposed = substr($declared[$dimension], 0, -2);
            // Make sure it's an integer. Compare against false explicitly:
            // filter_var() returns int 0 for "0", which is a valid size.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    return $size;
}
940 |
941 | // camel naming conventions
/**
 * camelCase accessor: returns this node's $attr property.
 */
function getAllAttributes()
{
    return $this->attr;
}
/**
 * camelCase alias: forwards to __get($name) and returns its result.
 */
function getAttribute($name)
{
    $value = $this->__get($name);
    return $value;
}
/**
 * camelCase alias: forwards to __set($name, $value). Returns nothing.
 */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}
/**
 * camelCase alias: forwards to __isset($name) and returns its result.
 */
function hasAttribute($name)
{
    $present = $this->__isset($name);
    return $present;
}
/**
 * camelCase alias: calls __set($name, null). Returns nothing.
 * NOTE(review): presumably a null value marks the attribute as removed - confirm in __set().
 */
function removeAttribute($name)
{
    $this->__set($name, null);
}
/**
 * camelCase alias: calls find() with the selector "#<id>" and index 0.
 */
function getElementById($id)
{
    $selector = "#$id";
    return $this->find($selector, 0);
}
/**
 * camelCase alias: calls find() with the selector "#<id>" and the given
 * index (null by default).
 */
function getElementsById($id, $idx=null)
{
    $selector = "#$id";
    return $this->find($selector, $idx);
}
/**
 * camelCase alias: calls find() with $name as the selector and index 0.
 */
function getElementByTagName($name)
{
    $match = $this->find($name, 0);
    return $match;
}
/**
 * camelCase alias: calls find() with $name as the selector and the given
 * index (null by default).
 */
function getElementsByTagName($name, $idx=null)
{
    $matches = $this->find($name, $idx);
    return $matches;
}
/**
 * camelCase alias: returns the result of parent() called without arguments.
 */
function parentNode()
{
    return $this->parent();
}
/**
 * camelCase alias: returns the result of children($idx); $idx defaults to -1.
 */
function childNodes($idx=-1)
{
    $result = $this->children($idx);
    return $result;
}
/**
 * camelCase alias: returns the result of first_child().
 */
function firstChild()
{
    return $this->first_child();
}
/**
 * camelCase alias: returns the result of last_child().
 */
function lastChild()
{
    return $this->last_child();
}
/**
 * camelCase alias: returns the result of next_sibling().
 */
function nextSibling()
{
    return $this->next_sibling();
}
/**
 * camelCase alias: returns the result of prev_sibling().
 */
function previousSibling()
{
    return $this->prev_sibling();
}
/**
 * camelCase alias: returns the result of has_child().
 */
function hasChildNodes()
{
    return $this->has_child();
}
/**
 * camelCase accessor: returns this node's $tag property.
 */
function nodeName()
{
    return $this->tag;
}
/**
 * camelCase helper: calls $node->parent($this), passing this node as the
 * argument, then returns the same $node that was passed in.
 */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
960 |
961 | }
962 |
963 | /**
964 | * simple html dom parser
965 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
966 | * Paperg - change $size from protected to public so we can easily access it
967 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
968 | *
969 | * @package PlaceLocalInclude
970 | */
971 | class simple_html_dom
972 | {
973 | public $root = null;
974 | public $nodes = array();
975 | public $callback = null;
976 | public $lowercase = false;
977 | // Used to keep track of how large the text was when we started.
978 | public $original_size;
979 | public $size;
980 | protected $pos;
981 | protected $doc;
982 | protected $char;
983 | protected $cursor;
984 | protected $parent;
985 | protected $noise = array();
986 | protected $token_blank = " \t\r\n";
987 | protected $token_equal = ' =/>';
988 | protected $token_slash = " />\r\n\t";
989 | protected $token_attr = ' >';
990 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
991 | public $_charset = '';
992 | public $_target_charset = '';
993 | protected $default_br_text = "";
994 | public $default_span_text = "";
995 |
996 | // use isset instead of in_array, performance boost about 30%...
997 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
998 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
999 | // Known sourceforge issue #2977341
1000 | // B tags that are not closed cause us to return everything to the end of the document.
1001 | protected $optional_closing_tags = array(
1002 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1003 | 'th'=>array('th'=>1),
1004 | 'td'=>array('td'=>1),
1005 | 'li'=>array('li'=>1),
1006 | 'dt'=>array('dt'=>1, 'dd'=>1),
1007 | 'dd'=>array('dd'=>1, 'dt'=>1),
1008 | 'dl'=>array('dd'=>1, 'dt'=>1),
1009 | 'p'=>array('p'=>1),
1010 | 'nobr'=>array('nobr'=>1),
1011 | 'b'=>array('b'=>1),
1012 | 'option'=>array('option'=>1),
1013 | );
1014 |
/**
 * Creates the parser and, when $str is non-empty, loads an initial document.
 *
 * $str is handed to load_file() when it starts with http:// or https://
 * (case-insensitive) or names an existing file; otherwise it is treated as
 * markup and handed to load().
 *
 * @param string|null $str             markup, file path or URL; nothing is loaded when empty
 * @param bool        $lowercase       passed through to load()
 * @param bool        $forceTagsClosed when false, the optional-closing-tag table is emptied
 * @param string      $target_charset  stored in $this->_target_charset
 * @param bool        $stripRN         passed through to load()
 * @param string      $defaultBRText   passed through to load()
 * @param string      $defaultSpanText passed through to load()
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // The table must be emptied BEFORE the document is loaded, and it must be
    // the declared $optional_closing_tags property: the old code assigned to
    // an undeclared "optional_closing_array" after loading, so the flag did nothing.
    if (!$forceTagsClosed)
    {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        // Treat both http:// and https:// strings as URLs to fetch.
        $looksLikeUrl = preg_match("/^https?:\/\//i", $str);

        if ($looksLikeUrl || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1034 |
/**
 * Destructor: calls clear() on this object when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1039 |
1040 | // load html from string
1041 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1042 | {
1043 | global $debugObject;
1044 |
1045 | // prepare
1046 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1047 | // strip out comments
1048 | $this->remove_noise("''is");
1049 | // strip out cdata
1050 | $this->remove_noise("''is", true);
1051 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1052 | // Script tags removal now preceeds style tag removal.
1053 | // strip out