├── LICENSE.md
├── README.md
├── simple_html_dom.php
└── sitemap-generator.php


/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Hemn Chawroka
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PHP XML Sitemap Generator
 2 | 
 3 | This is a simple and small PHP script that I wrote quickly for myself to create an XML sitemap of my page for Google and other search engines. Maybe others can use the script too.
 4 | 
 5 | Sitemap format: [http://www.sitemaps.org/protocol.html](http://www.sitemaps.org/protocol.html)
 6 | 
 7 | ## Features
 8 |  - Actually crawls webpages like Google would
 9 |  - Generates a separate XML file which gets updated every time the script is executed (runnable via CRON)
10 |  - Awesome for SEO
11 |  - Crawls faster than online services
12 |  - Adaptable
13 | 
14 | ## Usage
15 | Usage is pretty straightforward:
16 |  - Configure the crawler by modifying the `sitemap-generator.php` file
17 |     - Select URL to crawl
18 |     - Select the file to which the sitemap will be saved
19 |     - Select accepted extensions ("/" is mandatory for proper functionality)
20 |     - Select change frequency (always, daily, weekly, monthly, never, etc...)
21 |     - Choose priority (It is all relative so it may as well be 1)
22 |  - Generate sitemap
23 |     - Either send a GET request to this script or simply point your browser at it
24 |     - A sitemap will be generated and displayed
25 |     - Submit sitemap.xml to Google
26 |     - Set up a CRON job that sends a web request to this script every so often; this will keep the sitemap.xml file up to date
27 | 
28 | The script can be started as a CLI script or as a web page. CLI is the preferred way to start this script.
29 | 
30 | CLI scripts are started from the command line, can be used with CRON and so on. You start it with the php program.
31 | 
32 | CLI command to create the XML file: `php sitemap-generator.php`
33 | 
34 | To run the script as a web page served by your web server, change line 22 of the script from
35 | ```php
36 |    define ('CLI', true);
37 | ```
38 | to 
39 | ```php
40 |    define ('CLI', false);
41 | ```
42 | 
43 | 
44 | ## sitemap.xml
45 | Add the XML file to your `/robots.txt`.
46 | 
47 | Example line for the robots.txt:
48 | 
49 | ```
50 | Sitemap: http://www.iprodev.com/sitemap.xml
51 | ```
52 | 
53 | 
54 | ## Credits
55 | 
56 | PHP XML Sitemap Generator was created by [Hemn Chawroka](http://iprodev.com) from [iProDev](http://iprodev.com). Released under the MIT license.
57 | 
58 | Included scripts:
59 | 
60 |  - [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/) - An HTML DOM parser written in PHP5+ that lets you manipulate HTML in a very easy way!
61 | 


--------------------------------------------------------------------------------
/simple_html_dom.php:
--------------------------------------------------------------------------------
   1 | <?php
   2 | /**
   3 |  * Website: http://sourceforge.net/projects/simplehtmldom/
   4 |  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
   5 |  * Contributions by:
   6 |  *     Yousuke Kumakura (Attribute filters)
   7 |  *     Vadim Voituk (Negative indexes supports of "find" method)
   8 |  *     Antcs (Constructor with automatically load contents either text or file/url)
   9 |  *
  10 |  * all affected sections have comments starting with "PaperG"
  11 |  *
  12 |  * Paperg - Added case insensitive testing of the value of the selector.
  13 |  * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
  14 |  *  This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
  15 |  *  it will almost always be smaller by some amount.
  16 |  *  We use this to determine how far into the file the tag in question is.  This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
  17 |  *  but for most purposes, it's a really good estimation.
  18 |  * Paperg - Added the forceTagsClosed to the dom constructor.  Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
  19 |  * Allow the user to tell us how much they trust the html.
  20 |  * Paperg add the text and plaintext to the selectors for the find syntax.  plaintext implies text in the innertext of a node.  text implies that the tag is a text node.
  21 |  * This allows for us to find tags based on the text they contain.
  22 |  * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
  23 |  * Paperg: added parse_charset so that we know about the character set of the source document.
  24 |  *  NOTE:  If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
  25 |  *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
  26 |  *
  27 |  * Found infinite loop in the case of broken html in restore_noise.  Rewrote to protect from that.
  28 |  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
  29 |  *
  30 |  * Licensed under The MIT License
  31 |  * Redistributions of files must retain the above copyright notice.
  32 |  *
  33 |  * @author S.C. Chen <me578022@gmail.com>
  34 |  * @author John Schlick
  35 |  * @author Rus Carroll
  36 |  * @version 1.5 ($Rev: 196 $)
  37 |  * @package PlaceLocalInclude
  38 |  * @subpackage simple_html_dom
  39 |  */
  40 | 
  41 | /**
  42 |  * Constants shared by the helper functions and classes below.
  43 |  * @author S.C. Chen <me578022@gmail.com>
  44 |  */
  45 | define('HDOM_TYPE_ELEMENT', 1); // HDOM_TYPE_*: values held in simple_html_dom_node::$nodetype
  46 | define('HDOM_TYPE_COMMENT', 2);
  47 | define('HDOM_TYPE_TEXT',    3); // default $nodetype of a freshly constructed node
  48 | define('HDOM_TYPE_ENDTAG',  4);
  49 | define('HDOM_TYPE_ROOT',    5);
  50 | define('HDOM_TYPE_UNKNOWN', 6);
  51 | define('HDOM_QUOTE_DOUBLE', 0); // HDOM_QUOTE_*: attribute quoting styles that makeup() switches on
  52 | define('HDOM_QUOTE_SINGLE', 1);
  53 | define('HDOM_QUOTE_NO',     3);
  54 | define('HDOM_INFO_BEGIN',   0); // HDOM_INFO_*: keys of the per-node info array simple_html_dom_node::$_
  55 | define('HDOM_INFO_END',     1);
  56 | define('HDOM_INFO_QUOTE',   2);
  57 | define('HDOM_INFO_SPACE',   3);
  58 | define('HDOM_INFO_TEXT',    4);
  59 | define('HDOM_INFO_INNER',   5);
  60 | define('HDOM_INFO_OUTER',   6);
  61 | define('HDOM_INFO_ENDSPACE',7);
  62 | define('DEFAULT_TARGET_CHARSET', 'UTF-8'); // default $target_charset of file_get_html() / str_get_html()
  63 | define('DEFAULT_BR_TEXT', "\r\n"); // default $defaultBRText of the same helpers
  64 | define('DEFAULT_SPAN_TEXT', " "); // default $defaultSpanText of the same helpers
  65 | define('MAX_FILE_SIZE', 600000); // strlen() limit above which file_get_html() / str_get_html() return false
  66 | // helper functions
  67 | // -----------------------------------------------------------------------------
  68 | // get html dom from a file or URL; returns false when the content is unreadable, empty or longer than MAX_FILE_SIZE
  69 | // $maxLen is kept for signature compatibility only: it is not forwarded to file_get_contents().
  70 | function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  71 | {
  72 |     // We DO force the tags to be terminated.
  73 |     $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  74 |     // The legacy default offset of -1 meant "read from the start". PHP >= 7.1 treats negative offsets as
  75 |     // seek-from-end (which fails on remote streams), so the sentinel is mapped to 0 before reading.
  76 |     $contents = file_get_contents($url, $use_include_path, $context, ($offset === -1) ? 0 : $offset);
  77 |     if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
  78 |     {
  79 |         // Release the parser on failure, mirroring str_get_html().
  80 |         $dom->clear();
  81 |         return false;
  82 |     }
  83 |     $dom->load($contents, $lowercase, $stripRN);
  84 |     return $dom;
  85 | }
  86 | 
  87 | // parse an HTML string into a dom; yields false (after clearing the parser) for empty or oversized input
  88 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  89 | {
  90 |     $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  91 |     if (!empty($str) && strlen($str) <= MAX_FILE_SIZE)
  92 |     {
  93 |         $dom->load($str, $lowercase, $stripRN);
  94 |         return $dom;
  95 |     }
  96 |     $dom->clear();
  97 |     return false;
  98 | }
  99 | 
 100 | // dump html dom tree: forwards $show_attr and $deep to the node's own dump() (the node itself used to be passed as $show_attr)
 101 | function dump_html_tree($node, $show_attr=true, $deep=0)
 102 | {
 103 |     $node->dump($show_attr, $deep);
 104 | }
 105 | 
 106 | 
 107 | /**
 108 |  * simple html dom node
 109 |  * PaperG - added ability for "find" routine to lowercase the value of the selector.
 110 |  * PaperG - added $tag_start to track the start position of the tag in the total byte index
 111 |  *
 112 |  * @package PlaceLocalInclude
 113 |  */
 114 | class simple_html_dom_node
 115 | {
 116 |     public $nodetype = HDOM_TYPE_TEXT;
 117 |     public $tag = 'text';
 118 |     public $attr = array();
 119 |     public $children = array();
 120 |     public $nodes = array();
 121 |     public $parent = null;
 122 |     // The "info" array - see HDOM_INFO_... for what each element contains.
 123 |     public $_ = array();
 124 |     public $tag_start = 0;
 125 |     private $dom = null;
 126 | 
 127 |     function __construct($dom)
 128 |     {
 129 |         $dom->nodes[] = $this;  // register this node in the owning document's node list
 130 |         $this->dom = $dom;      // keep a back-reference to that document
 131 |     }
 132 | 
 133 |     function __destruct()
 134 |     {
 135 |         $this->clear(); // null out the dom/nodes/parent/children references on destruction
 136 |     }
 137 | 
 138 |     function __toString()
 139 |     {
 140 |         return $this->outertext(); // string conversion yields the node's outer text (tag included)
 141 |     }
 142 | 
 143 |     // drop every reference this node holds so that php5 can free the circular dom <-> node structures
 144 |     function clear()
 145 |     {
 146 |         $this->children = null;
 147 |         $this->parent = null;
 148 |         $this->nodes = null;
 149 |         $this->dom = null;
 150 |     }
 151 | 
 152 |     // print this node and, recursively, everything in $this->nodes; each level is indented by four spaces
 153 |     function dump($show_attr=true, $deep=0)
 154 |     {
 155 |         // indentation followed by the tag name
 156 |         echo str_repeat('    ', $deep), $this->tag;
 157 | 
 158 |         if ($show_attr && count($this->attr)>0)
 159 |         {
 160 |             echo '(';
 161 |             foreach (array_keys($this->attr) as $name)
 162 |                 echo '['.$name.']=>"'.$this->$name.'", ';
 163 |             echo ')';
 164 |         }
 165 |         echo "\n";
 166 | 
 167 |         if ($this->nodes)
 168 |         {
 169 |             $nextDepth = $deep+1;
 170 |             foreach ($this->nodes as $child)
 171 |             {
 172 |                 $child->dump($show_attr, $nextDepth);
 173 |             }
 174 |         }
 175 |     }
 175 | 
 176 | 
 177 |     // Debugging helper: describes this single node (tag, attributes, $_ info, text, counters); echoes the line when $echo is true, otherwise returns it.
 178 |     function dump_node($echo=true)
 179 |     {
 180 |         // Start with the tag name, then append attributes, the $_ info array and the counters.
 181 |         $string = $this->tag;
 182 |         if (count($this->attr)>0)
 183 |         {
 184 |             $string .= '(';
 185 |             foreach ($this->attr as $k=>$v)
 186 |             {
 187 |                 $string .= "[$k]=>\"".$this->$k.'", ';
 188 |             }
 189 |             $string .= ')';
 190 |         }
 191 |         if (count($this->_)>0)
 192 |         {
 193 |             $string .= ' $_ (';
 194 |             foreach ($this->_ as $k=>$v)
 195 |             {
 196 |                 // info entries may themselves be arrays (one level deep is expanded)
 197 |                 if (is_array($v))
 198 |                 {
 199 |                     $string .= "[$k]=>(";
 200 |                     foreach ($v as $k2=>$v2)
 201 |                     {
 202 |                         $string .= "[$k2]=>\"".$v2.'", ';
 203 |                     }
 204 |                     $string .= ")";
 205 |                 } else {
 206 |                     $string .= "[$k]=>\"".$v.'", ';
 207 |                 }
 208 |             }
 209 |             $string .= ")";
 210 |         }
 211 | 
 212 |         if (isset($this->text))
 213 |         {
 214 |             $string .= " text: (" . $this->text . ")";
 215 |         }
 216 |         // Fixed: this used to read an undefined $node, so the stored inner text was never shown.
 217 |         $string .= " HDOM_INNER_INFO: '";
 218 |         if (isset($this->_[HDOM_INFO_INNER]))
 219 |         {
 220 |             $string .= $this->_[HDOM_INFO_INNER] . "'";
 221 |         }
 222 |         else
 223 |         {
 224 |             $string .= ' NULL ';
 225 |         }
 226 | 
 227 |         $string .= " children: " . count($this->children);
 228 |         $string .= " nodes: " . count($this->nodes);
 229 |         $string .= " tag_start: " . $this->tag_start;
 230 |         $string .= "\n";
 231 | 
 232 |         if ($echo)
 233 |         {
 234 |             echo $string;
 235 |             return;
 236 |         }
 237 |         else
 238 |         {
 239 |             return $string;
 240 |         }
 241 |     }
 241 | 
 242 |     // Getter/setter for the parent node.
 243 |     // Without an argument the current parent is returned; with a node, that node becomes the parent and is returned.
 244 |     function parent($parent=null)
 245 |     {
 246 |         // Known limitation (kept as is): the node is NOT removed from the nodes/children
 247 |         // lists of its previous parent before it is appended to the new one.
 248 |         if (null !== $parent)
 249 |         {
 250 |             $this->parent = $parent;
 251 |             $parent->nodes[] = $this;
 252 |             $parent->children[] = $this;
 253 |         }
 254 | 
 255 |         return $this->parent;
 256 |     }
 257 | 
 258 |     // true when the node has at least one entry in its children list
 259 |     function has_child()
 260 |     {
 261 |         return empty($this->children) ? false : true;
 262 |     }
 263 | 
 264 |     // returns all children when $idx is -1 (the default), otherwise the child at $idx or null if there is none
 265 |     function children($idx=-1)
 266 |     {
 267 |         if ($idx!==-1)
 268 |         {
 269 |             // indexed lookup
 270 |             return isset($this->children[$idx]) ? $this->children[$idx] : null;
 271 |         }
 272 |         return $this->children;
 273 |     }
 274 | 
 275 |     // returns the first child of node, or null when there is none
 276 |     function first_child()
 277 |     {
 278 |         // empty() also covers the null that clear() stores in $children;
 279 |         // count(null) would throw a TypeError on PHP 8.
 280 |         if (empty($this->children)) return null;
 281 | 
 282 |         return $this->children[0];
 283 |     }
 284 | 
 285 |     // returns the last child of node, or null when there is none
 286 |     function last_child()
 287 |     {
 288 |         // empty() also covers the null that clear() stores in $children;
 289 |         // count(null) would throw a TypeError on PHP 8.
 290 |         if (empty($this->children)) return null;
 291 | 
 292 |         return $this->children[count($this->children)-1];
 293 |     }
 294 | 
 295 |     // returns the next sibling of node; null for a parentless node, the last child, or a node not listed in its parent's children
 296 |     function next_sibling()
 297 |     {
 298 |         if ($this->parent===null) return null;
 299 | 
 300 |         $siblings = $this->parent->children;
 301 |         $total = count($siblings);
 302 | 
 303 |         // locate this node (by identity) among the parent's children
 304 |         for ($pos=0; $pos<$total; ++$pos)
 305 |         {
 306 |             if ($siblings[$pos]===$this) break;
 307 |         }
 308 | 
 309 |         // $pos equals $total when the node was not found,
 310 |         // so $next is then out of range as well
 311 |         $next = $pos+1;
 312 |         if ($next<$total) return $siblings[$next];
 313 |         return null;
 314 |     }
 315 | 
 316 |     // returns the previous sibling of node, or null when there is none
 317 |     function prev_sibling()
 318 |     {
 319 |         if ($this->parent===null) return null;
 320 |         // Identity search. The former counting loop ended with $idx == count when this node
 321 |         // was not among the parent's children and then returned the parent's LAST child;
 322 |         // a node that is not found (or is the first child) now has no previous sibling.
 323 |         $idx = array_search($this, $this->parent->children, true);
 324 |         if ($idx===false || $idx<1) return null;
 325 |         return $this->parent->children[$idx-1];
 326 |     }
 327 | 
 328 |     // walk from this node towards the root and return the first node whose tag equals $tag (the node itself counts); null if none
 329 |     function find_ancestor_tag($tag)
 330 |     {
 331 |         global $debugObject;
 332 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 333 | 
 334 |         // The search starts at the node itself, not at its parent,
 335 |         // and follows the parent links until they run out.
 336 |         for ($candidate=$this; !is_null($candidate); $candidate=$candidate->parent)
 337 |         {
 338 |             if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $candidate->tag); }
 339 | 
 340 |             // loose (==) comparison of the tag names
 341 |             if ($candidate->tag == $tag)
 342 |             {
 343 |                 return $candidate;
 344 |             }
 345 |         }
 346 | 
 347 |         return null;
 348 |     }
 349 | 
 350 |     // get dom node's inner html: a stored inner value wins, then the restored raw text, else the child nodes rendered in order
 351 |     function innertext()
 352 |     {
 353 |         $info = $this->_;
 354 |         if (isset($info[HDOM_INFO_INNER])) return $info[HDOM_INFO_INNER];
 355 |         if (isset($info[HDOM_INFO_TEXT])) return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
 356 | 
 357 |         $html = '';
 358 |         foreach ($this->nodes as $child) { $html .= $child->outertext(); }
 359 |         return $html;
 360 |     }
 361 | 
 362 |     // get dom node's outer text (with tag): begin tag + inner content + end tag, unless a cached/text value short-circuits it
 363 |     function outertext()
 364 |     {
 365 |         global $debugObject;
 366 |         if (is_object($debugObject))
 367 |         {
 368 |             // debug tracing only; has no effect on the returned text
 369 |             $text = '';
 370 |             if ($this->tag == 'text')
 371 |             {
 372 |                 if (!empty($this->text))
 373 |                 {
 374 |                     $text = " with text: " . $this->text;
 375 |                 }
 376 |             }
 377 |             $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
 378 |         }
 379 | 
 380 |         if ($this->tag==='root') return $this->innertext(); // the root has no tag of its own to render
 381 | 
 382 |         // trigger callback (it receives this node and runs before any of the cached values below are read)
 383 |         if ($this->dom && $this->dom->callback!==null)
 384 |         {
 385 |             call_user_func_array($this->dom->callback, array($this));
 386 |         }
 387 | 
 388 |         if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; // stored outer value wins
 389 |         if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); // raw text, noise restored
 390 | 
 391 |         // render begin tag (empty string when the node is detached from a dom or its begin node is falsy)
 392 |         if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
 393 |         {
 394 |             $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
 395 |         } else {
 396 |             $ret = "";
 397 |         }
 398 | 
 399 |         // render inner text
 400 |         if (isset($this->_[HDOM_INFO_INNER]))
 401 |         {
 402 |             // If it's a br tag...  don't return the HDOM_INNER_INFO that we may or may not have added.
 403 |             if ($this->tag != "br")
 404 |             {
 405 |                 $ret .= $this->_[HDOM_INFO_INNER];
 406 |             }
 407 |         } else {
 408 |             if ($this->nodes)
 409 |             {
 410 |                 foreach ($this->nodes as $n)
 411 |                 {
 412 |                     $ret .= $this->convert_text($n->outertext());
 413 |                 }
 414 |             }
 415 |         }
 416 | 
 417 |         // render end tag (only when an end index is recorded and it is not 0)
 418 |         if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
 419 |             $ret .= '</'.$this->tag.'>';
 420 |         return $ret;
 421 |     }
 421 | 
 422 |     // get dom node's plain text: comments, unknown nodes, script and style contribute nothing
 423 |     function text()
 424 |     {
 425 |         if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; // a stored inner value is returned as is
 426 |         switch ($this->nodetype)
 427 |         {
 428 |             case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 429 |             case HDOM_TYPE_COMMENT: return '';
 430 |             case HDOM_TYPE_UNKNOWN: return '';
 431 |         }
 432 |         if (strcasecmp($this->tag, 'script')===0) return ''; // tag names compared case-insensitively
 433 |         if (strcasecmp($this->tag, 'style')===0) return '';
 434 | 
 435 |         $ret = '';
 436 |         // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
 437 |         // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
 438 |         // WHY is this happening?
 439 |         if (!is_null($this->nodes))
 440 |         {
 441 |             // concatenate the plain text of every child node, passing each piece through convert_text()
 442 |             foreach ($this->nodes as $n)
 443 |             {
 444 |                 $ret .= $this->convert_text($n->text());
 445 |             }
 446 | 
 447 |             // If this node is a span... add a space at the end of it so multiple spans don't run into each other.  This is plaintext after all.
 448 |             if ($this->tag == "span")
 449 |             {
 450 |                 $ret .= $this->dom->default_span_text;
 451 |             }
 452 | 
 453 | 
 454 |         }
 455 |         return $ret;
 456 |     }
 456 | 
 457 |     function xmltext()
 458 |     {
 459 |         // inner html with the CDATA wrappers removed (the opening marker is matched case-insensitively)
 460 |         $inner = $this->innertext();
 461 |         $withoutOpen = str_ireplace('<![CDATA[', '', $inner);
 462 |         return str_replace(']]>', '', $withoutOpen);
 463 |     }
 464 | 
 465 |     // build node's text with tag: the opening tag with its attributes, using the recorded spacing and quoting
 466 |     function makeup()
 467 |     {
 468 |         // text, comment, unknown
 469 |         if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 470 | 
 471 |         $ret = '<'.$this->tag;
 472 |         $i = -1; // position of the current attribute; indexes the parallel HDOM_INFO_SPACE / HDOM_INFO_QUOTE arrays
 473 | 
 474 |         foreach ($this->attr as $key=>$val)
 475 |         {
 476 |             ++$i; // advanced before the skip below so removed attributes still consume their slot
 477 | 
 478 |             // skip removed attribute
 479 |             if ($val===null || $val===false)
 480 |                 continue;
 481 | 
 482 |             $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; // whitespace recorded in front of the attribute name
 483 |             //no value attr: nowrap, checked selected...
 484 |             if ($val===true)
 485 |                 $ret .= $key;
 486 |             else {
 487 |                 switch ($this->_[HDOM_INFO_QUOTE][$i])
 488 |                 {
 489 |                     case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
 490 |                     case HDOM_QUOTE_SINGLE: $quote = '\''; break;
 491 |                     default: $quote = '';
 492 |                 }
 493 |                 $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; // [1]/[2]: spacing around '='
 494 |             }
 495 |         }
 496 |         $ret = $this->dom->restore_noise($ret);
 497 |         return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
 498 |     }
 499 | 
 500 |     // find elements by css selector; returns an array of nodes, or a single node / null when $idx is given (negative $idx counts from the end)
 501 |     //PaperG - added ability for find to lowercase the value of the selector.
 502 |     function find($selector, $idx=null, $lowercase=false)
 503 |     {
 504 |         $selectors = $this->parse_selector($selector);
 505 |         if (($count=count($selectors))===0) return array();
 506 |         $found_keys = array(); // set of matching indexes into $this->dom->nodes (index => 1)
 507 | 
 508 |         // find each selector
 509 |         for ($c=0; $c<$count; ++$c)
 510 |         {
 511 |             // The change on the below line was documented on the sourceforge code tracker id 2788009
 512 |             // used to be: if (($levle=count($selectors[0]))===0) return array();
 513 |             if (($levle=count($selectors[$c]))===0) return array();
 514 |             if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
 515 | 
 516 |             $head = array($this->_[HDOM_INFO_BEGIN]=>1); // the search starts from this node only
 517 | 
 518 |             // handle descendant selectors, no recursive!
 519 |             for ($l=0; $l<$levle; ++$l)
 520 |             {
 521 |                 $ret = array();
 522 |                 foreach ($head as $k=>$v)
 523 |                 {
 524 |                     $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
 525 |                     //PaperG - Pass this optional parameter on to the seek function.
 526 |                     $n->seek($selectors[$c][$l], $ret, $lowercase);
 527 |                 }
 528 |                 $head = $ret; // matches of this level become the starting points of the next level
 529 |             }
 530 | 
 531 |             foreach ($head as $k=>$v)
 532 |             {
 533 |                 if (!isset($found_keys[$k]))
 534 |                     $found_keys[$k] = 1;
 535 |             }
 536 |         }
 537 | 
 538 |         // sort keys
 539 |         ksort($found_keys);
 540 | 
 541 |         $found = array();
 542 |         foreach ($found_keys as $k=>$v)
 543 |             $found[] = $this->dom->nodes[$k];
 544 | 
 545 |         // return nth-element or array
 546 |         if (is_null($idx)) return $found;
 547 |         else if ($idx<0) $idx = count($found) + $idx;
 548 |         return (isset($found[$idx])) ? $found[$idx] : null;
 549 |     }
 550 | 
 551 |     // seek for given conditions: tests the nodes between this node's begin and end index against one parsed selector and records matches in $ret (node index => 1)
 552 |     // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
 553 |     protected function seek($selector, &$ret, $lowercase=false)
 554 |     {
 555 |         global $debugObject;
 556 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 557 | 
 558 |         list($tag, $key, $val, $exp, $no_key) = $selector;
 559 | 
 560 |         // xpath index
 561 |         if ($tag && $key && is_numeric($key))
 562 |         {
 563 |             $count = 0;
 564 |             foreach ($this->children as $c)
 565 |             {
 566 |                 if ($tag==='*' || $tag===$c->tag) {
 567 |                     if (++$count==$key) {
 568 |                         $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
 569 |                         return;
 570 |                     }
 571 |                 }
 572 |             }
 573 |             return;
 574 |         }
 575 | 
 576 |         $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
 577 |         if ($end==0) {
 578 |             // no usable end index on this node: derive one from the nearest ancestor that has HDOM_INFO_END set
 579 |             $parent = $this->parent;
 580 |             while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
 581 |                 $end -= 1;
 582 |                 $parent = $parent->parent;
 583 |             }
 584 |             $end += $parent->_[HDOM_INFO_END];
 585 |         }
 586 | 
 587 |         for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
 588 |             $node = $this->dom->nodes[$i];
 589 | 
 590 |             $pass = true;
 591 | 
 592 |             if ($tag==='*' && !$key) {
 593 |                 if (in_array($node, $this->children, true))
 594 |                     $ret[$i] = 1;
 595 |                 continue;
 596 |             }
 597 | 
 598 |             // compare tag
 599 |             if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
 600 |             // compare key
 601 |             if ($pass && $key) {
 602 |                 if ($no_key) {
 603 |                     if (isset($node->attr[$key])) $pass=false;
 604 |                 } else {
 605 |                     if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
 606 |                 }
 607 |             }
 608 |             // compare value
 609 |             if ($pass && $key && $val  && $val!=='*') {
 610 |                 // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
 611 |                 if ($key == "plaintext") {
 612 |                     // $node->plaintext actually returns $node->text();
 613 |                     $nodeKeyValue = $node->text();
 614 |                 } else {
 615 |                     // this is a normal search, we want the value of that attribute of the tag.
 616 |                     $nodeKeyValue = $node->attr[$key];
 617 |                 }
 618 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
 619 | 
 620 |                 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
 621 |                 if ($lowercase) {
 622 |                     $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
 623 |                 } else {
 624 |                     $check = $this->match($exp, $val, $nodeKeyValue);
 625 |                 }
 626 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}
 627 | 
 628 |                 // handle multiple class (retry the match against each space-separated class name)
 629 |                 if (!$check && strcasecmp($key, 'class')===0) {
 630 |                     foreach (explode(' ',$node->attr[$key]) as $k) {
 631 |                         // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
 632 |                         if (!empty($k)) {
 633 |                             if ($lowercase) {
 634 |                                 $check = $this->match($exp, strtolower($val), strtolower($k));
 635 |                             } else {
 636 |                                 $check = $this->match($exp, $val, $k);
 637 |                             }
 638 |                             if ($check) break;
 639 |                         }
 640 |                     }
 641 |                 }
 642 |                 if (!$check) $pass = false;
 643 |             }
 644 |             if ($pass) $ret[$i] = 1;
 645 |             unset($node);
 646 |         }
 647 |         // It's passed by reference so this is actually what this function returns.
 648 |         if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
 649 |     }
 649 | 
 650 |     protected function match($exp, $pattern, $value) {
 651 |         global $debugObject;
 652 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 653 | 
 654 |         // Test $value against $pattern with the selector operator $exp; an unknown operator never matches.
 655 |         if ($exp == '=') {
 656 |             return ($value===$pattern);
 657 |         } elseif ($exp == '!=') {
 658 |             return ($value!==$pattern);
 659 |         } elseif ($exp == '^=') {
 660 |             return preg_match("/^".preg_quote($pattern,'/')."/", $value);
 661 |         } elseif ($exp == '$=') {
 662 |             return preg_match("/".preg_quote($pattern,'/')."$/", $value);
 663 |         } elseif ($exp == '*=') {
 664 |             // a pattern that already starts with "/" is used as a complete regular expression
 665 |             $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
 666 |             return preg_match($regex, $value);
 667 |         }
 668 |         // unknown operator
 669 |         return false;
 670 |     }
 671 | 
 672 |     protected function parse_selector($selector_string) {
 673 |         global $debugObject;
 674 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 675 | 
 676 |         // pattern of CSS selectors, modified from mootools
 677 |         // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
 678 |         // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
 679 | // Notice the \[ starting the attbute?  and the @? following?  This implies that an attribute can begin with an @ sign that is not captured.
 680 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
 681 | // farther study is required to determine of this should be documented or removed.
 682 | //        $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 683 |         $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 684 |         preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
 685 |         if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
 686 | 
 687 |         $selectors = array();
 688 |         $result = array();
 689 |         //print_r($matches);
 690 | 
 691 |         foreach ($matches as $m) {
 692 |             $m[0] = trim($m[0]);
 693 |             if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
 694 |             // for browser generated xpath
 695 |             if ($m[1]==='tbody') continue;
 696 | 
 697 |             list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
 698 |             if (!empty($m[2])) {$key='id'; $val=$m[2];}
 699 |             if (!empty($m[3])) {$key='class'; $val=$m[3];}
 700 |             if (!empty($m[4])) {$key=$m[4];}
 701 |             if (!empty($m[5])) {$exp=$m[5];}
 702 |             if (!empty($m[6])) {$val=$m[6];}
 703 | 
 704 |             // convert to lowercase
 705 |             if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
 706 |             //elements that do NOT have the specified attribute
 707 |             if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
 708 | 
 709 |             $result[] = array($tag, $key, $val, $exp, $no_key);
 710 |             if (trim($m[7])===',') {
 711 |                 $selectors[] = $result;
 712 |                 $result = array();
 713 |             }
 714 |         }
 715 |         if (count($result)>0)
 716 |             $selectors[] = $result;
 717 |         return $selectors;
 718 |     }
 719 | 
 720 |     function __get($name) {
 721 |         if (isset($this->attr[$name]))
 722 |         {
 723 |             return $this->convert_text($this->attr[$name]);
 724 |         }
 725 |         switch ($name) {
 726 |             case 'outertext': return $this->outertext();
 727 |             case 'innertext': return $this->innertext();
 728 |             case 'plaintext': return $this->text();
 729 |             case 'xmltext': return $this->xmltext();
 730 |             default: return array_key_exists($name, $this->attr);
 731 |         }
 732 |     }
 733 | 
 734 |     function __set($name, $value) {
 735 |         switch ($name) {
 736 |             case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
 737 |             case 'innertext':
 738 |                 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
 739 |                 return $this->_[HDOM_INFO_INNER] = $value;
 740 |         }
 741 |         if (!isset($this->attr[$name])) {
 742 |             $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
 743 |             $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
 744 |         }
 745 |         $this->attr[$name] = $value;
 746 |     }
 747 | 
 748 |     function __isset($name) {
 749 |         switch ($name) {
 750 |             case 'outertext': return true;
 751 |             case 'innertext': return true;
 752 |             case 'plaintext': return true;
 753 |         }
 754 |         //no value attr: nowrap, checked selected...
 755 |         return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
 756 |     }
 757 | 
 758 |     function __unset($name) {
 759 |         if (isset($this->attr[$name]))
 760 |             unset($this->attr[$name]);
 761 |     }
 762 | 
 763 |     // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
 764 |     function convert_text($text)
 765 |     {
 766 |         global $debugObject;
 767 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 768 | 
 769 |         $converted_text = $text;
 770 | 
 771 |         $sourceCharset = "";
 772 |         $targetCharset = "";
 773 | 
 774 |         if ($this->dom)
 775 |         {
 776 |             $sourceCharset = strtoupper($this->dom->_charset);
 777 |             $targetCharset = strtoupper($this->dom->_target_charset);
 778 |         }
 779 |         if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
 780 | 
 781 |         if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
 782 |         {
 783 |             // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
 784 |             if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
 785 |             {
 786 |                 $converted_text = $text;
 787 |             }
 788 |             else
 789 |             {
 790 |                 $converted_text = iconv($sourceCharset, $targetCharset, $text);
 791 |             }
 792 |         }
 793 | 
 794 |         // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
 795 |         if ($targetCharset == 'UTF-8')
 796 |         {
 797 |             if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
 798 |             {
 799 |                 $converted_text = substr($converted_text, 3);
 800 |             }
 801 |             if (substr($converted_text, -3) == "\xef\xbb\xbf")
 802 |             {
 803 |                 $converted_text = substr($converted_text, 0, -3);
 804 |             }
 805 |         }
 806 | 
 807 |         return $converted_text;
 808 |     }
 809 | 
 810 |     /**
 811 |     * Returns true if $string is valid UTF-8 and false otherwise.
 812 |     *
 813 |     * @param mixed $str String to be tested
 814 |     * @return boolean
 815 |     */
 816 |     static function is_utf8($str)
 817 |     {
 818 |         $c=0; $b=0;
 819 |         $bits=0;
 820 |         $len=strlen($str);
 821 |         for($i=0; $i<$len; $i++)
 822 |         {
 823 |             $c=ord($str[$i]);
 824 |             if($c > 128)
 825 |             {
 826 |                 if(($c >= 254)) return false;
 827 |                 elseif($c >= 252) $bits=6;
 828 |                 elseif($c >= 248) $bits=5;
 829 |                 elseif($c >= 240) $bits=4;
 830 |                 elseif($c >= 224) $bits=3;
 831 |                 elseif($c >= 192) $bits=2;
 832 |                 else return false;
 833 |                 if(($i+$bits) > $len) return false;
 834 |                 while($bits > 1)
 835 |                 {
 836 |                     $i++;
 837 |                     $b=ord($str[$i]);
 838 |                     if($b < 128 || $b > 191) return false;
 839 |                     $bits--;
 840 |                 }
 841 |             }
 842 |         }
 843 |         return true;
 844 |     }
 845 |     /*
 846 |     function is_utf8($string)
 847 |     {
 848 |         //this is buggy
 849 |         return (utf8_encode(utf8_decode($string)) == $string);
 850 |     }
 851 |     */
 852 | 
 853 |     /**
 854 |      * Function to try a few tricks to determine the displayed size of an img on the page.
 855 |      * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 856 |      *
 857 |      * @author John Schlick
 858 |      * @version April 19 2012
 859 |      * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
 860 |      */
 861 |     function get_display_size()
 862 |     {
 863 |         global $debugObject;
 864 | 
 865 |         $width = -1;
 866 |         $height = -1;
 867 | 
 868 |         if ($this->tag !== 'img')
 869 |         {
 870 |             return false;
 871 |         }
 872 | 
 873 |         // See if there is aheight or width attribute in the tag itself.
 874 |         if (isset($this->attr['width']))
 875 |         {
 876 |             $width = $this->attr['width'];
 877 |         }
 878 | 
 879 |         if (isset($this->attr['height']))
 880 |         {
 881 |             $height = $this->attr['height'];
 882 |         }
 883 | 
 884 |         // Now look for an inline style.
 885 |         if (isset($this->attr['style']))
 886 |         {
 887 |             // Thanks to user gnarf from stackoverflow for this regular expression.
 888 |             $attributes = array();
 889 |             preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
 890 |             foreach ($matches as $match) {
 891 |               $attributes[$match[1]] = $match[2];
 892 |             }
 893 | 
 894 |             // If there is a width in the style attributes:
 895 |             if (isset($attributes['width']) && $width == -1)
 896 |             {
 897 |                 // check that the last two characters are px (pixels)
 898 |                 if (strtolower(substr($attributes['width'], -2)) == 'px')
 899 |                 {
 900 |                     $proposed_width = substr($attributes['width'], 0, -2);
 901 |                     // Now make sure that it's an integer and not something stupid.
 902 |                     if (filter_var($proposed_width, FILTER_VALIDATE_INT))
 903 |                     {
 904 |                         $width = $proposed_width;
 905 |                     }
 906 |                 }
 907 |             }
 908 | 
 909 |             // If there is a width in the style attributes:
 910 |             if (isset($attributes['height']) && $height == -1)
 911 |             {
 912 |                 // check that the last two characters are px (pixels)
 913 |                 if (strtolower(substr($attributes['height'], -2)) == 'px')
 914 |                 {
 915 |                     $proposed_height = substr($attributes['height'], 0, -2);
 916 |                     // Now make sure that it's an integer and not something stupid.
 917 |                     if (filter_var($proposed_height, FILTER_VALIDATE_INT))
 918 |                     {
 919 |                         $height = $proposed_height;
 920 |                     }
 921 |                 }
 922 |             }
 923 | 
 924 |         }
 925 | 
 926 |         // Future enhancement:
 927 |         // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
 928 | 
 929 |         // Far future enhancement
 930 |         // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
 931 |         // Note that in this case, the class or id will have the img subselector for it to apply to the image.
 932 | 
 933 |         // ridiculously far future development
 934 |         // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
 935 | 
 936 |         $result = array('height' => $height,
 937 |                         'width' => $width);
 938 |         return $result;
 939 |     }
 940 | 
 941 |     // camel naming conventions
 942 |     function getAllAttributes() {return $this->attr;}
 943 |     function getAttribute($name) {return $this->__get($name);}
 944 |     function setAttribute($name, $value) {$this->__set($name, $value);}
 945 |     function hasAttribute($name) {return $this->__isset($name);}
 946 |     function removeAttribute($name) {$this->__set($name, null);}
 947 |     function getElementById($id) {return $this->find("#$id", 0);}
 948 |     function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
 949 |     function getElementByTagName($name) {return $this->find($name, 0);}
 950 |     function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
 951 |     function parentNode() {return $this->parent();}
 952 |     function childNodes($idx=-1) {return $this->children($idx);}
 953 |     function firstChild() {return $this->first_child();}
 954 |     function lastChild() {return $this->last_child();}
 955 |     function nextSibling() {return $this->next_sibling();}
 956 |     function previousSibling() {return $this->prev_sibling();}
 957 |     function hasChildNodes() {return $this->has_child();}
 958 |     function nodeName() {return $this->tag;}
 959 |     function appendChild($node) {$node->parent($this); return $node;}
 960 | 
 961 | }
 962 | 
 963 | /**
 964 |  * simple html dom parser
 965 |  * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
 966 |  * Paperg - change $size from protected to public so we can easily access it
 967 |  * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not.  Default is to NOT trust it.
 968 |  *
 969 |  * @package PlaceLocalInclude
 970 |  */
 971 | class simple_html_dom
 972 | {
    // Root node of the parsed document; prepare() creates it with tag 'root'.
    public $root = null;
    // Node list: emptied by prepare(), and every entry is clear()ed by clear().
    public $nodes = array();
    // Value stored by set_callback(); null when no callback is registered.
    public $callback = null;
    // Copy of the $lowercase flag given to prepare(); read_tag() uses it to decide whether tag names are lowercased.
    public $lowercase = false;
    // Used to keep track of how large the text was when we started.
    public $original_size;
    // Byte length of $doc; prepare() recomputes it after replacing CR/LF.
    public $size;
    // Current byte offset of the parser inside $doc.
    protected $pos;
    // The HTML string being parsed.
    protected $doc;
    // Character of $doc at $pos, or null once $pos has run past the end.
    protected $char;
    // Running counter; starts at 1 in prepare() and is incremented for each node the parser creates.
    protected $cursor;
    // Node that acts as the current parent while parsing.
    protected $parent;
    protected $noise = array();
    // Character sets handed to the skip/copy helpers while tokenizing (see read_tag()).
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
    // $_charset is filled in by parse_charset(); $_target_charset by the constructor.
    public $_charset = '';
    public $_target_charset = '';
    // Defaults copied from the $defaultBRText / $defaultSpanText arguments of prepare().
    protected $default_br_text = "";
    public $default_span_text = "";

    // use isset instead of in_array, performance boost about 30%...
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
    // Tags whose closing tag makes read_tag() walk up the parent chain looking for the matching opener.
    protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
    // Known sourceforge issue #2977341
    // B tags that are not closed cause us to return everything to the end of the document.
    // Map of tag => parent tags: when read_tag() opens <tag> while the current parent is one of
    // the listed tags, that parent is treated as implicitly closed.
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
        'nobr'=>array('nobr'=>1),
        'b'=>array('b'=>1),
		'option'=>array('option'=>1),
    );
1014 | 
1015 |     function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1016 |     {
1017 |         if ($str)
1018 |         {
1019 |             if (preg_match("/^http:\/\//i",$str) || is_file($str))
1020 |             {
1021 |                 $this->load_file($str);
1022 |             }
1023 |             else
1024 |             {
1025 |                 $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1026 |             }
1027 |         }
1028 |         // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
1029 |         if (!$forceTagsClosed) {
1030 |             $this->optional_closing_array=array();
1031 |         }
1032 |         $this->_target_charset = $target_charset;
1033 |     }
1034 | 
    // Releases the node graph through clear() when the parser object is destroyed.
    function __destruct()
    {
        $this->clear();
    }
1039 | 
    /**
     * Load html from string: reset the parser, strip the "noise" regions, build
     * the node tree and detect the document charset.
     *
     * The order of the remove_noise() calls is significant (see the sourceforge
     * note below), so they must not be reordered.
     *
     * @param string $str             HTML text to parse.
     * @param bool   $lowercase       Forwarded to prepare().
     * @param bool   $stripRN         Forwarded to prepare(); when true CR and LF are replaced by spaces.
     * @param string $defaultBRText   Forwarded to prepare().
     * @param string $defaultSpanText Forwarded to prepare().
     * @return simple_html_dom $this, so that calls can be chained.
     */
    function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
    {
        global $debugObject;

        // prepare
        $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tags removal now precedes style tag removal.
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
        // strip smarty scripts
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

        // parsing: parse() returns false once the end of the document is reached
        while ($this->parse());
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        $this->parse_charset();

        // make load function chainable
        return $this;

    }
1076 | 
1077 |     // load html from file
1078 |     function load_file()
1079 |     {
1080 |         $args = func_get_args();
1081 |         $this->load(call_user_func_array('file_get_contents', $args), true);
1082 |         // Throw an error if we can't properly load the dom.
1083 |         if (($error=error_get_last())!==null) {
1084 |             $this->clear();
1085 |             return false;
1086 |         }
1087 |     }
1088 | 
    // set callback function
    // $function_name is stored as-is in $this->callback; it is not validated here.
    function set_callback($function_name)
    {
        $this->callback = $function_name;
    }
1094 | 
    // remove callback function
    // Resets $this->callback to null.
    function remove_callback()
    {
        $this->callback = null;
    }
1100 | 
1101 |     // save dom as string
1102 |     function save($filepath='')
1103 |     {
1104 |         $ret = $this->root->innertext();
1105 |         if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
1106 |         return $ret;
1107 |     }
1108 | 
    // find dom node by css selector
    // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
    // Delegates to the root node's find() with the same arguments.
    function find($selector, $idx=null, $lowercase=false)
    {
        return $this->root->find($selector, $idx, $lowercase);
    }
1115 | 
1116 |     // clean up memory due to php5 circular references memory leak...
1117 |     function clear()
1118 |     {
1119 |         foreach ($this->nodes as $n) {$n->clear(); $n = null;}
1120 |         // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
1121 |         if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;}
1122 |         if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
1123 |         if (isset($this->root)) {$this->root->clear(); unset($this->root);}
1124 |         unset($this->doc);
1125 |         unset($this->noise);
1126 |     }
1127 | 
    // Debug helper: delegates to the root node's dump(), passing $show_attr through.
    function dump($show_attr=true)
    {
        $this->root->dump($show_attr);
    }
1132 | 
1133 |     // prepare HTML data and init everything
1134 |     protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1135 |     {
1136 |         $this->clear();
1137 | 
1138 |         // set the length of content before we do anything to it.
1139 |         $this->size = strlen($str);
1140 |         // Save the original size of the html that we got in.  It might be useful to someone.
1141 |         $this->original_size = $this->size;
1142 | 
1143 |         //before we save the string as the doc...  strip out the \r \n's if we are told to.
1144 |         if ($stripRN) {
1145 |             $str = str_replace("\r", " ", $str);
1146 |             $str = str_replace("\n", " ", $str);
1147 | 
1148 |             // set the length of content since we have changed it.
1149 |             $this->size = strlen($str);
1150 |         }
1151 | 
1152 |         $this->doc = $str;
1153 |         $this->pos = 0;
1154 |         $this->cursor = 1;
1155 |         $this->noise = array();
1156 |         $this->nodes = array();
1157 |         $this->lowercase = $lowercase;
1158 |         $this->default_br_text = $defaultBRText;
1159 |         $this->default_span_text = $defaultSpanText;
1160 |         $this->root = new simple_html_dom_node($this);
1161 |         $this->root->tag = 'root';
1162 |         $this->root->_[HDOM_INFO_BEGIN] = -1;
1163 |         $this->root->nodetype = HDOM_TYPE_ROOT;
1164 |         $this->parent = $this->root;
1165 |         if ($this->size>0) $this->char = $this->doc[0];
1166 |     }
1167 | 
1168 |     // parse html content
1169 |     protected function parse()
1170 |     {
1171 |         if (($s = $this->copy_until_char('<'))==='')
1172 |         {
1173 |             return $this->read_tag();
1174 |         }
1175 | 
1176 |         // text
1177 |         $node = new simple_html_dom_node($this);
1178 |         ++$this->cursor;
1179 |         $node->_[HDOM_INFO_TEXT] = $s;
1180 |         $this->link_nodes($node, false);
1181 |         return true;
1182 |     }
1183 | 
1184 |     // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
1185 |     // NOTE:  IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
1186 |     // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
1187 |     protected function parse_charset()
1188 |     {
1189 |         global $debugObject;
1190 | 
1191 |         $charset = null;
1192 | 
1193 |         if (function_exists('get_last_retrieve_url_contents_content_type'))
1194 |         {
1195 |             $contentTypeHeader = get_last_retrieve_url_contents_content_type();
1196 |             $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
1197 |             if ($success)
1198 |             {
1199 |                 $charset = $matches[1];
1200 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, 'header content-type found charset of: ' . $charset);}
1201 |             }
1202 | 
1203 |         }
1204 | 
1205 |         if (empty($charset))
1206 |         {
1207 |             $el = $this->root->find('meta[http-equiv=Content-Type]',0);
1208 |             if (!empty($el))
1209 |             {
1210 |                 $fullvalue = $el->content;
1211 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);}
1212 | 
1213 |                 if (!empty($fullvalue))
1214 |                 {
1215 |                     $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
1216 |                     if ($success)
1217 |                     {
1218 |                         $charset = $matches[1];
1219 |                     }
1220 |                     else
1221 |                     {
1222 |                         // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1223 |                         if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
1224 |                         $charset = 'ISO-8859-1';
1225 |                     }
1226 |                 }
1227 |             }
1228 |         }
1229 | 
1230 |         // If we couldn't find a charset above, then lets try to detect one based on the text we got...
1231 |         if (empty($charset))
1232 |         {
1233 |             // Have php try to detect the encoding from the text given to us.
1234 |             $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1235 |             if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);}
1236 | 
1237 |             // and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1238 |             if ($charset === false)
1239 |             {
1240 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');}
1241 |                 $charset = 'UTF-8';
1242 |             }
1243 |         }
1244 | 
1245 |         // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1246 |         if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1247 |         {
1248 |             if (is_object($debugObject)) {$debugObject->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
1249 |             $charset = 'CP1252';
1250 |         }
1251 | 
1252 |         if (is_object($debugObject)) {$debugObject->debugLog(1, 'EXIT - ' . $charset);}
1253 | 
1254 |         return $this->_charset = $charset;
1255 |     }
1256 | 
1257 |     // read tag info
1258 |     protected function read_tag()
1259 |     {
1260 |         if ($this->char!=='<')
1261 |         {
1262 |             $this->root->_[HDOM_INFO_END] = $this->cursor;
1263 |             return false;
1264 |         }
1265 |         $begin_tag_pos = $this->pos;
1266 |         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1267 | 
1268 |         // end tag
1269 |         if ($this->char==='/')
1270 |         {
1271 |             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1272 |             // This represents the change in the simple_html_dom trunk from revision 180 to 181.
1273 |             // $this->skip($this->token_blank_t);
1274 |             $this->skip($this->token_blank);
1275 |             $tag = $this->copy_until_char('>');
1276 | 
1277 |             // skip attributes in end tag
1278 |             if (($pos = strpos($tag, ' '))!==false)
1279 |                 $tag = substr($tag, 0, $pos);
1280 | 
1281 |             $parent_lower = strtolower($this->parent->tag);
1282 |             $tag_lower = strtolower($tag);
1283 | 
1284 |             if ($parent_lower!==$tag_lower)
1285 |             {
1286 |                 if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
1287 |                 {
1288 |                     $this->parent->_[HDOM_INFO_END] = 0;
1289 |                     $org_parent = $this->parent;
1290 | 
1291 |                     while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
1292 |                         $this->parent = $this->parent->parent;
1293 | 
1294 |                     if (strtolower($this->parent->tag)!==$tag_lower) {
1295 |                         $this->parent = $org_parent; // restore origonal parent
1296 |                         if ($this->parent->parent) $this->parent = $this->parent->parent;
1297 |                         $this->parent->_[HDOM_INFO_END] = $this->cursor;
1298 |                         return $this->as_text_node($tag);
1299 |                     }
1300 |                 }
1301 |                 else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
1302 |                 {
1303 |                     $this->parent->_[HDOM_INFO_END] = 0;
1304 |                     $org_parent = $this->parent;
1305 | 
1306 |                     while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
1307 |                         $this->parent = $this->parent->parent;
1308 | 
1309 |                     if (strtolower($this->parent->tag)!==$tag_lower)
1310 |                     {
1311 |                         $this->parent = $org_parent; // restore origonal parent
1312 |                         $this->parent->_[HDOM_INFO_END] = $this->cursor;
1313 |                         return $this->as_text_node($tag);
1314 |                     }
1315 |                 }
1316 |                 else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
1317 |                 {
1318 |                     $this->parent->_[HDOM_INFO_END] = 0;
1319 |                     $this->parent = $this->parent->parent;
1320 |                 }
1321 |                 else
1322 |                     return $this->as_text_node($tag);
1323 |             }
1324 | 
1325 |             $this->parent->_[HDOM_INFO_END] = $this->cursor;
1326 |             if ($this->parent->parent) $this->parent = $this->parent->parent;
1327 | 
1328 |             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1329 |             return true;
1330 |         }
1331 | 
1332 |         $node = new simple_html_dom_node($this);
1333 |         $node->_[HDOM_INFO_BEGIN] = $this->cursor;
1334 |         ++$this->cursor;
1335 |         $tag = $this->copy_until($this->token_slash);
1336 |         $node->tag_start = $begin_tag_pos;
1337 | 
1338 |         // doctype, cdata & comments...
1339 |         if (isset($tag[0]) && $tag[0]==='!') {
1340 |             $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1341 | 
1342 |             if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
1343 |                 $node->nodetype = HDOM_TYPE_COMMENT;
1344 |                 $node->tag = 'comment';
1345 |             } else {
1346 |                 $node->nodetype = HDOM_TYPE_UNKNOWN;
1347 |                 $node->tag = 'unknown';
1348 |             }
1349 |             if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
1350 |             $this->link_nodes($node, true);
1351 |             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1352 |             return true;
1353 |         }
1354 | 
1355 |         // text
1356 |         if ($pos=strpos($tag, '<')!==false) {
1357 |             $tag = '<' . substr($tag, 0, -1);
1358 |             $node->_[HDOM_INFO_TEXT] = $tag;
1359 |             $this->link_nodes($node, false);
1360 |             $this->char = $this->doc[--$this->pos]; // prev
1361 |             return true;
1362 |         }
1363 | 
1364 |         if (!preg_match("/^[\w-:]+$/", $tag)) {
1365 |             $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1366 |             if ($this->char==='<') {
1367 |                 $this->link_nodes($node, false);
1368 |                 return true;
1369 |             }
1370 | 
1371 |             if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
1372 |             $this->link_nodes($node, false);
1373 |             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1374 |             return true;
1375 |         }
1376 | 
1377 |         // begin tag
1378 |         $node->nodetype = HDOM_TYPE_ELEMENT;
1379 |         $tag_lower = strtolower($tag);
1380 |         $node->tag = ($this->lowercase) ? $tag_lower : $tag;
1381 | 
1382 |         // handle optional closing tags
1383 |         if (isset($this->optional_closing_tags[$tag_lower]) )
1384 |         {
1385 |             while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
1386 |             {
1387 |                 $this->parent->_[HDOM_INFO_END] = 0;
1388 |                 $this->parent = $this->parent->parent;
1389 |             }
1390 |             $node->parent = $this->parent;
1391 |         }
1392 | 
1393 |         $guard = 0; // prevent infinity loop
1394 |         $space = array($this->copy_skip($this->token_blank), '', '');
1395 | 
1396 |         // attributes
1397 |         do
1398 |         {
1399 |             if ($this->char!==null && $space[0]==='')
1400 |             {
1401 |                 break;
1402 |             }
1403 |             $name = $this->copy_until($this->token_equal);
1404 |             if ($guard===$this->pos)
1405 |             {
1406 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1407 |                 continue;
1408 |             }
1409 |             $guard = $this->pos;
1410 | 
1411 |             // handle endless '<'
1412 |             if ($this->pos>=$this->size-1 && $this->char!=='>') {
1413 |                 $node->nodetype = HDOM_TYPE_TEXT;
1414 |                 $node->_[HDOM_INFO_END] = 0;
1415 |                 $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
1416 |                 $node->tag = 'text';
1417 |                 $this->link_nodes($node, false);
1418 |                 return true;
1419 |             }
1420 | 
1421 |             // handle mismatch '<'
1422 |             if ($this->doc[$this->pos-1]=='<') {
1423 |                 $node->nodetype = HDOM_TYPE_TEXT;
1424 |                 $node->tag = 'text';
1425 |                 $node->attr = array();
1426 |                 $node->_[HDOM_INFO_END] = 0;
1427 |                 $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
1428 |                 $this->pos -= 2;
1429 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1430 |                 $this->link_nodes($node, false);
1431 |                 return true;
1432 |             }
1433 | 
1434 |             if ($name!=='/' && $name!=='') {
1435 |                 $space[1] = $this->copy_skip($this->token_blank);
1436 |                 $name = $this->restore_noise($name);
1437 |                 if ($this->lowercase) $name = strtolower($name);
1438 |                 if ($this->char==='=') {
1439 |                     $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1440 |                     $this->parse_attr($node, $name, $space);
1441 |                 }
1442 |                 else {
1443 |                     //no value attr: nowrap, checked selected...
1444 |                     $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1445 |                     $node->attr[$name] = true;
1446 |                     if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
1447 |                 }
1448 |                 $node->_[HDOM_INFO_SPACE][] = $space;
1449 |                 $space = array($this->copy_skip($this->token_blank), '', '');
1450 |             }
1451 |             else
1452 |                 break;
1453 |         } while ($this->char!=='>' && $this->char!=='/');
1454 | 
1455 |         $this->link_nodes($node, true);
1456 |         $node->_[HDOM_INFO_ENDSPACE] = $space[0];
1457 | 
1458 |         // check self closing
1459 |         if ($this->copy_until_char_escape('>')==='/')
1460 |         {
1461 |             $node->_[HDOM_INFO_ENDSPACE] .= '/';
1462 |             $node->_[HDOM_INFO_END] = 0;
1463 |         }
1464 |         else
1465 |         {
1466 |             // reset parent
1467 |             if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
1468 |         }
1469 |         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1470 | 
1471 |         // If it's a BR tag, we need to set it's text to the default text.
1472 |         // This way when we see it in plaintext, we can generate formatting that the user wants.
1473 |         // since a br tag never has sub nodes, this works well.
1474 |         if ($node->tag == "br")
1475 |         {
1476 |             $node->_[HDOM_INFO_INNER] = $this->default_br_text;
1477 |         }
1478 | 
1479 |         return true;
1480 |     }
1481 | 
1482 |     // parse attributes
1483 |     protected function parse_attr($node, $name, &$space)
1484 |     {
1485 |         // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
1486 |         // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one.
1487 |         if (isset($node->attr[$name]))
1488 |         {
1489 |             return;
1490 |         }
1491 | 
1492 |         $space[2] = $this->copy_skip($this->token_blank);
1493 |         switch ($this->char) {
1494 |             case '"':
1495 |                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1496 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1497 |                 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
1498 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1499 |                 break;
1500 |             case '\'':
1501 |                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
1502 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1503 |                 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
1504 |                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1505 |                 break;
1506 |             default:
1507 |                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1508 |                 $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
1509 |         }
1510 |         // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
1511 |         $node->attr[$name] = str_replace("\r", "", $node->attr[$name]);
1512 |         $node->attr[$name] = str_replace("\n", "", $node->attr[$name]);
1513 |         // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case.
1514 |         if ($name == "class") {
1515 |             $node->attr[$name] = trim($node->attr[$name]);
1516 |         }
1517 |     }
1518 | 
1519 |     // link node's parent
1520 |     protected function link_nodes(&$node, $is_child)
1521 |     {
1522 |         $node->parent = $this->parent;
1523 |         $this->parent->nodes[] = $node;
1524 |         if ($is_child)
1525 |         {
1526 |             $this->parent->children[] = $node;
1527 |         }
1528 |     }
1529 | 
1530 |     // as a text node
1531 |     protected function as_text_node($tag)
1532 |     {
1533 |         $node = new simple_html_dom_node($this);
1534 |         ++$this->cursor;
1535 |         $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
1536 |         $this->link_nodes($node, false);
1537 |         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1538 |         return true;
1539 |     }
1540 | 
1541 |     protected function skip($chars)
1542 |     {
1543 |         $this->pos += strspn($this->doc, $chars, $this->pos);
1544 |         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1545 |     }
1546 | 
1547 |     protected function copy_skip($chars)
1548 |     {
1549 |         $pos = $this->pos;
1550 |         $len = strspn($this->doc, $chars, $pos);
1551 |         $this->pos += $len;
1552 |         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1553 |         if ($len===0) return '';
1554 |         return substr($this->doc, $pos, $len);
1555 |     }
1556 | 
1557 |     protected function copy_until($chars)
1558 |     {
1559 |         $pos = $this->pos;
1560 |         $len = strcspn($this->doc, $chars, $pos);
1561 |         $this->pos += $len;
1562 |         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1563 |         return substr($this->doc, $pos, $len);
1564 |     }
1565 | 
1566 |     protected function copy_until_char($char)
1567 |     {
1568 |         if ($this->char===null) return '';
1569 | 
1570 |         if (($pos = strpos($this->doc, $char, $this->pos))===false) {
1571 |             $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
1572 |             $this->char = null;
1573 |             $this->pos = $this->size;
1574 |             return $ret;
1575 |         }
1576 | 
1577 |         if ($pos===$this->pos) return '';
1578 |         $pos_old = $this->pos;
1579 |         $this->char = $this->doc[$pos];
1580 |         $this->pos = $pos;
1581 |         return substr($this->doc, $pos_old, $pos-$pos_old);
1582 |     }
1583 | 
1584 |     protected function copy_until_char_escape($char)
1585 |     {
1586 |         if ($this->char===null) return '';
1587 | 
1588 |         $start = $this->pos;
1589 |         while (1)
1590 |         {
1591 |             if (($pos = strpos($this->doc, $char, $start))===false)
1592 |             {
1593 |                 $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
1594 |                 $this->char = null;
1595 |                 $this->pos = $this->size;
1596 |                 return $ret;
1597 |             }
1598 | 
1599 |             if ($pos===$this->pos) return '';
1600 | 
1601 |             if ($this->doc[$pos-1]==='\\') {
1602 |                 $start = $pos+1;
1603 |                 continue;
1604 |             }
1605 | 
1606 |             $pos_old = $this->pos;
1607 |             $this->char = $this->doc[$pos];
1608 |             $this->pos = $pos;
1609 |             return substr($this->doc, $pos_old, $pos-$pos_old);
1610 |         }
1611 |     }
1612 | 
1613 |     // remove noise from html content
1614 |     // save the noise in the $this->noise array.
1615 |     protected function remove_noise($pattern, $remove_tag=false)
1616 |     {
1617 |         global $debugObject;
1618 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
1619 | 
1620 |         $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
1621 | 
1622 |         for ($i=$count-1; $i>-1; --$i)
1623 |         {
1624 |             $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
1625 |             if (is_object($debugObject)) { $debugObject->debugLog(2, 'key is: ' . $key); }
1626 |             $idx = ($remove_tag) ? 0 : 1;
1627 |             $this->noise[$key] = $matches[$i][$idx][0];
1628 |             $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
1629 |         }
1630 | 
1631 |         // reset the length of content
1632 |         $this->size = strlen($this->doc);
1633 |         if ($this->size>0)
1634 |         {
1635 |             $this->char = $this->doc[0];
1636 |         }
1637 |     }
1638 | 
1639 |     // restore noise to html content
1640 |     function restore_noise($text)
1641 |     {
1642 |         global $debugObject;
1643 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
1644 | 
1645 |         while (($pos=strpos($text, '___noise___'))!==false)
1646 |         {
1647 |             // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
1648 |             if (strlen($text) > $pos+15)
1649 |             {
1650 |                 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
1651 |                 if (is_object($debugObject)) { $debugObject->debugLog(2, 'located key of: ' . $key); }
1652 | 
1653 |                 if (isset($this->noise[$key]))
1654 |                 {
1655 |                     $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
1656 |                 }
1657 |                 else
1658 |                 {
1659 |                     // do this to prevent an infinite loop.
1660 |                     $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16);
1661 |                 }
1662 |             }
1663 |             else
1664 |             {
1665 |                 // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
1666 |                 $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
1667 |             }
1668 |         }
1669 |         return $text;
1670 |     }
1671 | 
1672 |     // Sometimes we NEED one of the noise elements.
1673 |     function search_noise($text)
1674 |     {
1675 |         global $debugObject;
1676 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
1677 | 
1678 |         foreach($this->noise as $noiseElement)
1679 |         {
1680 |             if (strpos($noiseElement, $text)!==false)
1681 |             {
1682 |                 return $noiseElement;
1683 |             }
1684 |         }
1685 |     }
1686 |     function __toString()
1687 |     {
1688 |         return $this->root->innertext();
1689 |     }
1690 | 
1691 |     function __get($name)
1692 |     {
1693 |         switch ($name)
1694 |         {
1695 |             case 'outertext':
1696 |                 return $this->root->innertext();
1697 |             case 'innertext':
1698 |                 return $this->root->innertext();
1699 |             case 'plaintext':
1700 |                 return $this->root->text();
1701 |             case 'charset':
1702 |                 return $this->_charset;
1703 |             case 'target_charset':
1704 |                 return $this->_target_charset;
1705 |         }
1706 |     }
1707 | 
1708 |     // camel naming conventions
1709 |     function childNodes($idx=-1) {return $this->root->childNodes($idx);}
1710 |     function firstChild() {return $this->root->first_child();}
1711 |     function lastChild() {return $this->root->last_child();}
1712 |     function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}
1713 |     function createTextNode($value) {return @end(str_get_html($value)->nodes);}
1714 |     function getElementById($id) {return $this->find("#$id", 0);}
1715 |     function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
1716 |     function getElementByTagName($name) {return $this->find($name, 0);}
1717 |     function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
1718 |     function loadFile() {$args = func_get_args();$this->load_file($args);}
1719 | }
1720 | 
1721 | ?>


--------------------------------------------------------------------------------
/sitemap-generator.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /*************************************************************
  3 |  iProDev PHP XML Sitemap Generator
  4 |  Simple site crawler to create a search engine XML Sitemap.
  5 |  Version 1.0
  6 |  Free to use, without any warranty.
  7 |  Written by iProDev(Hemn Chawroka) http://iprodev.com 28/Mar/2016.
  8 | 
  9 | *************************************************************/
	require_once "simple_html_dom.php";

	// Name of the sitemap file that is written.
	$file = "sitemap.xml";

	// URL where crawling starts. This one uses http; use https:// for
	// SSL websites.
	$start_url = "http://iprodev.com/";

	// How the script is run.
	//   true:  as CLI script.
	//   false: as website script.
	define ('CLI', true);

	// URLs to skip. Every URL that starts with one of these is skipped too.
	// Example: "http://iprodev.com/print" also skips
	// http://iprodev.com/print/bootmanager.html
	$skip = array ("http://iprodev.com/print/");

	// File types that are scanned.
	$extension = array (".html", ".php", "/");

	// Scan frequency written to <changefreq>.
	$freq = "daily";

	// Page priority.
	$priority = "1.0";

	// Init end ==========================

	define ('VERSION', "1.0");
	// Line break used for progress output: newline on CLI, <br> in a browser.
	define ('NL', CLI ? "\n" : "<br>");
 49 | 
	/**
	 * Turn a link found on a page into an absolute URL.
	 *
	 * - "//host/path" links inherit the scheme of $base.
	 * - Links that already carry a scheme are returned unchanged.
	 * - "#fragment" and "?query" links replace the fragment (and query) of $base.
	 * - Everything else is resolved against the directory of $base, with
	 *   "//", "/./" and "/dir/../" collapsed in the path part only, so that
	 *   a query string or fragment of $rel is passed through untouched.
	 *
	 * @param string $rel  link as found in the document
	 * @param string $base absolute URL of the document containing the link
	 * @return string absolute URL
	 */
	function rel2abs($rel, $base) {
		/* split the base URL explicitly; a missing component becomes '' */
		$parts = parse_url($base);
		if (!is_array($parts)) {
			$parts = array();
		}
		$scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
		$host   = isset($parts['host'])   ? $parts['host']   : '';
		$path   = isset($parts['path'])   ? $parts['path']   : '';
		if (isset($parts['port'])) {
			$host .= ':' . $parts['port'];
		}

		/* protocol-relative URL: keep the scheme of the base */
		if (strpos($rel, "//") === 0) {
			return $scheme . ":" . $rel;
		}
		/* return if already absolute URL */
		if (parse_url($rel, PHP_URL_SCHEME) != '') return $rel;

		$first_char = substr($rel, 0, 1);
		/* anchors replace the fragment of the base */
		if ($first_char == '#') {
			return substr($base, 0, strcspn($base, '#')) . $rel;
		}
		/* queries replace the query and the fragment of the base */
		if ($first_char == '?') {
			return substr($base, 0, strcspn($base, '?#')) . $rel;
		}

		/* keep query/fragment of the link out of the path normalisation */
		$cut      = strcspn($rel, '?#');
		$rel_path = (string) substr($rel, 0, $cut);
		$suffix   = (string) substr($rel, $cut);

		/* remove non-directory element from path */
		$path = preg_replace('#/[^/]*$#', '', $path);
		/* destroy path if relative url points to root */
		if ($first_char == '/') $path = '';

		/* dirty absolute path */
		$abs = "$path/$rel_path";
		/* replace '//' or '/./' or '/foo/../' with '/'; the dots are escaped
		   so that they match literal dots only */
		$re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
		do {
			$abs = preg_replace($re, '/', $abs, -1, $n);
		} while ($n > 0);

		/* absolute URL is ready! */
		return $scheme . '://' . $host . $abs . $suffix;
	}
 74 | 
	/**
	 * Download $url with cURL and return the response body.
	 *
	 * Redirects are followed, the connect timeout is 5 seconds and the body is
	 * returned instead of printed.
	 * NOTE(review): peer and host certificate verification are switched off.
	 *
	 * @param string $url URL to fetch
	 * @return mixed the result of curl_exec()
	 */
	function GetUrl ($url) {
		$agent = "Mozilla/5.0 (compatible; iProDev PHP XML Sitemap Generator/" . VERSION . ", http://iprodev.com)";

		$options = array (
			CURLOPT_AUTOREFERER    => true,
			CURLOPT_URL            => $url,
			CURLOPT_USERAGENT      => $agent,
			CURLOPT_VERBOSE        => 1,
			CURLOPT_SSL_VERIFYPEER => FALSE,
			CURLOPT_SSL_VERIFYHOST => FALSE,
			CURLOPT_HEADER         => 0,
			CURLOPT_FOLLOWLOCATION => 1,
			CURLOPT_RETURNTRANSFER => 1,
			CURLOPT_CONNECTTIMEOUT => 5,
		);

		$handle = curl_init ();
		curl_setopt_array ($handle, $options);
		$body = curl_exec ($handle);
		curl_close ($handle);

		return $body;
	}
 96 | 
	/**
	 * Crawl $url: fetch the page, and for every link below $start_url that is
	 * valid, not yet scanned, not covered by $skip and contains one of the
	 * configured $extension strings, write a <url> entry to the sitemap file
	 * and scan that link recursively.
	 *
	 * Works on the globals $start_url, $scanned (list of URLs already visited),
	 * $pf (open sitemap file handle), $extension, $skip, $freq and $priority.
	 *
	 * @param string $url URL of the page to crawl
	 */
	function Scan ($url) {
		global $start_url, $scanned, $pf, $extension, $skip, $freq, $priority;

		echo $url . NL;

		$url = filter_var ($url, FILTER_SANITIZE_URL);

		if (!filter_var ($url, FILTER_VALIDATE_URL) || in_array ($url, $scanned)) {
			return;
		}

		array_push ($scanned, $url);

		// A failed download or an unparsable document must not abort the
		// whole crawl with a call to find() on a non-object.
		$content = GetUrl ($url);
		if (!is_string ($content)) {
			return;
		}
		$html = str_get_html ($content);
		if (!is_object ($html)) {
			return;
		}

		foreach ($html->find('a') as $val) {
			$href = $val->href;

			// Anchors without a usable href (e.g. <a name="...">) are not links.
			if (!is_string ($href)) {
				continue;
			}

			// Drop the fragment: it does not identify a separate page.
			$fragment_split = explode ("#", $href);
			$next_url       = $fragment_split[0];

			if ((substr ($next_url, 0, 7) != "http://")  &&
				(substr ($next_url, 0, 8) != "https://") &&
				(substr ($next_url, 0, 6) != "ftp://")   &&
				(substr ($next_url, 0, 7) != "mailto:"))
			{
				$next_url = @rel2abs ($next_url, $url);
			}

			$next_url = filter_var ($next_url, FILTER_SANITIZE_URL);

			// Only URLs below the start URL belong into the sitemap.
			if (substr ($next_url, 0, strlen ($start_url)) != $start_url) {
				continue;
			}

			$ignore = false;

			if (!filter_var ($next_url, FILTER_VALIDATE_URL)) {
				$ignore = true;
			}

			if (in_array ($next_url, $scanned)) {
				$ignore = true;
			}

			if (isset ($skip) && !$ignore) {
				foreach ($skip as $v) {
					if (substr ($next_url, 0, strlen ($v)) == $v)
					{
						$ignore = true;
					}
				}
			}

			if ($ignore) {
				continue;
			}

			foreach ($extension as $ext) {
				if (strpos ($next_url, $ext) > 0) {
					// Priority depends on the number of "/"-separated parts of the URL.
					$depth = count (explode ("/", trim (str_ireplace (array ("http://", "https://"), "", $next_url), "/")));
					$pr    = number_format (round ($priority / $depth + 0.5, 3), 1);

					fwrite ($pf, "  <url>\n" .
								 "    <loc>" . htmlentities ($next_url) ."</loc>\n" .
								 "    <changefreq>$freq</changefreq>\n" .
								 "    <priority>$pr</priority>\n" .
								 "  </url>\n");
					Scan ($next_url);

					// One matching extension is enough; without the break a URL
					// matching several extensions is written more than once.
					break;
				}
			}
		}
	}
164 | 
165 | 	
166 | 
167 | 	$pf = fopen ($file, "w");
168 | 	if (!$pf) {
169 | 		echo "Cannot create $file!" . NL;
170 | 		return;
171 | 	}
172 | 
173 | 	$start_url = filter_var ($start_url, FILTER_SANITIZE_URL);
174 | 
175 | 	fwrite ($pf, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
176 | 				 "<?xml-stylesheet type=\"text/xsl\" href=\"http://iprodev.github.io/PHP-XML-Sitemap-Generator/xml-sitemap.xsl\"?>\n" .
177 | 				 "<!-- Created with iProDev PHP XML Sitemap Generator " . VERSION . " http://iprodev.com -->\n" .
178 | 				 "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" .
179 | 				 "        xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" .
180 | 				 "        xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9\n" .
181 | 				 "        http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n" .
182 | 				 "  <url>\n" .
183 | 				 "    <loc>" . htmlentities ($start_url) ."</loc>\n" .
184 | 				 "    <changefreq>$freq</changefreq>\n" .
185 | 				 "    <priority>$priority</priority>\n" .
186 | 				 "  </url>\n");
187 | 
188 | 	$scanned = array ();
189 | 	Scan ($start_url);
190 | 
191 | 	fwrite ($pf, "</urlset>\n");
192 | 	fclose ($pf);
193 | 
194 | 	echo "Done." . NL;
195 | 	echo "$file created." . NL;
196 | ?>


--------------------------------------------------------------------------------