├── .gitignore
├── LICENSE
├── README.md
├── example.php
└── src
    ├── WaifuGenerator.php
    └── data
        └── dom.php


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Moe Poi ~
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # WaifuGenerator
 2 | A script for generating waifus (≧▽≦)
 3 | 
 4 | ----
 5 | 
 6 | ## Usage
 7 | 
 8 | ```php
 9 | <?php
10 | 
11 | include('src/WaifuGenerator.php');
12 | 
13 | $anu = new WaifuGenerator();
14 | $name = $anu->type("name");
15 | $image = $anu->type("image");
16 | echo "=== RESULT ===\n\nName : {$name}\nImage : {$image}";
17 | 
18 | ?>
19 | ```
20 | 
21 | ## Credit
22 | 
23 | Moe Poi ~ / [@moepoi](https://github.com/moepoi)
24 | 


--------------------------------------------------------------------------------
/example.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | include('src/WaifuGenerator.php');
 4 | 
 5 | $anu = new WaifuGenerator();
 6 | $name = $anu->type("name");
 7 | $image = $anu->type("image");
 8 | echo "=== RESULT ===\n\nName : {$name}\nImage : {$image}";
 9 | 
10 | ?>


--------------------------------------------------------------------------------
/src/WaifuGenerator.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | include('data/dom.php');
 4 | 
 5 | /**
 6 |  * @author Moe Poi <moepoi@protonmail.com>
 7 |  * @license MIT
 8 |  */
 9 | 
/**
 * Scrapes the "Waifu Wednesday" tag listing of jurnalotaku.com and returns
 * the name or the image URL of one randomly chosen entry.
 *
 * All properties are public and are overwritten on every type() call; they
 * expose the state of the most recent scrape.
 */
class WaifuGenerator {
    /** @var array Image URLs (img "src") collected by the last type() call */
    public $image;
    /** @var array Image captions (img "alt") collected by the last type() call */
    public $name;
    /** @var string Listing page number used by the last type() call ("1".."11") */
    public $page;
    /** @var string Full URL requested by the last type() call */
    public $url;
    /** @var object|false Parsed document returned by file_get_html() */
    public $req;

    /**
     * Fetch a random listing page and return one random entry from it.
     *
     * Every call performs a new HTTP request and a new random pick, so two
     * consecutive calls are not guaranteed to describe the same entry.
     *
     * @param string $type "name" for the caption (with the
     * "[Waifu Wednesday] " prefix removed) or "image" for the image URL.
     * @return string The requested value, or the literal hint "name/image"
     * when $type is neither of the two supported values.
     * @throws RuntimeException When the page cannot be loaded or contains no
     * matching images.
     */
    public function type($type) {
        $this->image = array();
        $this->name = array();
        $this->page = strval(rand(1,11));
        $this->url = sprintf('http://jurnalotaku.com/tag/waifu-wednesday/page/%s/', $this->page);
        $this->req = file_get_html($this->url);

        // file_get_html() returns false on download/size failure; calling
        // find() on that would be a fatal error, so fail with a clear message.
        if (!is_object($this->req)) {
            throw new RuntimeException(sprintf('Unable to load or parse %s', $this->url));
        }

        foreach ($this->req->find('div[class=article-wrapper article-tb m-tb]') as $x) {
            foreach ($x->find('div') as $y) {
                foreach ($y->find('div') as $z) {
                    foreach ($z->find('img') as $s) {
                        // Pushed together so that the same index in both
                        // arrays always refers to the same image element.
                        $this->image[] = $s->src;
                        $this->name[] = $s->alt;
                    }
                }
            }
        }

        // Unsupported selector: keep returning the usage hint.
        if ($type != "name" && $type != "image") {
            return "name/image";
        }

        $total = count($this->image);
        if ($total === 0) {
            throw new RuntimeException(sprintf('No waifu entries found at %s', $this->url));
        }

        // rand() is inclusive on both ends, so the upper bound must be the
        // last valid index, not the element count.
        $num = rand(0, $total - 1);

        if ($type == "name") {
            return str_replace("[Waifu Wednesday] ", "", $this->name[$num]);
        }
        return $this->image[$num];
    }
}
37 | 
38 | ?>


--------------------------------------------------------------------------------
/src/data/dom.php:
--------------------------------------------------------------------------------
   1 | <?php
   2 | /**
   3 |  * Website: http://sourceforge.net/projects/simplehtmldom/
   4 |  * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
   5 |  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
   6 |  * Contributions by:
   7 |  *	 Yousuke Kumakura (Attribute filters)
   8 |  *	 Vadim Voituk (Negative indexes supports of "find" method)
   9 |  *	 Antcs (Constructor with automatically load contents either text or file/url)
  10 |  *
  11 |  * all affected sections have comments starting with "PaperG"
  12 |  *
  13 |  * Paperg - Added case insensitive testing of the value of the selector.
  14 |  * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
  15 |  *  This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
  16 |  *  it will almost always be smaller by some amount.
  17 |  *  We use this to determine how far into the file the tag in question is.  This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from,
  18 |  *  but for most purposes, it's a really good estimation.
  19 |  * Paperg - Added the forceTagsClosed to the dom constructor.  Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
  20 |  * Allow the user to tell us how much they trust the html.
  21 |  * Paperg add the text and plaintext to the selectors for the find syntax.  plaintext implies text in the innertext of a node.  text implies that the tag is a text node.
  22 |  * This allows for us to find tags based on the text they contain.
  23 |  * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
  24 |  * Paperg: added parse_charset so that we know about the character set of the source document.
  25 |  *  NOTE:  If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
  26 |  *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
  27 |  *
  28 |  * Found infinite loop in the case of broken html in restore_noise.  Rewrote to protect from that.
  29 |  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
  30 |  *
  31 |  * Licensed under The MIT License
  32 |  * Redistributions of files must retain the above copyright notice.
  33 |  *
  34 |  * @author S.C. Chen <me578022@gmail.com>
  35 |  * @author John Schlick
  36 |  * @author Rus Carroll
  37 |  * @version Rev. 1.7 (214)
  38 |  * @package PlaceLocalInclude
  39 |  * @subpackage simple_html_dom
  40 |  */
  41 | 
  42 | /**
  43 |  * All of the Defines for the classes below.
  44 |  * @author S.C. Chen <me578022@gmail.com>
  45 |  */
  46 | define('HDOM_TYPE_ELEMENT', 1);
  47 | define('HDOM_TYPE_COMMENT', 2);
  48 | define('HDOM_TYPE_TEXT',	3);
  49 | define('HDOM_TYPE_ENDTAG',  4);
  50 | define('HDOM_TYPE_ROOT',	5);
  51 | define('HDOM_TYPE_UNKNOWN', 6);
  52 | define('HDOM_QUOTE_DOUBLE', 0);
  53 | define('HDOM_QUOTE_SINGLE', 1);
  54 | define('HDOM_QUOTE_NO',	 3);
  55 | define('HDOM_INFO_BEGIN',   0);
  56 | define('HDOM_INFO_END',	 1);
  57 | define('HDOM_INFO_QUOTE',   2);
  58 | define('HDOM_INFO_SPACE',   3);
  59 | define('HDOM_INFO_TEXT',	4);
  60 | define('HDOM_INFO_INNER',   5);
  61 | define('HDOM_INFO_OUTER',   6);
  62 | define('HDOM_INFO_ENDSPACE',7);
  63 | define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  64 | define('DEFAULT_BR_TEXT', "\r\n");
  65 | define('DEFAULT_SPAN_TEXT', " ");
  66 | define('MAX_FILE_SIZE', 600000);
  67 | 
  68 | /** Contents between curly braces "{" and "}" are interpreted as text */
  69 | define('HDOM_SMARTY_AS_TEXT', 1);
  70 | 
  71 | // helper functions
  72 | // -----------------------------------------------------------------------------
  73 | // get html dom from file
  74 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
  75 | function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  76 | {
  77 | 	// Ensure maximum length is greater than zero
  78 | 	if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
  79 | 
  80 | 	// We DO force the tags to be terminated.
  81 | 	$dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  82 | 	// For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
  83 | 	$contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
  84 | 	// Paperg - use our own mechanism for getting the contents as we want to control the timeout.
  85 | 	//$contents = retrieve_url_contents($url);
  86 | 	if (empty($contents) || strlen($contents) > $maxLen)
  87 | 	{
  88 | 		return false;
  89 | 	}
  90 | 	// The second parameter can force the selectors to all be lowercase.
  91 | 	$dom->load($contents, $lowercase, $stripRN);
  92 | 	return $dom;
  93 | }
  94 | 
  95 | // get html dom from string
  96 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  97 | {
  98 | 	$dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  99 | 	if (empty($str) || strlen($str) > MAX_FILE_SIZE)
 100 | 	{
 101 | 		$dom->clear();
 102 | 		return false;
 103 | 	}
 104 | 	$dom->load($str, $lowercase, $stripRN);
 105 | 	return $dom;
 106 | }
 107 | 
 108 | // dump html dom tree
 109 | function dump_html_tree($node, $show_attr=true, $deep=0)
 110 | {
 111 | 	$node->dump($node);
 112 | }
 113 | 
 114 | 
 115 | /**
 116 |  * simple html dom node
 117 |  * PaperG - added ability for "find" routine to lowercase the value of the selector.
 118 |  * PaperG - added $tag_start to track the start position of the tag in the total byte index
 119 |  *
 120 |  * @package PlaceLocalInclude
 121 |  */
 122 | class simple_html_dom_node
 123 | {
 124 | 	/**
 125 | 	 * Node type
 126 | 	 *
 127 | 	 * Default is {@see HDOM_TYPE_TEXT}
 128 | 	 *
 129 | 	 * @var int
 130 | 	 */
 131 | 	public $nodetype = HDOM_TYPE_TEXT;
 132 | 
 133 | 	/**
 134 | 	 * Tag name
 135 | 	 *
 136 | 	 * Default is 'text'
 137 | 	 *
 138 | 	 * @var string
 139 | 	 */
 140 | 	public $tag = 'text';
 141 | 
 142 | 	/**
 143 | 	 * List of attributes
 144 | 	 *
 145 | 	 * @var array
 146 | 	 */
 147 | 	public $attr = array();
 148 | 
 149 | 	/**
 150 | 	 * List of child node objects
 151 | 	 *
 152 | 	 * @var array
 153 | 	 */
 154 | 	public $children = array();
 155 | 	public $nodes = array();
 156 | 
 157 | 	/**
 158 | 	 * The parent node object
 159 | 	 *
 160 | 	 * @var object|null
 161 | 	 */
 162 | 	public $parent = null;
 163 | 
 164 | 	// The "info" array - see HDOM_INFO_... for what each element contains.
 165 | 	public $_ = array();
 166 | 
 167 | 	/**
 168 | 	 * Start position of the tag in the document
 169 | 	 *
 170 | 	 * @var int
 171 | 	 */
 172 | 	public $tag_start = 0;
 173 | 
 174 | 	/**
 175 | 	 * The DOM object
 176 | 	 *
 177 | 	 * @var object|null
 178 | 	 */
 179 | 	private $dom = null;
 180 | 
 181 | 	/**
 182 | 	 * Construct new node object
 183 | 	 *
 184 | 	 * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
 185 | 	 */
	function __construct($dom)
	{
		// Register with the owner's node list, then remember the owner.
		$dom->nodes[] = $this;
		$this->dom = $dom;
	}
 191 | 
	/**
	 * Drop this node's object references when it is destroyed.
	 */
	function __destruct()
	{
		// Identical to an explicit clear() call.
		$this->clear();
	}
 196 | 
	/**
	 * String conversion yields the node's outer text.
	 *
	 * @return string
	 */
	function __toString()
	{
		$html = $this->outertext();
		return $html;
	}
 201 | 
 202 | 	// clean up memory due to php5 circular references memory leak...
	/**
	 * Null out every reference this node holds to other objects (owning DOM,
	 * parent, child lists).
	 */
	function clear()
	{
		$this->dom = $this->nodes = $this->parent = $this->children = null;
	}
 210 | 
 211 | 	// dump node's tree
	/**
	 * Echo this node's tag on one line, then recurse into $this->nodes with
	 * the depth increased by one.
	 *
	 * @param bool $show_attr Whether to append the attribute list to the tag
	 * @param int $deep Number of indentation units printed before the tag
	 * @return void
	 */
	function dump($show_attr=true, $deep=0)
	{
		$line = str_repeat('	', $deep) . $this->tag;

		if ($show_attr && count($this->attr)>0)
		{
			$line .= '(';
			foreach (array_keys($this->attr) as $name)
			{
				// The value is read through property access on the node.
				$line .= "[$name]=>\"".$this->$name.'", ';
			}
			$line .= ')';
		}
		echo $line . "\n";

		if (!$this->nodes)
		{
			return;
		}
		foreach ($this->nodes as $child)
		{
			$child->dump($show_attr, $deep+1);
		}
	}
 234 | 
 235 | 
 236 | 	// Debugging function to dump a single dom node with a bunch of information about it.
	/**
	 * Build a one-line description of this node: tag, attributes, the raw
	 * info array ($this->_), text, inner info, child/node counts and
	 * tag_start.
	 *
	 * @param bool $echo True to echo the description, false to return it
	 * @return string|null The description when $echo is false, otherwise null
	 */
	function dump_node($echo=true)
	{
		$string = $this->tag;
		if (count($this->attr)>0)
		{
			$string .= '(';
			foreach ($this->attr as $k=>$v)
			{
				$string .= "[$k]=>\"".$this->$k.'", ';
			}
			$string .= ')';
		}
		if (count($this->_)>0)
		{
			$string .= ' $_ (';
			foreach ($this->_ as $k=>$v)
			{
				if (is_array($v))
				{
					$string .= "[$k]=>(";
					foreach ($v as $k2=>$v2)
					{
						$string .= "[$k2]=>\"".$v2.'", ';
					}
					$string .= ")";
				} else {
					$string .= "[$k]=>\"".$v.'", ';
				}
			}
			$string .= ")";
		}

		if (isset($this->text))
		{
			$string .= " text: (" . $this->text . ")";
		}

		// Read the inner info from this node; the opening quote is only
		// emitted together with its closing quote.
		$string .= " HDOM_INNER_INFO: ";
		if (isset($this->_[HDOM_INFO_INNER]))
		{
			$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
		}
		else
		{
			$string .= ' NULL ';
		}

		// clear() sets both lists to null; count() must not see that.
		$string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
		$string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
		$string .= " tag_start: " . $this->tag_start;
		$string .= "\n";

		if ($echo)
		{
			echo $string;
			return;
		}
		else
		{
			return $string;
		}
	}
 300 | 
 301 | 	/**
 302 | 	 * Return or set parent node
 303 | 	 *
 304 | 	 * @param object|null $parent (optional) The parent node, `null` to return
 305 | 	 * the current parent node.
 306 | 	 * @return object|null The parent node
 307 | 	 */
	function parent($parent=null)
	{
		if ($parent !== null)
		{
			// Detach from the current parent first, so that it does not keep
			// a stale entry for this node in its nodes/children lists.
			$previous = $this->parent;
			if ($previous instanceof self)
			{
				foreach (array('nodes', 'children') as $list)
				{
					if (!is_array($previous->$list))
					{
						continue;
					}
					$kept = array();
					foreach ($previous->$list as $sibling)
					{
						if ($sibling !== $this)
						{
							$kept[] = $sibling;
						}
					}
					// Rebuilt as a 0-based list: the sibling/child accessors
					// index these arrays by position.
					$previous->$list = $kept;
				}
			}

			$this->parent = $parent;
			$this->parent->nodes[] = $this;
			$this->parent->children[] = $this;
		}

		return $this->parent;
	}
 321 | 
 322 | 	/**
 323 | 	 * @return bool True if the node has at least one child node
 324 | 	 */
	function has_child()
	{
		// An empty array and null (after clear()) both count as "no children".
		return (bool) $this->children;
	}
 329 | 
 330 | 	/**
 331 | 	 * Get child node at specified index
 332 | 	 *
 333 | 	 * @param int $idx The index of the child node to return, `-1` to return all
 334 | 	 * child nodes.
 335 | 	 * @return object|array|null The child node at the specified index, all child
 336 | 	 * nodes or null if the index is invalid.
 337 | 	 */
	function children($idx=-1)
	{
		// Only the integer -1 (strict comparison) selects the whole list.
		if ($idx!==-1)
		{
			return isset($this->children[$idx]) ? $this->children[$idx] : null;
		}
		return $this->children;
	}
 350 | 
 351 | 	/**
 352 | 	 * Get first child node
 353 | 	 *
 354 | 	 * @return object|null The first child node or null if the current node has
 355 | 	 * no child nodes.
 356 | 	 *
 357 | 	 * @todo Use `empty()` instead of `count()` to improve performance on large
 358 | 	 * arrays.
 359 | 	 */
	function first_child()
	{
		// isset() covers both an empty list and a null list (clear() sets
		// $this->children to null, which count() rejects on PHP 8).
		if (isset($this->children[0]))
		{
			return $this->children[0];
		}
		return null;
	}
 368 | 
 369 | 	/**
 370 | 	 * Get last child node
 371 | 	 *
 372 | 	 * @return object|null The last child node or null if the current node has
 373 | 	 * no child nodes.
 374 | 	 *
 375 | 	 * @todo Use `end()` to slightly improve performance on large arrays.
 376 | 	 */
	function last_child()
	{
		// Guard first: clear() sets $this->children to null, and count() on
		// null raises a TypeError on PHP 8.
		if (empty($this->children))
		{
			return null;
		}
		return $this->children[count($this->children)-1];
	}
 385 | 
 386 | 	/**
 387 | 	 * Get next sibling node
 388 | 	 *
 389 | 	 * @return object|null The sibling node or null if the current node has no
 390 | 	 * sibling nodes.
 391 | 	 */
	function next_sibling()
	{
		if ($this->parent===null)
		{
			return null;
		}

		// Locate this node's position among the parent's children.
		$total = count($this->parent->children);
		for ($pos=0; $pos<$total; ++$pos)
		{
			if ($this===$this->parent->children[$pos])
			{
				break;
			}
		}

		// $pos equals $total when the node was not found; either way the
		// following slot must exist to have a next sibling.
		$next = $pos + 1;
		return ($next>=$total) ? null : $this->parent->children[$next];
	}
 411 | 
 412 | 	/**
 413 | 	 * Get previous sibling node
 414 | 	 *
 415 | 	 * @return object|null The sibling node or null if the current node has no
 416 | 	 * sibling nodes.
 417 | 	 */
	function prev_sibling()
	{
		if ($this->parent===null) return null;

		$count = count($this->parent->children);
		for ($idx=0; $idx<$count; ++$idx)
		{
			if ($this===$this->parent->children[$idx])
			{
				// The first child has no previous sibling.
				return ($idx>0) ? $this->parent->children[$idx-1] : null;
			}
		}

		// This node is not in the parent's children list: report "no
		// sibling", as next_sibling() does, instead of returning the
		// parent's last child.
		return null;
	}
 428 | 
 429 | 	/**
 430 | 	 * Traverse ancestors to the first matching tag.
 431 | 	 *
 432 | 	 * @param string $tag Tag to find
 433 | 	 * @return object|null First matching node in the DOM tree or null if no
 434 | 	 * match was found.
 435 | 	 *
 436 | 	 * @todo Null is returned implicitly by calling ->parent on the root node.
 437 | 	 * This behaviour could change at any time, rendering this function invalid.
 438 | 	 */
	function find_ancestor_tag($tag)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		// Walk upwards, beginning with this node itself, until a tag matches
		// or the parent chain ends in null.
		for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
		{
			if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $candidate->tag); }

			if ($candidate->tag == $tag)
			{
				return $candidate;
			}
		}
		return null;
	}
 459 | 
 460 | 	/**
 461 | 	 * Get node's inner text (everything inside the opening and closing tags)
 462 | 	 *
 463 | 	 * @return string
 464 | 	 */
	function innertext()
	{
		// An explicitly stored inner text wins over everything else.
		if (isset($this->_[HDOM_INFO_INNER]))
		{
			return $this->_[HDOM_INFO_INNER];
		}
		// Nodes carrying raw text return it after noise restoration.
		if (isset($this->_[HDOM_INFO_TEXT]))
		{
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		// Otherwise the inner text is the concatenated outer text of all nodes.
		$pieces = array();
		foreach ($this->nodes as $child)
		{
			$pieces[] = $child->outertext();
		}
		return implode('', $pieces);
	}
 475 | 
 476 | 	/**
 477 | 	 * Get node's outer text (everything including the opening and closing tags)
 478 | 	 *
 479 | 	 * @return string
 480 | 	 */
	function outertext()
	{
		global $debug_object;
		if (is_object($debug_object))
		{
			$suffix = '';
			if ($this->tag == 'text' && !empty($this->text))
			{
				$suffix = " with text: " . $this->text;
			}
			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
		}

		// The root has no tag of its own to render.
		if ($this->tag==='root')
		{
			return $this->innertext();
		}

		// Give the DOM's callback (if any) a chance to act on this node
		// before it is rendered.
		if ($this->dom && $this->dom->callback!==null)
		{
			call_user_func_array($this->dom->callback, array($this));
		}

		if (isset($this->_[HDOM_INFO_OUTER]))
		{
			return $this->_[HDOM_INFO_OUTER];
		}
		if (isset($this->_[HDOM_INFO_TEXT]))
		{
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		// Opening tag, rendered by the node registered at this node's begin index.
		$html = "";
		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
		{
			$html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
		}

		// Inner part: the stored inner text if present, otherwise the
		// rendered child nodes.
		if (!isset($this->_[HDOM_INFO_INNER]))
		{
			if ($this->nodes)
			{
				foreach ($this->nodes as $child)
				{
					$html .= $this->convert_text($child->outertext());
				}
			}
		}
		elseif ($this->tag != "br")
		{
			// For br tags the stored inner info is deliberately not emitted.
			$html .= $this->_[HDOM_INFO_INNER];
		}

		// Closing tag, only when an end position other than 0 was recorded.
		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
		{
			$html .= '</'.$this->tag.'>';
		}
		return $html;
	}
 539 | 
 540 | 	/**
 541 | 	 * Get node's plain text (everything excluding all tags)
 542 | 	 *
 543 | 	 * @return string
 544 | 	 */
	function text()
	{
		if (isset($this->_[HDOM_INFO_INNER]))
		{
			return $this->_[HDOM_INFO_INNER];
		}

		$kind = $this->nodetype;
		if ($kind == HDOM_TYPE_TEXT)
		{
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}
		// Comments and unknown nodes contribute no plain text.
		if ($kind == HDOM_TYPE_COMMENT || $kind == HDOM_TYPE_UNKNOWN)
		{
			return '';
		}
		// Neither do script and style elements.
		if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
		{
			return '';
		}

		// $this->nodes can be null here (observed for some span and p tags
		// according to the original author); there is then nothing to collect.
		if (is_null($this->nodes))
		{
			return '';
		}

		$plain = '';
		foreach ($this->nodes as $child)
		{
			// Paragraphs start after a blank line.
			if ($child->tag == 'p')
			{
				$plain .= "\n\n";
			}

			$plain .= $this->convert_text($child->text());

			// Keep consecutive spans from running into each other.
			if ($child->tag == "span")
			{
				$plain .= $this->dom->default_span_text;
			}
		}
		return trim($plain);
	}
 582 | 
 583 | 	/**
 584 | 	 * Get node's xml text (inner text as a CDATA section)
 585 | 	 *
 586 | 	 * @return string
 587 | 	 */
	function xmltext()
	{
		// Strip the CDATA opener (any letter case) and then the closer.
		$withoutOpener = str_ireplace('<![CDATA[', '', $this->innertext());
		return str_replace(']]>', '', $withoutOpener);
	}
 595 | 
 596 | 	// build node's text with tag
	/**
	 * Render this node's opening tag including its attributes.
	 *
	 * Nodes that carry raw text (text, comment, unknown) return that text
	 * after noise restoration instead.
	 *
	 * @return string
	 */
	function makeup()
	{
		if (isset($this->_[HDOM_INFO_TEXT]))
		{
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$html = '<'.$this->tag;
		$position = 0;

		foreach ($this->attr as $name=>$value)
		{
			// The spacing/quote info arrays are indexed by attribute position,
			// which also advances for removed attributes.
			$slot = $position++;

			// null/false marks a removed attribute
			if ($value===null || $value===false)
			{
				continue;
			}

			$spacing = $this->_[HDOM_INFO_SPACE][$slot];
			$html .= $spacing[0];

			// Value-less attribute (nowrap, checked, selected, ...)
			if ($value===true)
			{
				$html .= $name;
				continue;
			}

			$style = $this->_[HDOM_INFO_QUOTE][$slot];
			if ($style == HDOM_QUOTE_DOUBLE)
			{
				$quote = '"';
			}
			elseif ($style == HDOM_QUOTE_SINGLE)
			{
				$quote = '\'';
			}
			else
			{
				$quote = '';
			}
			$html .= $name.$spacing[1].'='.$spacing[2].$quote.$value.$quote;
		}

		return $this->dom->restore_noise($html) . $this->_[HDOM_INFO_ENDSPACE] . '>';
	}
 630 | 
 631 | 	// find elements by css selector
 632 | 	//PaperG - added ability for find to lowercase the value of the selector.
	/**
	 * Find nodes below this node that match a CSS-like selector.
	 *
	 * @param string $selector Selector string; comma-separated groups are
	 * evaluated independently and their results merged.
	 * @param int|null $idx Null to return every match; otherwise the index of
	 * the single match to return (negative values count from the end).
	 * @param bool $lowercase Forwarded to seek() to compare selector values
	 * case-insensitively.
	 * @return array|object|null All matches in node-index order, or the match
	 * at $idx, or null when $idx does not exist.
	 */
	function find($selector, $idx=null, $lowercase=false)
	{
		$groups = $this->parse_selector($selector);
		if (count($groups)===0) return array();

		$matchedKeys = array();

		foreach ($groups as $chain)
		{
			if (count($chain)===0) return array();
			if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

			// Start from this node and narrow down one selector step at a
			// time (descendant selectors are handled iteratively).
			$frontier = array($this->_[HDOM_INFO_BEGIN]=>1);
			foreach ($chain as $step)
			{
				$reached = array();
				foreach ($frontier as $nodeKey=>$unused)
				{
					$origin = ($nodeKey===-1) ? $this->dom->root : $this->dom->nodes[$nodeKey];
					$origin->seek($step, $reached, $lowercase);
				}
				$frontier = $reached;
			}

			// Merge this group's hits; keys already present stay where they are.
			foreach ($frontier as $nodeKey=>$unused)
			{
				$matchedKeys[$nodeKey] = 1;
			}
		}

		// Order the result by node index.
		ksort($matchedKeys);

		$found = array();
		foreach ($matchedKeys as $nodeKey=>$unused)
		{
			$found[] = $this->dom->nodes[$nodeKey];
		}

		if (is_null($idx))
		{
			return $found;
		}
		if ($idx<0)
		{
			$idx = count($found) + $idx;
		}
		return (isset($found[$idx])) ? $found[$idx] : null;
	}
 683 | 
 684 | 	// seek for given conditions
 685 | 	// PaperG - added parameter to allow for case insensitive testing of the value of a selector.
	/**
	 * Collect the nodes inside this node that satisfy one selector step.
	 *
	 * @param array $selector One step as produced by parse_selector():
	 * array($tag, $key, $val, $exp, $no_key).
	 * @param array $ret Result set, passed by reference. Matching nodes are
	 * recorded as $ret[index] = 1, where index is the node's position in
	 * $this->dom->nodes (its HDOM_INFO_BEGIN value).
	 * @param bool $lowercase True to lowercase both the selector value and the
	 * node's value before they are compared.
	 * @return void Results are delivered through $ret.
	 */
	protected function seek($selector, &$ret, $lowercase=false)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		list($tag, $key, $val, $exp, $no_key) = $selector;

		// xpath index: a numeric key selects the key-th direct child whose
		// tag matches (counting from 1); at most one node is added.
		if ($tag && $key && is_numeric($key))
		{
			$count = 0;
			foreach ($this->children as $c)
			{
				if ($tag==='*' || $tag===$c->tag) {
					if (++$count==$key) {
						$ret[$c->_[HDOM_INFO_BEGIN]] = 1;
						return;
					}
				}
			}
			return;
		}

		// Determine the exclusive upper bound of the node-index range to scan.
		// Without an end index of its own, this node borrows the end index of
		// the nearest ancestor that has one, reduced by one for every
		// ancestor level skipped on the way.
		$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
		if ($end==0) {
			$parent = $this->parent;
			// NOTE(review): the null test comes after the isset() test; if the
			// chain ends in null, the addition below reads a property of null.
			while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
				$end -= 1;
				$parent = $parent->parent;
			}
			$end += $parent->_[HDOM_INFO_END];
		}

		// Scan every node registered after this node's begin index.
		for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
			$node = $this->dom->nodes[$i];

			$pass = true;

			// A bare "*" matches direct children only.
			if ($tag==='*' && !$key) {
				if (in_array($node, $this->children, true))
					$ret[$i] = 1;
				continue;
			}

			// compare tag
			if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
			// compare key: the attribute must be absent ($no_key) or present;
			// the pseudo-attribute "plaintext" is exempt from the presence test
			if ($pass && $key) {
				if ($no_key) {
					if (isset($node->attr[$key])) $pass=false;
				} else {
					if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
				}
			}
			// compare value (skipped for an empty value and for "*")
			if ($pass && $key && $val  && $val!=='*') {
				// A "plaintext" search compares against the node's plain text.
				if ($key == "plaintext") {
					// $node->plaintext actually returns $node->text();
					$nodeKeyValue = $node->text();
				} else {
					// this is a normal search, we want the value of that attribute of the tag.
					$nodeKeyValue = $node->attr[$key];
				}
				if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

				//PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
				if ($lowercase) {
					$check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
				} else {
					$check = $this->match($exp, $val, $nodeKeyValue);
				}
				if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

				// handle multiple class: when the whole attribute did not
				// match, try each space-separated class name on its own
				if (!$check && strcasecmp($key, 'class')===0) {
					foreach (explode(' ',$node->attr[$key]) as $k) {
						// Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
						if (!empty($k)) {
							if ($lowercase) {
								$check = $this->match($exp, strtolower($val), strtolower($k));
							} else {
								$check = $this->match($exp, $val, $k);
							}
							if ($check) break;
						}
					}
				}
				if (!$check) $pass = false;
			}
			if ($pass) $ret[$i] = 1;
			unset($node);
		}
		// It's passed by reference so this is actually what this function returns.
		if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
	}
 782 | 
	/**
	 * Test a node value against a selector value using a selector operator.
	 *
	 * @param string $exp Operator: "=", "!=", "^=", "$=" or "*="
	 * @param string $pattern Value taken from the selector
	 * @param string $value Value taken from the node
	 * @return bool|int Comparison result; the regex-based operators return
	 * preg_match()'s result. Unknown operators yield false.
	 */
	protected function match($exp, $pattern, $value) {
		global $debug_object;
		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

		// exact (in)equality
		if ($exp == '=') {
			return ($value===$pattern);
		}
		if ($exp == '!=') {
			return ($value!==$pattern);
		}
		// prefix / suffix, with the selector value taken literally
		if ($exp == '^=') {
			return preg_match("/^".preg_quote($pattern,'/')."/", $value);
		}
		if ($exp == '$=') {
			return preg_match("/".preg_quote($pattern,'/')."$/", $value);
		}
		// "contains": a value starting with "/" is used as a complete regular
		// expression, anything else is wrapped as a case-insensitive one
		if ($exp == '*=') {
			if ($pattern[0]=='/') {
				return preg_match($pattern, $value);
			}
			return preg_match("/".$pattern."/i", $value);
		}
		return false;
	}
 804 | 
	/**
	 * Split a selector string into groups of selector steps.
	 *
	 * @param string $selector_string CSS-like selector, e.g. "div.a, ul li[x=y]"
	 * @return array List of groups (one per comma-separated part); each group
	 * is a list of steps array($tag, $key, $val, $exp, $no_key), where $exp
	 * defaults to "=" and $no_key is true for "[!attr]" steps.
	 */
	protected function parse_selector($selector_string) {
		global $debug_object;
		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

		// Pattern of CSS selectors, modified from mootools.
		// Paperg: the colon was added to the attribute name, so that <tag attr:ibute="something"> is found properly.
		// Note: to read such an attribute you MUST use getAttribute, since $dom->x:y fails the PHP syntax check.
		// The "\[@?" part means an attribute specifier may start with an @ sign that is NOT captured by the expression.
		// Further study is required to determine whether this should be documented or removed.
		// Previous pattern, kept for reference:
//		$pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
		// Capture groups: 1 tag, 2 #id, 3 .class, 4 attribute name (optionally
		// prefixed with "!"), 5 operator, 6 attribute value, 7 separator.
		$pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
		// A trailing space is appended so that the last step also ends in a separator.
		preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
		if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

		$selectors = array();
		$result = array();
		//print_r($matches);

		foreach ($matches as $m) {
			$m[0] = trim($m[0]);
			// Skip empty matches and bare xpath separators.
			if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
			// for browser generated xpath
			if ($m[1]==='tbody') continue;

			// Later assignments override earlier ones: an explicit [attr...]
			// part replaces the key/value derived from #id or .class.
			list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
			if (!empty($m[2])) {$key='id'; $val=$m[2];}
			if (!empty($m[3])) {$key='class'; $val=$m[3];}
			if (!empty($m[4])) {$key=$m[4];}
			if (!empty($m[5])) {$exp=$m[5];}
			if (!empty($m[6])) {$val=$m[6];}

			// convert to lowercase
			if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
			//elements that do NOT have the specified attribute
			if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

			$result[] = array($tag, $key, $val, $exp, $no_key);
			// A comma separator closes the current group.
			if (trim($m[7])===',') {
				$selectors[] = $result;
				$result = array();
			}
		}
		// Flush the last group, which is not followed by a comma.
		if (count($result)>0)
			$selectors[] = $result;
		return $selectors;
	}
 852 | 
	/**
	 * Magic getter for attributes and the virtual text properties.
	 *
	 * An attribute that is set (and not null) is returned after passing through
	 * convert_text(). Otherwise the names 'outertext', 'innertext', 'plaintext'
	 * and 'xmltext' are mapped to the corresponding methods. For any other name
	 * the result tells whether the attribute key exists at all.
	 *
	 * @param string $name Attribute or virtual property name
	 * @return mixed Converted attribute value, rendered text, or a boolean
	 */
	function __get($name)
	{
		if (isset($this->attr[$name])) {
			return $this->convert_text($this->attr[$name]);
		}

		// Loose comparison on purpose: it mirrors switch semantics.
		if ($name == 'outertext') {
			return $this->outertext();
		}
		if ($name == 'innertext') {
			return $this->innertext();
		}
		if ($name == 'plaintext') {
			return $this->text();
		}
		if ($name == 'xmltext') {
			return $this->xmltext();
		}

		return array_key_exists($name, $this->attr);
	}
 868 | 
	/**
	 * Magic setter for attributes and the virtual text properties.
	 *
	 * 'outertext' and 'innertext' are written into the node's internal info
	 * array; every other name is stored as an attribute. When the attribute is
	 * not set yet, one spacing entry and one double-quote entry are appended to
	 * the info array alongside it.
	 *
	 * @param string $name Attribute or virtual property name
	 * @param mixed $value Value to store
	 * @return mixed The assigned value for 'outertext'/'innertext', otherwise
	 * nothing
	 */
	function __set($name, $value)
	{
		global $debug_object;
		if (is_object($debug_object)) {
			$debug_object->debug_log_entry(1);
		}

		// Loose comparison on purpose: it mirrors switch semantics.
		if ($name == 'outertext') {
			return $this->_[HDOM_INFO_OUTER] = $value;
		}

		if ($name == 'innertext') {
			// Nodes that carry a text entry keep their content there
			$slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
			return $this->_[$slot] = $value;
		}

		$is_new_attribute = !isset($this->attr[$name]);
		if ($is_new_attribute) {
			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
		}

		$this->attr[$name] = $value;
	}
 888 | 
	/**
	 * Magic isset for attributes and the virtual text properties.
	 *
	 * 'outertext', 'innertext' and 'plaintext' always count as set. An
	 * attribute counts as set as soon as its key exists, whatever its value.
	 *
	 * @param string $name Attribute or virtual property name
	 * @return bool
	 */
	function __isset($name)
	{
		// Loose comparison on purpose: it mirrors switch semantics.
		if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
			return true;
		}

		//no value attr: nowrap, checked selected...
		if (array_key_exists($name, $this->attr)) {
			return true;
		}

		return isset($this->attr[$name]);
	}
 900 | 
	/**
	 * Magic unset: remove an attribute from the node.
	 *
	 * Only attributes that are set with a non-null value are removed; a key
	 * holding null is left in place.
	 *
	 * @param string $name Attribute name
	 * @return void
	 */
	function __unset($name) {
		if (!isset($this->attr[$name])) {
			return;
		}

		unset($this->attr[$name]);
	}
 905 | 
	/**
	 * Convert text from the document's charset to the target charset.
	 *
	 * PaperG - the conversion only happens when both charsets are known on the
	 * owning DOM and differ from each other. When the target is UTF-8 and the
	 * text already passes is_utf8(), it is left untouched, because the reported
	 * source encoding may have been wrong. If iconv() fails, the unconverted
	 * text is kept instead of handing `false` to the caller.
	 *
	 * For a UTF-8 target a byte order mark at the start or end of the result
	 * is removed.
	 *
	 * @param string $text Text to convert
	 * @return string Converted text
	 */
	function convert_text($text)
	{
		global $debug_object;
		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

		$converted_text = $text;

		$sourceCharset = "";
		$targetCharset = "";

		if ($this->dom)
		{
			$sourceCharset = strtoupper($this->dom->_charset);
			$targetCharset = strtoupper($this->dom->_target_charset);
		}
		if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

		if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
		{
			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
			$already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
			if (!$already_utf8)
			{
				$iconv_result = iconv($sourceCharset, $targetCharset, $text);
				// iconv() returns false on failure (unknown charset, illegal
				// byte sequence); fall back to the original text in that case.
				if ($iconv_result !== false)
				{
					$converted_text = $iconv_result;
				}
			}
		}

		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
		if ($targetCharset == 'UTF-8')
		{
			if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
			{
				$converted_text = substr($converted_text, 3);
			}
			if (substr($converted_text, -3) == "\xef\xbb\xbf")
			{
				$converted_text = substr($converted_text, 0, -3);
			}
		}

		return $converted_text;
	}
 952 | 
	/**
	 * Returns true if $str is structurally valid UTF-8 and false otherwise.
	 *
	 * Every byte >= 0x80 must be a lead byte followed by the right number of
	 * continuation bytes (0x80-0xBF). A continuation byte in lead position,
	 * including 0x80 itself, makes the string invalid.
	 *
	 * Note: the check is structural only. The historic 5 and 6 byte forms are
	 * accepted and overlong encodings are not detected.
	 *
	 * @param mixed $str String to be tested
	 * @return boolean
	 */
	static function is_utf8($str)
	{
		$len = strlen($str);
		for ($i = 0; $i < $len; $i++)
		{
			$c = ord($str[$i]);

			// 0x00-0x7F: single byte character
			if ($c < 128) continue;

			// Lead byte: derive the total length of the sequence
			if ($c >= 254) return false;
			elseif ($c >= 252) $bits = 6;
			elseif ($c >= 248) $bits = 5;
			elseif ($c >= 240) $bits = 4;
			elseif ($c >= 224) $bits = 3;
			elseif ($c >= 192) $bits = 2;
			else return false; // 0x80-0xBF: stray continuation byte

			// Sequence must not run past the end of the string
			if (($i + $bits) > $len) return false;

			while ($bits > 1)
			{
				$i++;
				$b = ord($str[$i]);
				if ($b < 128 || $b > 191) return false;
				$bits--;
			}
		}
		return true;
	}
 988 | 	/*
 989 | 	function is_utf8($string)
 990 | 	{
 991 | 		//this is buggy
 992 | 		return (utf8_encode(utf8_decode($string)) == $string);
 993 | 	}
 994 | 	*/
 995 | 
	/**
	 * Function to try a few tricks to determine the displayed size of an img on the page.
	 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
	 *
	 * The 'width' and 'height' attributes of the tag are used as they are. A
	 * dimension that is still unknown is then looked up in the inline style,
	 * where only integer pixel values ("120px", also "0px") are accepted; the
	 * number is returned without the unit.
	 *
	 * @author John Schlick
	 * @version April 19 2012
	 * @return array|false an array containing the 'height' and 'width' of the
	 * image on the page, each being -1 if we can't figure it out; false when
	 * the node is not an img tag.
	 */
	function get_display_size()
	{
		if ($this->tag !== 'img')
		{
			return false;
		}

		// -1 marks a dimension as unknown. The key order is part of the result.
		$size = array('height' => -1, 'width' => -1);
		$dimensions = array('width', 'height');

		// See if there is a height or width attribute in the tag itself.
		foreach ($dimensions as $dimension)
		{
			if (isset($this->attr[$dimension]))
			{
				$size[$dimension] = $this->attr[$dimension];
			}
		}

		// Now look for an inline style.
		if (isset($this->attr['style']))
		{
			// Thanks to user gnarf from stackoverflow for this regular expression.
			$attributes = array();
			preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
			foreach ($matches as $match) {
				// The value group also swallows blanks in front of the ";"
				$attributes[$match[1]] = trim($match[2]);
			}

			foreach ($dimensions as $dimension)
			{
				// The style only fills in what the tag attributes left open.
				if ($size[$dimension] != -1 || !isset($attributes[$dimension]))
				{
					continue;
				}

				// check that the last two characters are px (pixels)
				if (strtolower(substr($attributes[$dimension], -2)) != 'px')
				{
					continue;
				}

				$proposed = trim(substr($attributes[$dimension], 0, -2));
				// Now make sure that it's an integer and not something stupid.
				// Compare against false explicitly, so that 0 is accepted.
				if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
				{
					$size[$dimension] = $proposed;
				}
			}
		}

		// Future enhancement:
		// Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

		// Far future enhancement
		// Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
		// Note that in this case, the class or id will have the img subselector for it to apply to the image.

		// ridiculously far future development
		// If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

		return $size;
	}
1083 | 
1084 | 	// camel naming conventions
1085 | 	function getAllAttributes() {return $this->attr;}
1086 | 	function getAttribute($name) {return $this->__get($name);}
1087 | 	function setAttribute($name, $value) {$this->__set($name, $value);}
1088 | 	function hasAttribute($name) {return $this->__isset($name);}
1089 | 	function removeAttribute($name) {$this->__set($name, null);}
1090 | 	function getElementById($id) {return $this->find("#$id", 0);}
1091 | 	function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
1092 | 	function getElementByTagName($name) {return $this->find($name, 0);}
1093 | 	function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
1094 | 	function parentNode() {return $this->parent();}
1095 | 	function childNodes($idx=-1) {return $this->children($idx);}
1096 | 	function firstChild() {return $this->first_child();}
1097 | 	function lastChild() {return $this->last_child();}
1098 | 	function nextSibling() {return $this->next_sibling();}
1099 | 	function previousSibling() {return $this->prev_sibling();}
1100 | 	function hasChildNodes() {return $this->has_child();}
1101 | 	function nodeName() {return $this->tag;}
1102 | 	function appendChild($node) {$node->parent($this); return $node;}
1103 | 
1104 | }
1105 | 
1106 | /**
1107 |  * simple html dom parser
1108 |  * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
1109 |  * Paperg - change $size from protected to public so we can easily access it
1110 |  * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not.  Default is to NOT trust it.
1111 |  *
1112 |  * @package PlaceLocalInclude
1113 |  */
1114 | class simple_html_dom
1115 | {
1116 | 	/**
1117 | 	 * The root node of the document
1118 | 	 *
1119 | 	 * @var object
1120 | 	 */
1121 | 	public $root = null;
1122 | 
1123 | 	/**
1124 | 	 * List of nodes in the current DOM
1125 | 	 *
1126 | 	 * @var array
1127 | 	 */
1128 | 	public $nodes = array();
1129 | 
1130 | 	/**
1131 | 	 * Callback function to run for each element in the DOM.
1132 | 	 *
1133 | 	 * @var callable|null
1134 | 	 */
1135 | 	public $callback = null;
1136 | 
1137 | 	/**
1138 | 	 * Indicates how tags and attributes are matched
1139 | 	 *
1140 | 	 * @var bool When set to **true** tags and attributes will be converted to
1141 | 	 * lowercase before matching.
1142 | 	 */
1143 | 	public $lowercase = false;
1144 | 
1145 | 	/**
1146 | 	 * Original document size
1147 | 	 *
1148 | 	 * Holds the original document size.
1149 | 	 *
1150 | 	 * @var int
1151 | 	 */
1152 | 	public $original_size;
1153 | 
1154 | 	/**
1155 | 	 * Current document size
1156 | 	 *
1157 | 	 * Holds the current document size. The document size is determined by the
1158 | 	 * string length of ({@see simple_html_dom::$doc}).
1159 | 	 *
1160 | 	 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1161 | 	 *
1162 | 	 * @var int
1163 | 	 * */
1164 | 	public $size;
1165 | 
1166 | 	/**
1167 | 	 * Current position in the document
1168 | 	 *
1169 | 	 * @var int
1170 | 	 */
1171 | 	protected $pos;
1172 | 
1173 | 	/**
1174 | 	 * The document
1175 | 	 *
1176 | 	 * @var string
1177 | 	 */
1178 | 	protected $doc;
1179 | 
1180 | 	/**
1181 | 	 * Current character
1182 | 	 *
1183 | 	 * Holds the current character at position {@see simple_html_dom::$pos} in
1184 | 	 * the document {@see simple_html_dom::$doc}
1185 | 	 *
1186 | 	 * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)`
1187 | 	 *
1188 | 	 * @var string
1189 | 	 */
1190 | 	protected $char;
1191 | 
	/**
	 * Running counter used to mark node positions
	 *
	 * Set to 1 by prepare(), incremented by parse() for every text node it
	 * creates, and copied into a node's HDOM_INFO_END entry when that node is
	 * closed.
	 *
	 * @var int
	 */
	protected $cursor;
1193 | 
1194 | 	/**
1195 | 	 * Parent node of the next node detected by the parser
1196 | 	 *
1197 | 	 * @var object
1198 | 	 */
1199 | 	protected $parent;
	/**
	 * Storage for document parts taken out before parsing
	 *
	 * Reset to an empty array by prepare() and unset by clear().
	 * NOTE(review): presumably filled by remove_noise(), which is defined
	 * outside this part of the file - confirm there.
	 *
	 * @var array
	 */
	protected $noise = array();
1201 | 
1202 | 	/**
1203 | 	 * Tokens considered blank in HTML
1204 | 	 *
1205 | 	 * @var string
1206 | 	 */
1207 | 	protected $token_blank = " \t\r\n";
1208 | 
1209 | 	/**
1210 | 	 * Tokens to identify the equal sign for attributes, stopping either at the
1211 | 	 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1212 | 	 * "<html>")
1213 | 	 *
1214 | 	 * @var string
1215 | 	 */
1216 | 	protected $token_equal = ' =/>';
1217 | 
1218 | 	/**
1219 | 	 * Tokens to identify the end of a tag name. A tag name either ends on the
1220 | 	 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1221 | 	 *
1222 | 	 * @var string
1223 | 	 */
1224 | 	protected $token_slash = " />\r\n\t";
1225 | 
1226 | 	/**
1227 | 	 * Tokens to identify the end of an attribute
1228 | 	 *
1229 | 	 * @var string
1230 | 	 */
1231 | 	protected $token_attr = ' >';
1232 | 
1233 | 	// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
1234 | 	public $_charset = '';
1235 | 	public $_target_charset = '';
1236 | 
1237 | 	/**
1238 | 	 * Innertext for <br> elements
1239 | 	 *
1240 | 	 * @var string
1241 | 	 */
1242 | 	protected $default_br_text = "";
1243 | 
1244 | 	/**
1245 | 	 * Suffix for <span> elements
1246 | 	 *
1247 | 	 * @var string
1248 | 	 */
1249 | 	public $default_span_text = "";
1250 | 
1251 | 	/**
1252 | 	 * Defines a list of self-closing tags (Void elements) according to the HTML
1253 | 	 * Specification
1254 | 	 *
1255 | 	 * _Remarks_:
1256 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1257 | 	 * performance about 30%
1258 | 	 * - Sort elements by name for better readability!
1259 | 	 *
1260 | 	 * @link https://www.w3.org/TR/html HTML Specification
1261 | 	 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1262 | 	 */
1263 | 	protected $self_closing_tags = array(
1264 | 		'area'=>1,
1265 | 		'base'=>1,
1266 | 		'br'=>1,
1267 | 		'col'=>1,
1268 | 		'embed'=>1,
1269 | 		'hr'=>1,
1270 | 		'img'=>1,
1271 | 		'input'=>1,
1272 | 		'link'=>1,
1273 | 		'meta'=>1,
1274 | 		'param'=>1,
1275 | 		'source'=>1,
1276 | 		'track'=>1,
1277 | 		'wbr'=>1
1278 | 	);
1279 | 
1280 | 	/**
1281 | 	 * Defines a list of tags which - if closed - close all optional closing
1282 | 	 * elements within if they haven't been closed yet. (So, an element where
1283 | 	 * neither opening nor closing tag is omissible consistently closes every
1284 | 	 * optional closing element within)
1285 | 	 *
1286 | 	 * _Remarks_:
1287 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1288 | 	 * performance about 30%
1289 | 	 * - Sort elements by name for better readability!
1290 | 	 */
1291 | 	protected $block_tags = array(
1292 | 		'body'=>1,
1293 | 		'div'=>1,
1294 | 		'form'=>1,
1295 | 		'root'=>1,
1296 | 		'span'=>1,
1297 | 		'table'=>1
1298 | 	);
1299 | 
1300 | 	/**
1301 | 	 * Defines elements whose end tag is omissible.
1302 | 	 *
1303 | 	 * * key = Name of an element whose end tag is omissible.
1304 | 	 * * value = Names of elements whose end tag is omissible, that are closed
1305 | 	 * by the current element.
1306 | 	 *
1307 | 	 * _Remarks_:
1308 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1309 | 	 * performance about 30%
1310 | 	 * - Sort elements by name for better readability!
1311 | 	 *
1312 | 	 * **Example**
1313 | 	 *
1314 | 	 * An `li` element’s end tag may be omitted if the `li` element is immediately
1315 | 	 * followed by another `li` element. To do that, add following element to the
1316 | 	 * array:
1317 | 	 *
1318 | 	 * ```php
1319 | 	 * 'li' => array('li'),
1320 | 	 * ```
1321 | 	 *
1322 | 	 * With this, the following two examples are considered equal. Note that the
1323 | 	 * second example is missing the closing tags on `li` elements.
1324 | 	 *
1325 | 	 * ```html
1326 | 	 * <ul><li>First Item</li><li>Second Item</li></ul>
1327 | 	 * ```
1328 | 	 *
1329 | 	 * <ul><li>First Item</li><li>Second Item</li></ul>
1330 | 	 *
1331 | 	 * ```html
1332 | 	 * <ul><li>First Item<li>Second Item</ul>
1333 | 	 * ```
1334 | 	 *
1335 | 	 * <ul><li>First Item<li>Second Item</ul>
1336 | 	 *
1337 | 	 * @var array A two-dimensional array where the key is the name of an
1338 | 	 * element whose end tag is omissible and the value is an array of elements
1339 | 	 * whose end tag is omissible, that are closed by the current element.
1340 | 	 *
1341 | 	 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1342 | 	 *
	 * @todo The implementation of optional closing tags doesn't work in all cases
	 * because it only considers elements that close other optional closing
1345 | 	 * tags, not taking into account that some (non-blocking) tags should close
1346 | 	 * these optional closing tags. For example, the end tag for "p" is omissible
1347 | 	 * and can be closed by an "address" element, whose end tag is NOT omissible.
1348 | 	 * Currently a "p" element without closing tag stops at the next "p" element
1349 | 	 * or blocking tag, even if it contains other elements.
1350 | 	 *
1351 | 	 * @todo Known sourceforge issue #2977341
1352 | 	 * B tags that are not closed cause us to return everything to the end of
1353 | 	 * the document.
1354 | 	 */
1355 | 	protected $optional_closing_tags = array(
1356 | 		'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1357 | 		'dd'=>array('dd'=>1, 'dt'=>1),
1358 | 		'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1359 | 		'dt'=>array('dd'=>1, 'dt'=>1),
1360 | 		'li'=>array('li'=>1),
1361 | 		'optgroup'=>array('optgroup'=>1, 'option'=>1),
1362 | 		'option'=>array('optgroup'=>1, 'option'=>1),
1363 | 		'p'=>array('p'=>1),
1364 | 		'rp'=>array('rp'=>1, 'rt'=>1),
1365 | 		'rt'=>array('rp'=>1, 'rt'=>1),
1366 | 		'td'=>array('td'=>1, 'th'=>1),
1367 | 		'th'=>array('td'=>1, 'th'=>1),
1368 | 		'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1),
1369 | 	);
1370 | 
1371 | 	function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
1372 | 	{
1373 | 		if ($str)
1374 | 		{
1375 | 			if (preg_match("/^http:\/\//i",$str) || is_file($str))
1376 | 			{
1377 | 				$this->load_file($str);
1378 | 			}
1379 | 			else
1380 | 			{
1381 | 				$this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
1382 | 			}
1383 | 		}
1384 | 		// Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
1385 | 		if (!$forceTagsClosed) {
1386 | 			$this->optional_closing_array=array();
1387 | 		}
1388 | 		$this->_target_charset = $target_charset;
1389 | 	}
1390 | 
	/**
	 * Release everything held by this object by calling clear().
	 */
	function __destruct()
	{
		$this->clear();
	}
1395 | 
1396 | 	// load html from string
1397 | 	function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
1398 | 	{
1399 | 		global $debug_object;
1400 | 
1401 | 		// prepare
1402 | 		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1403 | 
1404 | 		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1405 | 		// Script tags removal now preceeds style tag removal.
1406 | 		// strip out <script> tags
1407 | 		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1408 | 		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1409 | 
1410 | 		// strip out the \r \n's if we are told to.
1411 | 		if ($stripRN) {
1412 | 			$this->doc = str_replace("\r", " ", $this->doc);
1413 | 			$this->doc = str_replace("\n", " ", $this->doc);
1414 | 
1415 | 			// set the length of content since we have changed it.
1416 | 			$this->size = strlen($this->doc);
1417 | 		}
1418 | 
1419 | 		// strip out cdata
1420 | 		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1421 | 		// strip out comments
1422 | 		$this->remove_noise("'<!--(.*?)-->'is");
1423 | 		// strip out <style> tags
1424 | 		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
1425 | 		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
1426 | 		// strip out preformatted tags
1427 | 		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
1428 | 		// strip out server side scripts
1429 | 		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
1430 | 
1431 | 		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
1432 | 			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
1433 | 		}
1434 | 
1435 | 		// parsing
1436 | 		$this->parse();
1437 | 		// end
1438 | 		$this->root->_[HDOM_INFO_END] = $this->cursor;
1439 | 		$this->parse_charset();
1440 | 
1441 | 		// make load function chainable
1442 | 		return $this;
1443 | 
1444 | 	}
1445 | 
1446 | 	// load html from file
1447 | 	function load_file()
1448 | 	{
1449 | 		$args = func_get_args();
1450 | 
1451 | 		if($doc = call_user_func_array('file_get_contents', $args) !== false) {
1452 | 			$this->load($doc, true);
1453 | 		} else {
1454 | 			return false;
1455 | 		}
1456 | 	}
1457 | 
	/**
	 * Set the callback function
	 *
	 * The callable is only stored in {@see simple_html_dom::$callback} here;
	 * it is not invoked by this method.
	 *
	 * @param callable $function_name Callback function to run for each element
	 * in the DOM.
	 * @return void
	 */
	function set_callback($function_name)
	{
		$this->callback = $function_name;
	}
1469 | 
	/**
	 * Remove callback function
	 *
	 * Resets {@see simple_html_dom::$callback} to null.
	 *
	 * @return void
	 */
	function remove_callback()
	{
		$this->callback = null;
	}
1479 | 
1480 | 	// save dom as string
1481 | 	function save($filepath='')
1482 | 	{
1483 | 		$ret = $this->root->innertext();
1484 | 		if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
1485 | 		return $ret;
1486 | 	}
1487 | 
	/**
	 * Find DOM nodes by CSS selector, starting at the root node.
	 *
	 * Paperg - allow us to specify that we want case insensitive testing of
	 * the value of the selector.
	 *
	 * @param string $selector CSS selector
	 * @param int|null $idx Index handed to the root node's find()
	 * @param bool $lowercase Case insensitive flag handed to the root node's
	 * find()
	 * @return mixed Whatever the root node's find() returns
	 */
	function find($selector, $idx=null, $lowercase=false)
	{
		return $this->root->find($selector, $idx, $lowercase);
	}
1494 | 
1495 | 	// clean up memory due to php5 circular references memory leak...
1496 | 	function clear()
1497 | 	{
1498 | 		foreach ($this->nodes as $n) {$n->clear(); $n = null;}
1499 | 		// This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
1500 | 		if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;}
1501 | 		if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
1502 | 		if (isset($this->root)) {$this->root->clear(); unset($this->root);}
1503 | 		unset($this->doc);
1504 | 		unset($this->noise);
1505 | 	}
1506 | 
	/**
	 * Dump the node tree by calling dump() on the root node.
	 *
	 * @param bool $show_attr Handed to the root node's dump()
	 * @return void
	 */
	function dump($show_attr=true)
	{
		$this->root->dump($show_attr);
	}
1511 | 
1512 | 	// prepare HTML data and init everything
1513 | 	protected function prepare($str, $lowercase=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1514 | 	{
1515 | 		$this->clear();
1516 | 
1517 | 		$this->doc = trim($str);
1518 | 		$this->size = strlen($this->doc);
1519 | 		$this->original_size = $this->size; // Save the original size of the html that we got in.  It might be useful to someone.
1520 | 		$this->pos = 0;
1521 | 		$this->cursor = 1;
1522 | 		$this->noise = array();
1523 | 		$this->nodes = array();
1524 | 		$this->lowercase = $lowercase;
1525 | 		$this->default_br_text = $defaultBRText;
1526 | 		$this->default_span_text = $defaultSpanText;
1527 | 		$this->root = new simple_html_dom_node($this);
1528 | 		$this->root->tag = 'root';
1529 | 		$this->root->_[HDOM_INFO_BEGIN] = -1;
1530 | 		$this->root->nodetype = HDOM_TYPE_ROOT;
1531 | 		$this->parent = $this->root;
1532 | 		if ($this->size>0) $this->char = $this->doc[0];
1533 | 	}
1534 | 
1535 | 	/**
1536 | 	 * Parse HTML content
1537 | 	 *
1538 | 	 * @return bool True on success
1539 | 	 */
1540 | 	protected function parse()
1541 | 	{
1542 | 		while (true) {
1543 | 			// Read next tag if there is no text between current position and the
1544 | 			// next opening tag.
1545 | 			if (($s = $this->copy_until_char('<'))==='')
1546 | 			{
1547 | 				if($this->read_tag()) {
1548 | 					continue;
1549 | 				} else {
1550 | 					return true;
1551 | 				}
1552 | 			}
1553 | 
1554 | 			// Add a text node for text between tags
1555 | 			$node = new simple_html_dom_node($this);
1556 | 			++$this->cursor;
1557 | 			$node->_[HDOM_INFO_TEXT] = $s;
1558 | 			$this->link_nodes($node, false);
1559 | 		}
1560 | 	}
1561 | 
1562 | 	// PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
1563 | 	// NOTE:  IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
1564 | 	// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
1565 | 	protected function parse_charset()
1566 | 	{
1567 | 		global $debug_object;
1568 | 
1569 | 		$charset = null;
1570 | 
1571 | 		if (function_exists('get_last_retrieve_url_contents_content_type'))
1572 | 		{
1573 | 			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
1574 | 			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
1575 | 			if ($success)
1576 | 			{
1577 | 				$charset = $matches[1];
1578 | 				if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
1579 | 			}
1580 | 
1581 | 		}
1582 | 
1583 | 		if (empty($charset))
1584 | 		{
1585 | 			$el = $this->root->find('meta[http-equiv=Content-Type]',0, true);
1586 | 			if (!empty($el))
1587 | 			{
1588 | 				$fullvalue = $el->content;
1589 | 				if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}
1590 | 
1591 | 				if (!empty($fullvalue))
1592 | 				{
1593 | 					$success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
1594 | 					if ($success)
1595 | 					{
1596 | 						$charset = $matches[1];
1597 | 					}
1598 | 					else
1599 | 					{
1600 | 						// If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1601 | 						if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
1602 | 						$charset = 'ISO-8859-1';
1603 | 					}
1604 | 				}
1605 | 			}
1606 | 		}
1607 | 
1608 | 		// If we couldn't find a charset above, then lets try to detect one based on the text we got...
1609 | 		if (empty($charset))
1610 | 		{
1611 | 			// Use this in case mb_detect_charset isn't installed/loaded on this machine.
1612 | 			$charset = false;
1613 | 			if (function_exists('mb_detect_encoding'))
1614 | 			{
1615 | 				// Have php try to detect the encoding from the text given to us.
1616 | 				$charset = mb_detect_encoding($this->doc . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1617 | 				if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
1618 | 			}
1619 | 
1620 | 			// and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1621 | 			if ($charset === false)
1622 | 			{
1623 | 				if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
1624 | 				$charset = 'UTF-8';
1625 | 			}
1626 | 		}
1627 | 
1628 | 		// Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1629 | 		if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1630 | 		{
1631 | 			if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
1632 | 			$charset = 'CP1252';
1633 | 		}
1634 | 
1635 | 		if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}
1636 | 
1637 | 		return $this->_charset = $charset;
1638 | 	}
1639 | 
1640 | 	/**
1641 | 	 * Parse tag from current document position.
1642 | 	 *
1643 | 	 * @return bool True if a tag was found, false otherwise
1644 | 	 */
	protected function read_tag()
	{
		// Reads exactly one tag at the current position. Handled in this order:
		// an end tag ("</x>"), a "<!...>" construct (comment, doctype, CDATA),
		// malformed start tags, and finally a regular start tag with attributes.
		// Malformed input is kept as raw text (HDOM_INFO_TEXT) on a node that is
		// attached through link_nodes().

		// Set end position if no further tags found
		if ($this->char!=='<')
		{
			$this->root->_[HDOM_INFO_END] = $this->cursor;
			return false;
		}
		// Remember where the "<" was found; used below for $node->tag_start and
		// to rebuild the raw text of a mismatched tag
		$begin_tag_pos = $this->pos;
		$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

		// end tag
		if ($this->char==='/')
		{
			$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

			// Skip whitespace in end tags (i.e. in "</   html>")
			$this->skip($this->token_blank);
			$tag = $this->copy_until_char('>');

			// Skip attributes in end tags
			if (($pos = strpos($tag, ' '))!==false)
				$tag = substr($tag, 0, $pos);

			// Tag names are compared case-insensitively
			$parent_lower = strtolower($this->parent->tag);
			$tag_lower = strtolower($tag);

			// The end tag is supposed to close the parent tag. Handle situations
			// when it doesn't
			if ($parent_lower!==$tag_lower)
			{
				// Parent tag does not have to be closed necessarily (optional closing tag)
				// Current tag is a block tag, so it may close an ancestor
				if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
				{
					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
					$org_parent = $this->parent;

					// Traverse ancestors to find a matching opening tag
					// Stop at root node
					while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
						$this->parent = $this->parent->parent;

					// If we don't have a match add current tag as text node
					if (strtolower($this->parent->tag)!==$tag_lower) {
						$this->parent = $org_parent; // restore original parent
						// Unlike the next branch, this one steps up one level (when a
						// grandparent exists) before recording the end position
						if ($this->parent->parent) $this->parent = $this->parent->parent;
						$this->parent->_[HDOM_INFO_END] = $this->cursor;
						return $this->as_text_node($tag);
					}
				}
				// Grandparent exists and current tag is a block tag, so our parent doesn't have an end tag
				else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
				{
					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
					$org_parent = $this->parent;

					// Traverse ancestors to find a matching opening tag
					// Stop at root node
					while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
						$this->parent = $this->parent->parent;

					// If we don't have a match add current tag as text node
					if (strtolower($this->parent->tag)!==$tag_lower)
					{
						$this->parent = $org_parent; // restore original parent
						$this->parent->_[HDOM_INFO_END] = $this->cursor;
						return $this->as_text_node($tag);
					}
				}
				// Grandparent exists and current tag closes it
				else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
				{
					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
					$this->parent = $this->parent->parent;
				}
				else // Random tag, add as text node
					return $this->as_text_node($tag);
			}

			// Set end position of parent tag to current cursor position
			$this->parent->_[HDOM_INFO_END] = $this->cursor;
			// Continue one level up; a node without a parent stays the current one
			if ($this->parent->parent) $this->parent = $this->parent->parent;

			$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
			return true;
		}

		// start tag
		$node = new simple_html_dom_node($this);
		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
		++$this->cursor;
		$tag = $this->copy_until($this->token_slash); // Get tag name
		$node->tag_start = $begin_tag_pos;

		// doctype, cdata & comments...
		// <!DOCTYPE html>
		// <![CDATA[ ... ]]>
		// <!-- Comment -->
		if (isset($tag[0]) && $tag[0]==='!') {
			// Keep the construct verbatim: "<" + what was read so far + everything up to ">"
			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

			if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { // Comment ("<!--")
				$node->nodetype = HDOM_TYPE_COMMENT;
				$node->tag = 'comment';
			} else { // Could be doctype or CDATA but we don't care
				$node->nodetype = HDOM_TYPE_UNKNOWN;
				$node->tag = 'unknown';
			}
			// Append the closing ">" only if one was actually found
			if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
			$this->link_nodes($node, true);
			$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
			return true;
		}

		// The start tag cannot contain another start tag, if so add as text
		// i.e. "<<html>"
		// NOTE(review): because "!==" binds tighter than "=", $pos receives the
		// boolean result of the comparison rather than the offset. $pos is not
		// read in this branch, so the condition still means "$tag contains '<'".
		if ($pos=strpos($tag, '<')!==false) {
			// Prefix "<" and drop the last character of what was read
			$tag = '<' . substr($tag, 0, -1);
			$node->_[HDOM_INFO_TEXT] = $tag;
			$this->link_nodes($node, false);
			$this->char = $this->doc[--$this->pos]; // prev
			return true;
		}

		// Handle invalid tag names (i.e. "<html#doc>")
		if (!preg_match("/^\w[\w:-]*$/", $tag)) {
			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

			// Next char is the beginning of a new tag, don't touch it.
			if ($this->char==='<') {
				$this->link_nodes($node, false);
				return true;
			}

			// Next char closes current tag, add and be done with it.
			if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
			$this->link_nodes($node, false);
			$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
			return true;
		}

		// begin tag, add new node
		$node->nodetype = HDOM_TYPE_ELEMENT;
		$tag_lower = strtolower($tag);
		// The stored tag name is lowercased only when the parser runs in lowercase mode
		$node->tag = ($this->lowercase) ? $tag_lower : $tag;

		// handle optional closing tags
		if (isset($this->optional_closing_tags[$tag_lower]) )
		{
			// Traverse ancestors to close all optional closing tags
			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
			{
				$this->parent->_[HDOM_INFO_END] = 0; // No end tag
				$this->parent = $this->parent->parent;
			}
			// link_nodes() further down assigns $node->parent again from $this->parent
			$node->parent = $this->parent;
		}

		// Holds the position reached by the previous loop iteration (see check below)
		$guard = 0; // prevent infinity loop
		$space = array($this->copy_skip($this->token_blank), '', ''); // [0] Space between tag and first attribute

		// attributes
		do
		{
			// Everything until the first equal sign should be the attribute name
			$name = $this->copy_until($this->token_equal);

			// Nothing was read, input is not exhausted and no whitespace came
			// before: there are no (more) attributes
			if ($name==='' && $this->char!==null && $space[0]==='')
			{
				break;
			}

			// Position did not move since the previous iteration: step one
			// character forward and re-evaluate the loop condition
			if ($guard===$this->pos) // Escape infinite loop
			{
				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
				continue;
			}
			$guard = $this->pos;

			// handle endless '<'
			if ($this->pos>=$this->size-1 && $this->char!=='>') { // Out of bounds before the tag ended
				// Turn the unfinished tag into a text node holding what was read
				$node->nodetype = HDOM_TYPE_TEXT;
				$node->_[HDOM_INFO_END] = 0;
				$node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
				$node->tag = 'text';
				$this->link_nodes($node, false);
				return true;
			}

			// handle mismatch '<'
			if ($this->doc[$this->pos-1]=='<') { // Attributes cannot start after opening tag
				$node->nodetype = HDOM_TYPE_TEXT;
				$node->tag = 'text';
				$node->attr = array();
				$node->_[HDOM_INFO_END] = 0;
				// Raw text from the original "<" up to (not including) the char before the current position
				$node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
				// Minus two, then "next": the net effect is moving back by one character
				$this->pos -= 2;
				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
				$this->link_nodes($node, false);
				return true;
			}

			if ($name!=='/' && $name!=='') { // this is a attribute name
				$space[1] = $this->copy_skip($this->token_blank); // [1] Whitespace after attribute name
				$name = $this->restore_noise($name); // might be a noisy name
				if ($this->lowercase) $name = strtolower($name);
				if ($this->char==='=') { // attribute with value
					$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
					$this->parse_attr($node, $name, $space); // get attribute value
				}
				else {
					//no value attr: nowrap, checked selected...
					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
					$node->attr[$name] = true;
					// Step back one character unless the tag ends right here
					if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
				}
				// Spacing info is recorded once per processed attribute
				$node->_[HDOM_INFO_SPACE][] = $space;
				$space = array($this->copy_skip($this->token_blank), '', ''); // prepare for next attribute
			}
			else // no more attributes
				break;
		} while ($this->char!=='>' && $this->char!=='/'); // go until the tag ended

		$this->link_nodes($node, true);
		// Whitespace collected after the last attribute (or after the tag name)
		$node->_[HDOM_INFO_ENDSPACE] = $space[0];

		// handle empty tags (i.e. "<div/>")
		if ($this->copy_until_char('>')==='/')
		{
			$node->_[HDOM_INFO_ENDSPACE] .= '/';
			$node->_[HDOM_INFO_END] = 0;
		}
		else
		{
			// reset parent
			// Tags listed in self_closing_tags never become the current parent
			if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
		}
		$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

		// If it's a BR tag, we need to set its text to the default text.
		// This way when we see it in plaintext, we can generate formatting that the user wants.
		// since a br tag never has sub nodes, this works well.
		if ($node->tag == "br")
		{
			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
		}

		return true;
	}
1895 | 
1896 | 	/**
1897 | 	 * Parse attribute from current document position
1898 | 	 *
1899 | 	 * @param object $node Node for the attributes
1900 | 	 * @param string $name Name of the current attribute
1901 | 	 * @param array $space Array for spacing information
1902 | 	 * @return void
1903 | 	 */
1904 | 	protected function parse_attr($node, $name, &$space)
1905 | 	{
1906 | 		// Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
1907 | 		// If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
1908 | 		// https://stackoverflow.com/a/26341866
1909 | 		if (isset($node->attr[$name]))
1910 | 		{
1911 | 			return;
1912 | 		}
1913 | 
1914 | 		$space[2] = $this->copy_skip($this->token_blank); // [2] Whitespace between "=" and the value
1915 | 		switch ($this->char) {
1916 | 			case '"': // value is anything between double quotes
1917 | 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1918 | 				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1919 | 				$node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
1920 | 				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1921 | 				break;
1922 | 			case '\'': // value is anything between single quotes
1923 | 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
1924 | 				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1925 | 				$node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
1926 | 				$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1927 | 				break;
1928 | 			default: // value is anything until the first space or end tag
1929 | 				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1930 | 				$node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
1931 | 		}
1932 | 		// PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
1933 | 		$node->attr[$name] = str_replace("\r", "", $node->attr[$name]);
1934 | 		$node->attr[$name] = str_replace("\n", "", $node->attr[$name]);
1935 | 		// PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case.
1936 | 		if ($name == "class") {
1937 | 			$node->attr[$name] = trim($node->attr[$name]);
1938 | 		}
1939 | 	}
1940 | 
1941 | 	/**
1942 | 	 * Link node to parent node
1943 | 	 *
1944 | 	 * @param object $node Node to link to parent
1945 | 	 * @param bool $is_child True if the node is a child of parent
1946 | 	 * @return void
1947 | 	 */
1948 | 	// link node's parent
1949 | 	protected function link_nodes(&$node, $is_child)
1950 | 	{
1951 | 		$node->parent = $this->parent;
1952 | 		$this->parent->nodes[] = $node;
1953 | 		if ($is_child)
1954 | 		{
1955 | 			$this->parent->children[] = $node;
1956 | 		}
1957 | 	}
1958 | 
1959 | 	/**
1960 | 	 * Add tag as text node to current node
1961 | 	 *
1962 | 	 * @param string $tag Tag name
1963 | 	 * @return bool True on success
1964 | 	 */
1965 | 	protected function as_text_node($tag)
1966 | 	{
1967 | 		$node = new simple_html_dom_node($this);
1968 | 		++$this->cursor;
1969 | 		$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
1970 | 		$this->link_nodes($node, false);
1971 | 		$this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1972 | 		return true;
1973 | 	}
1974 | 
1975 | 	/**
1976 | 	 * Seek from the current document position to the first occurrence of a
1977 | 	 * character not defined by the provided string. Update the current document
1978 | 	 * position to the new position.
1979 | 	 *
1980 | 	 * @param string $chars A string containing every allowed character.
1981 | 	 * @return void
1982 | 	 */
1983 | 	protected function skip($chars)
1984 | 	{
1985 | 		$this->pos += strspn($this->doc, $chars, $this->pos);
1986 | 		$this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1987 | 	}
1988 | 
1989 | 	/**
1990 | 	 * Copy substring from the current document position to the first occurrence
1991 | 	 * of a character not defined by the provided string.
1992 | 	 *
1993 | 	 * @param string $chars A string containing every allowed character.
1994 | 	 * @return string Substring from the current document position to the first
1995 | 	 * occurrence of a character not defined by the provided string.
1996 | 	 */
1997 | 	protected function copy_skip($chars)
1998 | 	{
1999 | 		$pos = $this->pos;
2000 | 		$len = strspn($this->doc, $chars, $pos);
2001 | 		$this->pos += $len;
2002 | 		$this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
2003 | 		if ($len===0) return '';
2004 | 		return substr($this->doc, $pos, $len);
2005 | 	}
2006 | 
2007 | 	/**
2008 | 	 * Copy substring from the current document position to the first occurrence
2009 | 	 * of any of the provided characters.
2010 | 	 *
2011 | 	 * @param string $chars A string containing every character to stop at.
2012 | 	 * @return string Substring from the current document position to the first
2013 | 	 * occurrence of any of the provided characters.
2014 | 	 */
2015 | 	protected function copy_until($chars)
2016 | 	{
2017 | 		$pos = $this->pos;
2018 | 		$len = strcspn($this->doc, $chars, $pos);
2019 | 		$this->pos += $len;
2020 | 		$this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
2021 | 		return substr($this->doc, $pos, $len);
2022 | 	}
2023 | 
2024 | 	/**
2025 | 	 * Copy substring from the current document position to the first occurrence
2026 | 	 * of the provided string.
2027 | 	 *
2028 | 	 * @param string $char The string to stop at.
2029 | 	 * @return string Substring from the current document position to the first
2030 | 	 * occurrence of the provided string.
2031 | 	 */
2032 | 	protected function copy_until_char($char)
2033 | 	{
2034 | 		if ($this->char===null) return '';
2035 | 
2036 | 		if (($pos = strpos($this->doc, $char, $this->pos))===false) {
2037 | 			$ret = substr($this->doc, $this->pos, $this->size-$this->pos);
2038 | 			$this->char = null;
2039 | 			$this->pos = $this->size;
2040 | 			return $ret;
2041 | 		}
2042 | 
2043 | 		if ($pos===$this->pos) return '';
2044 | 		$pos_old = $this->pos;
2045 | 		$this->char = $this->doc[$pos];
2046 | 		$this->pos = $pos;
2047 | 		return substr($this->doc, $pos_old, $pos-$pos_old);
2048 | 	}
2049 | 
2050 | 	/**
2051 | 	 * Remove noise from HTML content
2052 | 	 *
2053 | 	 * Noise is stored to {@see simple_html_dom::$noise}
2054 | 	 *
2055 | 	 * @param string $pattern The regex pattern used for finding noise
2056 | 	 * @param bool $remove_tag True to remove the entire match. Default is false
2057 | 	 * to only remove the captured data.
2058 | 	 */
2059 | 	protected function remove_noise($pattern, $remove_tag=false)
2060 | 	{
2061 | 		global $debug_object;
2062 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2063 | 
2064 | 		$count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
2065 | 
2066 | 		for ($i=$count-1; $i>-1; --$i)
2067 | 		{
2068 | 			$key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
2069 | 			if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }
2070 | 			$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2071 | 			$this->noise[$key] = $matches[$i][$idx][0];
2072 | 			$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
2073 | 		}
2074 | 
2075 | 		// reset the length of content
2076 | 		$this->size = strlen($this->doc);
2077 | 		if ($this->size>0)
2078 | 		{
2079 | 			$this->char = $this->doc[0];
2080 | 		}
2081 | 	}
2082 | 
2083 | 	/**
2084 | 	 * Restore noise to HTML content
2085 | 	 *
2086 | 	 * Noise is restored from {@see simple_html_dom::$noise}
2087 | 	 *
2088 | 	 * @param string $text A subset of HTML containing noise
2089 | 	 * @return string The same content with noise restored
2090 | 	 */
2091 | 	function restore_noise($text)
2092 | 	{
2093 | 		global $debug_object;
2094 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2095 | 
2096 | 		while (($pos=strpos($text, '___noise___'))!==false)
2097 | 		{
2098 | 			// Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
2099 | 			if (strlen($text) > $pos+15)
2100 | 			{	// todo: "___noise___1000" (or any number with four or more digits) in the DOM causes an infinite loop which could be utilized by malicious software
2101 | 				$key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
2102 | 				if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }
2103 | 
2104 | 				if (isset($this->noise[$key]))
2105 | 				{
2106 | 					$text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
2107 | 				}
2108 | 				else
2109 | 				{
2110 | 					// do this to prevent an infinite loop.
2111 | 					$text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16);
2112 | 				}
2113 | 			}
2114 | 			else
2115 | 			{
2116 | 				// There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
2117 | 				$text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
2118 | 			}
2119 | 		}
2120 | 		return $text;
2121 | 	}
2122 | 
2123 | 	// Sometimes we NEED one of the noise elements.
2124 | 	function search_noise($text)
2125 | 	{
2126 | 		global $debug_object;
2127 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2128 | 
2129 | 		foreach($this->noise as $noiseElement)
2130 | 		{
2131 | 			if (strpos($noiseElement, $text)!==false)
2132 | 			{
2133 | 				return $noiseElement;
2134 | 			}
2135 | 		}
2136 | 	}
2137 | 	function __toString()
2138 | 	{
2139 | 		return $this->root->innertext();
2140 | 	}
2141 | 
2142 | 	function __get($name)
2143 | 	{
2144 | 		switch ($name)
2145 | 		{
2146 | 			case 'outertext':
2147 | 				return $this->root->innertext();
2148 | 			case 'innertext':
2149 | 				return $this->root->innertext();
2150 | 			case 'plaintext':
2151 | 				return $this->root->text();
2152 | 			case 'charset':
2153 | 				return $this->_charset;
2154 | 			case 'target_charset':
2155 | 				return $this->_target_charset;
2156 | 		}
2157 | 	}
2158 | 
2159 | 	// camel naming conventions
2160 | 	function childNodes($idx=-1) {return $this->root->childNodes($idx);}
2161 | 	function firstChild() {return $this->root->first_child();}
2162 | 	function lastChild() {return $this->root->last_child();}
2163 | 	function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}
2164 | 	function createTextNode($value) {return @end(str_get_html($value)->nodes);}
2165 | 	function getElementById($id) {return $this->find("#$id", 0);}
2166 | 	function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
2167 | 	function getElementByTagName($name) {return $this->find($name, 0);}
2168 | 	function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
2169 | 	function loadFile() {$args = func_get_args();$this->load_file($args);}
2170 | }
2171 | 
2172 | ?>


--------------------------------------------------------------------------------