├── .gitignore ├── LICENSE ├── README.md ├── example.php └── src ├── WaifuGenerator.php └── data └── dom.php /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Moe Poi ~ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
/**
 * Picks a random "Waifu Wednesday" entry (name or image URL) scraped from
 * jurnalotaku.com.
 *
 * file_get_html() must already be defined when type() is called.
 *
 * Note: every call to type() downloads a freshly chosen random page and picks
 * a fresh random entry, so two consecutive calls (one for "name", one for
 * "image") are not guaranteed to describe the same entry.
 */
class WaifuGenerator {
    /** @var array Image URLs collected by the last call to type() */
    public $image;
    /** @var array Image alt texts collected by the last call to type(), index-aligned with $image */
    public $name;
    /** @var string Page number (as a string) requested by the last call to type() */
    public $page;
    /** @var string URL requested by the last call to type() */
    public $url;
    /** @var mixed Parsed document returned by file_get_html(), or false on failure */
    public $req;

    /**
     * Fetch a random listing page and return one random entry from it.
     *
     * @param string $type "name" for the entry title, "image" for the image URL.
     * @return string The requested value, or the literal hint "name/image"
     *                when $type is neither "name" nor "image".
     * @throws RuntimeException When the page cannot be downloaded/parsed or
     *                          contains no matching images.
     */
    public function type($type) {
        $this->image = array();
        $this->name = array();

        // Reject unsupported types before doing any network work.
        // Loose comparison is kept on purpose so that the accepted inputs stay
        // exactly what they were before.
        if ($type != "name" && $type != "image") {
            return "name/image";
        }

        $this->page = strval(rand(1, 11));
        $this->url = sprintf('http://jurnalotaku.com/tag/waifu-wednesday/page/%s/', $this->page);
        $this->req = file_get_html($this->url);

        // file_get_html() returns false when the download fails or is too large.
        if (!$this->req) {
            throw new RuntimeException("Unable to load or parse {$this->url}");
        }

        foreach ($this->req->find('div[class=article-wrapper article-tb m-tb]') as $wrapper) {
            foreach ($wrapper->find('div') as $outer) {
                foreach ($outer->find('div') as $inner) {
                    foreach ($inner->find('img') as $img) {
                        // Pushed together so that both arrays share indexes.
                        $this->image[] = $img->src;
                        $this->name[] = $img->alt;
                    }
                }
            }
        }

        $total = count($this->image);
        if ($total === 0) {
            throw new RuntimeException("No entries found on {$this->url}");
        }

        // rand() is inclusive on both ends, so the upper bound is the last index.
        $num = rand(0, $total - 1);

        if ($type == "name") {
            return str_replace("[Waifu Wednesday] ", "", $this->name[$num]);
        }
        return $this->image[$num];
    }
}
-------------------------------------------------------------------------------- /src/data/dom.php: -------------------------------------------------------------------------------- 1 | size is the "real" number of bytes the dom was created from. 18 | * but for most purposes, it's a really good estimation. 19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. 20 | * Allow the user to tell us how much they trust the html. 21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. 22 | * This allows for us to find tags based on the text they contain. 23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. 24 | * Paperg: added parse_charset so that we know about the character set of the source document. 25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the 26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. 27 | * 28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. 29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. 30 | * 31 | * Licensed under The MIT License 32 | * Redistributions of files must retain the above copyright notice. 33 | * 34 | * @author S.C. Chen 35 | * @author John Schlick 36 | * @author Rus Carroll 37 | * @version Rev. 1.7 (214) 38 | * @package PlaceLocalInclude 39 | * @subpackage simple_html_dom 40 | */ 41 | 42 | /** 43 | * All of the Defines for the classes below. 44 | * @author S.C. 
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
define('MAX_FILE_SIZE', 600000);

/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);

// helper functions
// -----------------------------------------------------------------------------

/**
 * Build a DOM object from a file or URL.
 *
 * The arguments $url, $use_include_path, $context, $offset and $maxLen are
 * passed to file_get_contents(); the remaining ones are passed to the
 * simple_html_dom constructor and to its load() method.
 *
 * @param int $maxLen Maximum number of bytes to read; any value <= 0 is
 *                    replaced by MAX_FILE_SIZE.
 * @return object|false The loaded DOM, or false when nothing could be read or
 *                      the content is longer than $maxLen.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);

    if (empty($contents) || strlen($contents) > $maxLen)
    {
        // Release the unused DOM, exactly as str_get_html() does on failure.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}

/**
 * Build a DOM object from an HTML string.
 *
 * @return object|false The loaded DOM, or false when $str is empty or longer
 *                      than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    if (empty($str) || strlen($str) > MAX_FILE_SIZE)
    {
        $dom->clear();
        return false;
    }
    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}

/**
 * Dump a node's tree by delegating to the node's own dump() method.
 *
 * @param object $node      Node to dump.
 * @param bool   $show_attr Forwarded to $node->dump().
 * @param int    $deep      Forwarded to $node->dump().
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Previously $node itself was passed as $show_attr and $deep was dropped.
    $node->dump($show_attr, $deep);
}
118 | * PaperG - added $tag_start to track the start position of the tag in the total byte index 119 | * 120 | * @package PlaceLocalInclude 121 | */ 122 | class simple_html_dom_node 123 | { 124 | /** 125 | * Node type 126 | * 127 | * Default is {@see HDOM_TYPE_TEXT} 128 | * 129 | * @var int 130 | */ 131 | public $nodetype = HDOM_TYPE_TEXT; 132 | 133 | /** 134 | * Tag name 135 | * 136 | * Default is 'text' 137 | * 138 | * @var string 139 | */ 140 | public $tag = 'text'; 141 | 142 | /** 143 | * List of attributes 144 | * 145 | * @var array 146 | */ 147 | public $attr = array(); 148 | 149 | /** 150 | * List of child node objects 151 | * 152 | * @var array 153 | */ 154 | public $children = array(); 155 | public $nodes = array(); 156 | 157 | /** 158 | * The parent node object 159 | * 160 | * @var object|null 161 | */ 162 | public $parent = null; 163 | 164 | // The "info" array - see HDOM_INFO_... for what each element contains. 165 | public $_ = array(); 166 | 167 | /** 168 | * Start position of the tag in the document 169 | * 170 | * @var int 171 | */ 172 | public $tag_start = 0; 173 | 174 | /** 175 | * The DOM object 176 | * 177 | * @var object|null 178 | */ 179 | private $dom = null; 180 | 181 | /** 182 | * Construct new node object 183 | * 184 | * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes} 185 | */ 186 | function __construct($dom) 187 | { 188 | $this->dom = $dom; 189 | $dom->nodes[] = $this; 190 | } 191 | 192 | function __destruct() 193 | { 194 | $this->clear(); 195 | } 196 | 197 | function __toString() 198 | { 199 | return $this->outertext(); 200 | } 201 | 202 | // clean up memory due to php5 circular references memory leak... 
/**
 * Drop the references to the DOM, parent and child nodes.
 *
 * Clean up memory due to php5 circular references memory leak.
 * After this call $nodes and $children are null, not empty arrays.
 */
function clear()
{
    $this->dom = null;
    $this->nodes = null;
    $this->parent = null;
    $this->children = null;
}

/**
 * Echo this node's tag (and optionally its attributes), then recurse into
 * $this->nodes with the depth increased by one.
 *
 * @param bool $show_attr Whether to print the attribute list.
 * @param int  $deep      Current depth, used to build the leading padding.
 */
function dump($show_attr=true, $deep=0)
{
    $lead = str_repeat(' ', $deep);

    echo $lead.$this->tag;
    if ($show_attr && count($this->attr)>0)
    {
        echo '(';
        foreach ($this->attr as $k=>$v)
            echo "[$k]=>\"".$this->$k.'", ';
        echo ')';
    }
    echo "\n";

    if ($this->nodes)
    {
        foreach ($this->nodes as $c)
        {
            $c->dump($show_attr, $deep+1);
        }
    }
}

/**
 * Debugging function to dump a single dom node with a bunch of information about it.
 *
 * @param bool $echo True to echo the description, false to return it.
 * @return string|null The description when $echo is false, otherwise nothing.
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // The inner info belongs to this node; the old code read an undefined
    // $node variable and therefore always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    // children/nodes are null after clear(); count() must not see null.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    return $string;
}

/**
 * Return or set parent node
 *
 * Setting a parent appends this node to the new parent's $nodes and
 * $children lists. It does NOT remove this node from the lists of its
 * previous parent.
 *
 * @param object|null $parent (optional) The parent node, `null` to return
 * the current parent node.
 * @return object|null The parent node
 */
function parent($parent=null)
{
    if ($parent !== null)
    {
        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}

/**
 * @return bool True if the node has at least one child node
 */
function has_child()
{
    return !empty($this->children);
}

/**
 * Get child node at specified index
 *
 * @param int $idx The index of the child node to return, `-1` to return all
 * child nodes.
 * @return object|array|null The child node at the specified index, all child
 * nodes or null if the index is invalid.
 */
function children($idx=-1)
{
    if ($idx===-1)
    {
        return $this->children;
    }
    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
/**
 * Get first child node
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes (including after clear(), when $children is null).
 */
function first_child()
{
    return isset($this->children[0]) ? $this->children[0] : null;
}

/**
 * Get last child node
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes (including after clear(), when $children is null).
 */
function last_child()
{
    if (empty($this->children))
    {
        return null;
    }
    return $this->children[count($this->children)-1];
}

/**
 * Get next sibling node
 *
 * @return object|null The node following this one in the parent's children
 * list, or null if there is no parent, this node is not in that list, or it
 * is the last entry.
 */
function next_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    if (!is_array($siblings))
    {
        return null;
    }

    // Strict search: compare object identity, not structure.
    $idx = array_search($this, $siblings, true);
    if ($idx===false || !isset($siblings[$idx+1]))
    {
        return null;
    }
    return $siblings[$idx+1];
}

/**
 * Get previous sibling node
 *
 * @return object|null The node preceding this one in the parent's children
 * list, or null if there is no parent, this node is not in that list, or it
 * is the first entry.
 */
function prev_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    if (!is_array($siblings))
    {
        return null;
    }

    // When this node is not among its parent's children the old loop ran off
    // the end and returned the parent's LAST child; return null instead, as
    // next_sibling() does.
    $idx = array_search($this, $siblings, true);
    if ($idx===false || !isset($siblings[$idx-1]))
    {
        return null;
    }
    return $siblings[$idx-1];
}
/**
 * Traverse ancestors to the first matching tag.
 *
 * The search starts with this node itself and then follows ->parent.
 *
 * @param string $tag Tag to find
 * @return object|null First matching node in the DOM tree or null if no
 * match was found.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Start by including ourselves in the comparison.
    $returnDom = $this;

    while (!is_null($returnDom))
    {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }

        if ($returnDom->tag == $tag)
        {
            break;
        }
        $returnDom = $returnDom->parent;
    }
    return $returnDom;
}

/**
 * Get node's inner text (everything inside the opening and closing tags)
 *
 * An explicitly assigned inner text wins, then the stored text of a text
 * node; otherwise the outer text of every entry in $nodes is concatenated.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '';
    // $nodes is null after clear(); outertext() and text() guard this too.
    if ($this->nodes)
    {
        foreach ($this->nodes as $n)
        {
            $ret .= $n->outertext();
        }
    }
    return $ret;
}

/**
 * Get node's outer text (everything including the opening and closing tags)
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag (only when this node knows its begin index and the
    // DOM still holds a node there)
    if ($this->dom && isset($this->_[HDOM_INFO_BEGIN]) && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}

/**
 * Get node's plain text (everything excluding all tags)
 *
 * Comment, unknown, script and style nodes yield an empty string. A blank
 * line is inserted before each child "p" node and the DOM's default span
 * text is appended after each child "span" node.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    switch ($this->nodetype)
    {
        case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT: return '';
        case HDOM_TYPE_UNKNOWN: return '';
    }
    if (strcasecmp($this->tag, 'script')===0) return '';
    if (strcasecmp($this->tag, 'style')===0) return '';

    $ret = '';
    // $this->nodes can be NULL here (observed for some span and p tags).
    if (!is_null($this->nodes))
    {
        foreach ($this->nodes as $n)
        {
            // Start paragraph after a blank line
            if ($n->tag == 'p')
            {
                $ret .= "\n\n";
            }

            $ret .= $this->convert_text($n->text());

            // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
            if ($n->tag == "span")
            {
                $ret .= $this->dom->default_span_text;
            }
        }
    }
    return trim($ret);
}

/**
 * Get node's xml text: the inner text with CDATA section markers removed.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}

/**
 * Build node's text with tag: the opening tag with all of its attributes.
 *
 * Text, comment and unknown nodes return their stored text instead.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '<'.$this->tag;
    $i = -1;

    foreach ($this->attr as $key=>$val)
    {
        // $i indexes the per-attribute spacing/quote info, so it must advance
        // even for attributes that are skipped below.
        ++$i;

        // skip removed attribute
        if ($val===null || $val===false)
            continue;

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
        //no value attr: nowrap, checked selected...
        if ($val===true)
            $ret .= $key;
        else {
            switch ($this->_[HDOM_INFO_QUOTE][$i])
            {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
        }
    }
    $ret = $this->dom->restore_noise($ret);
    // The end-space slot may be unset; treat that as "no space".
    $endspace = isset($this->_[HDOM_INFO_ENDSPACE]) ? $this->_[HDOM_INFO_ENDSPACE] : '';
    return $ret . $endspace . '>';
}

/**
 * Find elements by css selector.
 *
 * @param string   $selector  Selector string; comma separated alternatives
 *                            are each evaluated and their results merged.
 * @param int|null $idx       null to return every match, otherwise the index
 *                            of the match to return (negative counts from
 *                            the end).
 * @param bool     $lowercase Forwarded to seek() for case insensitive
 *                            comparison of attribute values.
 * @return array|object|null All matches, the requested match, or null when
 *                           the requested index does not exist.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $selectors = $this->parse_selector($selector);
    if (($count=count($selectors))===0) return array();
    $found_keys = array();

    // find each selector
    for ($c=0; $c<$count; ++$c)
    {
        if (($levels=count($selectors[$c]))===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        $head = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors, no recursive!
        for ($l=0; $l<$levels; ++$l)
        {
            $ret = array();
            foreach ($head as $k=>$v)
            {
                $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                //PaperG - Pass this optional parameter on to the seek function.
                $n->seek($selectors[$c][$l], $ret, $lowercase);
            }
            $head = $ret;
        }

        // Every value is 1, so the union only adds the keys not seen yet.
        $found_keys += $head;
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $k=>$v)
        $found[] = $this->dom->nodes[$k];

    // return nth-element or array
    if (is_null($idx)) return $found;
    if ($idx<0) $idx = count($found) + $idx;
    return (isset($found[$idx])) ? $found[$idx] : null;
}
/**
 * Seek for given conditions.
 *
 * Collects, as keys of $ret, the DOM node indexes below this node that match
 * one parsed selector step.
 *
 * @param array $selector  One step as built by parse_selector():
 *                         array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Passed by reference; matching node indexes are
 *                         added as keys with the value 1.
 * @param bool  $lowercase PaperG - case insensitive testing of the value of
 *                         a selector.
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // Walking past the root leaves $parent null; nothing to add then.
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}

/**
 * Compare a value against a selector pattern.
 *
 * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
 * @param string $pattern Value given in the selector. For '*=' a pattern that
 *                        starts with '/' is used as a complete regular
 *                        expression; otherwise it is wrapped in /.../i
 *                        without escaping.
 * @param string $value   Value taken from the node.
 * @return bool|int Boolean for '=' and '!='; the preg_match() result for the
 *                  other operators; false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($exp) {
        case '=':
            return ($value===$pattern);
        case '!=':
            return ($value!==$pattern);
        case '^=':
            return preg_match("/^".preg_quote($pattern,'/')."/", $value);
        case '$=':
            return preg_match("/".preg_quote($pattern,'/')."$/", $value);
        case '*=':
            // isset() keeps an empty pattern from triggering a string offset error.
            if (isset($pattern[0]) && $pattern[0]=='/') {
                return preg_match($pattern, $value);
            }
            return preg_match("/".$pattern."/i", $value);
    }
    return false;
}

/**
 * Split a selector string into its comma separated alternatives, each being
 * a list of steps of the form array($tag, $key, $val, $exp, $no_key).
 *
 * @param string $selector_string
 * @return array List of alternatives; see seek() for how a step is used.
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attribute, so that it properly finds like google does.
    // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // The "@?" after "\[" means an attribute specifier may start with an @ sign that is NOT captured by the expression.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase; the cast keeps the historical '' result for a
        // missing key without handing null to strtolower()
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string)$key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}

/**
 * Magic getter: attribute values first, then the virtual text properties.
 *
 * @param string $name
 * @return mixed The charset-converted attribute value when the attribute is
 *               set; the matching text for outertext/innertext/plaintext/
 *               xmltext; otherwise whether the attribute key exists at all.
 */
function __get($name)
{
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }
    switch ($name)
    {
        case 'outertext': return $this->outertext();
        case 'innertext': return $this->innertext();
        case 'plaintext': return $this->text();
        case 'xmltext': return $this->xmltext();
        default: return array_key_exists($name, $this->attr);
    }
}

/**
 * Magic setter: overrides outertext/innertext, or sets an attribute.
 *
 * A new attribute also gets default spacing and double-quote info so that
 * makeup() can render it.
 *
 * @param string $name
 * @param mixed  $value
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($name)
    {
        case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
            return $this->_[HDOM_INFO_INNER] = $value;
    }
    if (!isset($this->attr[$name]))
    {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}

/**
 * Magic isset: always true for outertext/innertext/plaintext, otherwise
 * true when the attribute key exists (even with a null value).
 *
 * @param string $name
 * @return bool
 */
function __isset($name)
{
    switch ($name)
    {
        case 'outertext': return true;
        case 'innertext': return true;
        case 'plaintext': return true;
    }
    //no value attr: nowrap, checked selected...
    return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
}

/**
 * Magic unset: removes the attribute when it is set.
 *
 * @param string $name
 */
function __unset($name) {
    if (isset($this->attr[$name]))
        unset($this->attr[$name]);
}

/**
 * PaperG - Function to convert the text from one character set to another if the two sets are not the same.
 *
 * The charsets are read from the DOM (_charset and _target_charset). Text
 * that already is valid UTF-8 is left alone when the target is UTF-8, and a
 * leading or trailing UTF-8 BOM is stripped from UTF-8 output.
 *
 * @param string $text
 * @return string|false The converted text. The iconv() result is returned
 *                      as-is, so a failed conversion yields false.
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        // Cast first: an undetected charset must not reach strtoupper() as null.
        $sourceCharset = strtoupper((string)$this->dom->_charset);
        $targetCharset = strtoupper((string)$this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
        {
            $converted_text = $text;
        }
        else
        {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
955 | * 956 | * @param mixed $str String to be tested 957 | * @return boolean 958 | */ 959 |
    static function is_utf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++)
        {
            $c = ord($str[$i]);
            // Every byte with the high bit set (>= 0x80) must belong to a
            // multi-byte sequence. The previous test "$c > 128" let a lone
            // 0x80 byte through as if it were ASCII.
            if ($c >= 128)
            {
                // The lead byte determines the total length of the sequence.
                if ($c >= 254) return false;
                elseif ($c >= 252) $bits = 6;
                elseif ($c >= 248) $bits = 5;
                elseif ($c >= 240) $bits = 4;
                elseif ($c >= 224) $bits = 3;
                elseif ($c >= 192) $bits = 2;
                else return false; // 0x80-0xBF: continuation byte without a lead byte

                // The sequence must not run past the end of the string.
                if (($i + $bits) > $len) return false;

                // All remaining bytes must be continuation bytes (0x80-0xBF).
                while ($bits > 1)
                {
                    $i++;
                    $b = ord($str[$i]);
                    if ($b < 128 || $b > 191) return false;
                    $bits--;
                }
            }
        }
        return true;
    }

    /**
     * Function to try a few tricks to determine the displayed size of an img on the page.
     * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
     *
     * The 'width' and 'height' attributes of the tag take precedence. A
     * dimension that is still unknown is then taken from a pixel value
     * ("NNpx") in the inline style attribute.
     *
     * @author John Schlick
     * @version April 19 2012
     * @return array|false an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
     */
    function get_display_size()
    {
        if ($this->tag !== 'img')
        {
            return false;
        }

        $size = array('height' => -1, 'width' => -1);

        // See if there is a height or width attribute in the tag itself.
        foreach (array('width', 'height') as $dimension)
        {
            if (isset($this->attr[$dimension]))
            {
                $size[$dimension] = $this->attr[$dimension];
            }
        }

        // Now look for an inline style.
        if (isset($this->attr['style']))
        {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();
            preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
            foreach ($matches as $match) {
                // The value group is greedy and may capture trailing
                // whitespace ("10px ;"), which would defeat the px check below.
                $attributes[$match[1]] = trim($match[2]);
            }

            foreach (array('width', 'height') as $dimension)
            {
                // Only use the style value if the tag attribute did not already provide one.
                if (!isset($attributes[$dimension]) || $size[$dimension] != -1)
                {
                    continue;
                }

                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes[$dimension], -2)) != 'px')
                {
                    continue;
                }

                $proposed = substr($attributes[$dimension], 0, -2);
                // Now make sure that it's an integer and not something stupid.
                // filter_var() returns the integer on success, so compare
                // against false explicitly; otherwise "0px" is rejected.
                if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
                {
                    $size[$dimension] = $proposed;
                }
            }
        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

        return $size;
    }

    // camel naming conventions
    // Thin camelCase aliases that forward to the magic accessors and the
    // snake_case traversal methods of this class.
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
    function hasChildNodes() {return $this->has_child();}
    function nodeName() {return $this->tag;}
    function appendChild($node) {$node->parent($this); return $node;}

}

/** 1107 | * simple html dom parser 1108 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. 1109 | * Paperg - change $size from protected to public so we can easily access it 1110 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
1111 | * 1112 | * @package PlaceLocalInclude 1113 | */ 1114 | class simple_html_dom 1115 | { 1116 | /** 1117 | * The root node of the document 1118 | * 1119 | * @var object 1120 | */ 1121 | public $root = null; 1122 | 1123 | /** 1124 | * List of nodes in the current DOM 1125 | * 1126 | * @var array 1127 | */ 1128 | public $nodes = array(); 1129 | 1130 | /** 1131 | * Callback function to run for each element in the DOM. 1132 | * 1133 | * @var callable|null 1134 | */ 1135 | public $callback = null; 1136 | 1137 | /** 1138 | * Indicates how tags and attributes are matched 1139 | * 1140 | * @var bool When set to **true** tags and attributes will be converted to 1141 | * lowercase before matching. 1142 | */ 1143 | public $lowercase = false; 1144 | 1145 | /** 1146 | * Original document size 1147 | * 1148 | * Holds the original document size. 1149 | * 1150 | * @var int 1151 | */ 1152 | public $original_size; 1153 | 1154 | /** 1155 | * Current document size 1156 | * 1157 | * Holds the current document size. The document size is determined by the 1158 | * string length of ({@see simple_html_dom::$doc}). 
1159 | * 1160 | * _Note_: Using this variable is more efficient than calling `strlen($doc)` 1161 | * 1162 | * @var int 1163 | * */ 1164 | public $size; 1165 | 1166 | /** 1167 | * Current position in the document 1168 | * 1169 | * @var int 1170 | */ 1171 | protected $pos; 1172 | 1173 | /** 1174 | * The document 1175 | * 1176 | * @var string 1177 | */ 1178 | protected $doc; 1179 | 1180 | /** 1181 | * Current character 1182 | * 1183 | * Holds the current character at position {@see simple_html_dom::$pos} in 1184 | * the document {@see simple_html_dom::$doc} 1185 | * 1186 | * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)` 1187 | * 1188 | * @var string 1189 | */ 1190 | protected $char; 1191 | 1192 | protected $cursor; 1193 | 1194 | /** 1195 | * Parent node of the next node detected by the parser 1196 | * 1197 | * @var object 1198 | */ 1199 | protected $parent; 1200 | protected $noise = array(); 1201 | 1202 | /** 1203 | * Tokens considered blank in HTML 1204 | * 1205 | * @var string 1206 | */ 1207 | protected $token_blank = " \t\r\n"; 1208 | 1209 | /** 1210 | * Tokens to identify the equal sign for attributes, stopping either at the 1211 | * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e. 1212 | * "") 1213 | * 1214 | * @var string 1215 | */ 1216 | protected $token_equal = ' =/>'; 1217 | 1218 | /** 1219 | * Tokens to identify the end of a tag name. A tag name either ends on the 1220 | * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t") 1221 | * 1222 | * @var string 1223 | */ 1224 | protected $token_slash = " />\r\n\t"; 1225 | 1226 | /** 1227 | * Tokens to identify the end of an attribute 1228 | * 1229 | * @var string 1230 | */ 1231 | protected $token_attr = ' >'; 1232 | 1233 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 1234 | public $_charset = ''; 1235 | public $_target_charset = ''; 1236 | 1237 | /** 1238 | * Innertext for
elements 1239 | * 1240 | * @var string 1241 | */ 1242 | protected $default_br_text = ""; 1243 | 1244 | /** 1245 | * Suffix for elements 1246 | * 1247 | * @var string 1248 | */ 1249 | public $default_span_text = ""; 1250 | 1251 | /** 1252 | * Defines a list of self-closing tags (Void elements) according to the HTML 1253 | * Specification 1254 | * 1255 | * _Remarks_: 1256 | * - Use `isset()` instead of `in_array()` on array elements to boost 1257 | * performance about 30% 1258 | * - Sort elements by name for better readability! 1259 | * 1260 | * @link https://www.w3.org/TR/html HTML Specification 1261 | * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements 1262 | */ 1263 | protected $self_closing_tags = array( 1264 | 'area'=>1, 1265 | 'base'=>1, 1266 | 'br'=>1, 1267 | 'col'=>1, 1268 | 'embed'=>1, 1269 | 'hr'=>1, 1270 | 'img'=>1, 1271 | 'input'=>1, 1272 | 'link'=>1, 1273 | 'meta'=>1, 1274 | 'param'=>1, 1275 | 'source'=>1, 1276 | 'track'=>1, 1277 | 'wbr'=>1 1278 | ); 1279 | 1280 | /** 1281 | * Defines a list of tags which - if closed - close all optional closing 1282 | * elements within if they haven't been closed yet. (So, an element where 1283 | * neither opening nor closing tag is omissible consistently closes every 1284 | * optional closing element within) 1285 | * 1286 | * _Remarks_: 1287 | * - Use `isset()` instead of `in_array()` on array elements to boost 1288 | * performance about 30% 1289 | * - Sort elements by name for better readability! 1290 | */ 1291 | protected $block_tags = array( 1292 | 'body'=>1, 1293 | 'div'=>1, 1294 | 'form'=>1, 1295 | 'root'=>1, 1296 | 'span'=>1, 1297 | 'table'=>1 1298 | ); 1299 | 1300 | /** 1301 | * Defines elements whose end tag is omissible. 1302 | * 1303 | * * key = Name of an element whose end tag is omissible. 1304 | * * value = Names of elements whose end tag is omissible, that are closed 1305 | * by the current element. 
1306 | * 1307 | * _Remarks_: 1308 | * - Use `isset()` instead of `in_array()` on array elements to boost 1309 | * performance about 30% 1310 | * - Sort elements by name for better readability! 1311 | * 1312 | * **Example** 1313 | * 1314 | * An `li` element’s end tag may be omitted if the `li` element is immediately 1315 | * followed by another `li` element. To do that, add following element to the 1316 | * array: 1317 | * 1318 | * ```php 1319 | * 'li' => array('li'), 1320 | * ``` 1321 | * 1322 | * With this, the following two examples are considered equal. Note that the 1323 | * second example is missing the closing tags on `li` elements. 1324 | * 1325 | * ```html 1326 | *
  • First Item
  • Second Item
1327 | * ``` 1328 | * 1329 | *
  • First Item
  • Second Item
1330 | * 1331 | * ```html 1332 | *
  • First Item
  • Second Item
1333 | * ``` 1334 | * 1335 | *
  • First Item
  • Second Item
1336 | * 1337 | * @var array A two-dimensional array where the key is the name of an 1338 | * element whose end tag is omissible and the value is an array of elements 1339 | * whose end tag is omissible, that are closed by the current element. 1340 | * 1341 | * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags 1342 | * 1343 | * @todo The implementation of optional closing tags doesn't work in all cases 1344 | * because it only considers elements that close other optional closing 1345 | * tags, not taking into account that some (non-blocking) tags should close 1346 | * these optional closing tags. For example, the end tag for "p" is omissible 1347 | * and can be closed by an "address" element, whose end tag is NOT omissible. 1348 | * Currently a "p" element without closing tag stops at the next "p" element 1349 | * or blocking tag, even if it contains other elements. 1350 | * 1351 | * @todo Known sourceforge issue #2977341 1352 | * B tags that are not closed cause us to return everything to the end of 1353 | * the document.
1354 | */ 1355 | protected $optional_closing_tags = array( 1356 | 'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 1357 | 'dd'=>array('dd'=>1, 'dt'=>1), 1358 | 'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element 1359 | 'dt'=>array('dd'=>1, 'dt'=>1), 1360 | 'li'=>array('li'=>1), 1361 | 'optgroup'=>array('optgroup'=>1, 'option'=>1), 1362 | 'option'=>array('optgroup'=>1, 'option'=>1), 1363 | 'p'=>array('p'=>1), 1364 | 'rp'=>array('rp'=>1, 'rt'=>1), 1365 | 'rt'=>array('rp'=>1, 'rt'=>1), 1366 | 'td'=>array('td'=>1, 'th'=>1), 1367 | 'th'=>array('td'=>1, 'th'=>1), 1368 | 'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1), 1369 | ); 1370 |

    /**
     * Creates a DOM object and optionally loads a document into it.
     *
     * @param string|null $str HTML markup, or an "http://" URL / existing
     * file path to load the markup from. Nothing is loaded when empty.
     * @param bool $lowercase Passed on to load()
     * @param bool $forceTagsClosed When false, the list of optional closing
     * tags is emptied (i.e. the markup is trusted)
     * @param string $target_charset Stored in $_target_charset
     * @param bool $stripRN Passed on to load()
     * @param string $defaultBRText Passed on to load()
     * @param string $defaultSpanText Passed on to load()
     * @param int $options Passed on to load()
     */
    function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
    {
        // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
        // This must target $optional_closing_tags (the previous code assigned
        // to a non-existent "optional_closing_array" property, which had no
        // effect) and must happen before the document is loaded below.
        if (!$forceTagsClosed) {
            $this->optional_closing_tags = array();
        }

        if ($str)
        {
            $isUrlOrFile = preg_match("/^http:\/\//i", $str) || is_file($str);
            if ($isUrlOrFile)
            {
                $this->load_file($str);
            }
            else
            {
                $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
            }
        }

        $this->_target_charset = $target_charset;
    }

    /**
     * Releases the DOM's resources via clear() when the object is destroyed.
     */
    function __destruct()
    {
        $this->clear();
    }

    // load html from string 1397 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0) 1398 | { 1399 | global $debug_object; 1400 | 1401 | // prepare 1402 | $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1403 | 1404 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1405 | // Script tags removal now preceeds style tag removal. 1406 | // strip out