├── JSLikeHTMLElement.php
├── README.md
└── Readability.php


/JSLikeHTMLElement.php:
--------------------------------------------------------------------------------
<?php
/**
 * JavaScript-like HTML DOM element.
 *
 * Extends PHP's DOMElement with a virtual `innerHTML` property that can be
 * read and assigned in the same way it is done in JavaScript.
 *
 * Example usage:
 * @code
 * $doc = new DOMDocument();
 * $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 * $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
 * $elem = $doc->getElementsByTagName('div')->item(0);
 *
 * // print innerHTML
 * echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
 * echo "\n\n";
 *
 * // set innerHTML
 * $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
 * echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
 * echo "\n\n";
 *
 * // print document (with our changes)
 * echo $doc->saveXML();
 * @endcode
 *
 * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
 * @see http://fivefilters.org (the project this was written for)
 */
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * The element is emptied first. The new markup is then parsed as
     * well-formed XML; if that fails it is parsed again with the lenient
     * HTML parser. Assigning any other property raises an E_USER_NOTICE.
     *
     * @param string $name  property being assigned (only 'innerHTML' is supported)
     * @param mixed  $value new inner markup; cast to string
     * @return void
     */
    public function __set($name, $value)
    {
        if ($name !== 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // first, empty the element
        while ($this->firstChild !== null) {
            $this->removeChild($this->firstChild);
        }

        // $value holds our new inner HTML
        $value = (string) $value;
        if ($value === '') {
            return;
        }

        $fragment = $this->ownerDocument->createDocumentFragment();
        // appendXML() expects well-formed markup (XHTML); @ hides the libxml
        // warnings raised for anything else, which the fallback below handles.
        if (@$fragment->appendXML($value)) {
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }
            return;
        }

        // $value is probably ill-formed: let the HTML parser repair it.
        // Non-ASCII characters are turned into numeric entities so that
        // loadHTML() does not misread UTF-8 input as ISO-8859-1.
        $encoded = mb_encode_numericentity($value, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        $html = new DOMDocument();
        // Using <htmlfragment> will generate a warning, but so will bad HTML
        // (and by this point, bad HTML is what we've got).
        // We use it (and suppress the warning) because an HTML fragment will
        // be wrapped in <html><body> tags which we don't really want to keep.
        if (!@$html->loadHTML('<htmlfragment>'.$encoded.'</htmlfragment>')) {
            // oh well, we tried, we really did. :(
            // this element is now empty
            return;
        }

        $wrapper = $html->getElementsByTagName('htmlfragment')->item(0);
        if ($wrapper === null) {
            return;
        }
        foreach ($wrapper->childNodes as $child) {
            // nodes belong to the temporary document, so import a deep copy
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * @param string $name property being read (only 'innerHTML' is supported)
     * @return string|null serialised child nodes, or null (after an
     *                     E_USER_NOTICE) for any other property
     */
    public function __get($name)
    {
        if ($name === 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }
            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        return null;
    }

    /**
     * @return string the tag name in square brackets, e.g. "[div]"
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
?>

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
php-readability
===============

PHP Readability fork from http://code.fivefilters.org/p/php-readability/

the getContent() function return just text without HTML tags.
7 | 8 | Usage 9 | ----- 10 | 11 | $doc = new Readability(); 12 | $doc->input('http://google.com'); 13 | $doc->init(); 14 | $content = $doc->getContent(); -------------------------------------------------------------------------------- /Readability.php: -------------------------------------------------------------------------------- 1 | init(); 59 | // echo $r->articleContent->innerHTML; 60 | 61 | 62 | class Readability 63 | { 64 | public $version = '1.7.1-without-multi-page'; 65 | public $convertLinksToFootnotes = false; 66 | public $revertForcedParagraphElements = true; 67 | public $articleTitle; 68 | public $articleContent; 69 | public $dom; 70 | public $url = null; // optional - URL where HTML was retrieved 71 | public $debug = false; 72 | protected $body = null; // 73 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 74 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 75 | protected $success = false; // indicates whether we were able to extract or not 76 | 77 | /** 78 | * All of the regular expressions in use within readability. 79 | * Defined up here so we don't instantiate them repeatedly in loops. 
/**
 * All of the regular expressions in use within readability.
 * Defined up here so we don't instantiate them repeatedly in loops.
 **/
public $regexps = array(
    'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
    'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
    'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
    'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
    'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
    'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
    'replaceFonts' => '/<(\/?)font[^>]*>/i',
    // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
    'normalize' => '/\s{2,}/',
    'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
    'video' => '/http:\/\/(www\.)?(youtube|vimeo)\.com/i',
    'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);

/* constants: bits of the $flags mask, one per extraction heuristic */
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_CLASSES = 2;
const FLAG_CLEAN_CONDITIONALLY = 4;

/**
 * Create instance of Readability.
 *
 * Takes no arguments; the page to process is supplied afterwards
 * through input().
 */
public function __construct()
{
}

/**
 * Download a URL with cURL and return the response body.
 *
 * @param string $url address to fetch
 * @return string|false the body, or false when the transfer failed or
 *                      returned nothing
 */
public function _get_curl($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): certificate verification is switched off, so HTTPS
    // responses can be tampered with in transit. Kept as-is because callers
    // may rely on fetching hosts with broken certificates - confirm and remove.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    $data = curl_exec($ch);

    if ($data === false) {
        $this->dbg('cURL request failed: ' . curl_error($ch));
    }

    curl_close($ch);

    if ($data === false || $data === '') {
        return false;
    }

    return $data;
}

/**
 * Fetch a page and parse it into $this->dom, ready for init().
 *
 * Any state left over from a previous run is discarded. A failed download
 * is handled like an empty page, so init() will then report failure.
 *
 * @param string|null $url address of the page to process
 * @return false|null false when $url is empty; no value otherwise
 */
public function input($url = null)
{
    if (empty($url)) {
        return false;
    }

    $html = $this->_get_curl($url);
    if ($html === false) {
        $html = '';
    }

    // Use the charset the page declares (meta tag), defaulting to UTF-8.
    $charset = 'UTF-8';
    if (preg_match('/charset=["\']?([\w\-]+)/i', $html, $match)) {
        $charset = $match[1];
    }

    // Convert to UTF-8. An unknown charset makes mbstring throw (PHP 8) or
    // warn and return false (older versions); fall back to UTF-8 in both cases.
    try {
        $utf8 = @mb_convert_encoding($html, 'UTF-8', $charset);
    } catch (ValueError $e) {
        $utf8 = false;
    }
    if ($utf8 === false) {
        $utf8 = mb_convert_encoding($html, 'UTF-8', 'UTF-8');
    }
    // DOMDocument::loadHTML() assumes ISO-8859-1 unless told otherwise, so
    // ship every non-ASCII character as a numeric entity.
    $html = mb_encode_numericentity($utf8, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    // Reset everything a previous input()/init() cycle may have set.
    $this->dom = null;
    $this->articleTitle = null;
    $this->articleContent = null;
    $this->url = null;
    $this->body = null;
    $this->bodyCache = null;
    $this->success = false;
    $this->flags = 7;

    /* Turn all double br's into p's */
    $replaced = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }
    /* Turn font tags into spans */
    $replaced = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }

    $this->dom = new DOMDocument();
    $this->dom->preserveWhiteSpace = false;
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
    if (trim($html) === '') {
        // loadHTML() rejects an empty string
        $html = '<html></html>';
    }
    // @: real-world markup makes libxml emit a warning per parse error
    @$this->dom->loadHTML($html);
    $this->url = $url;
}

/**
 * Get article title element
 *
 * @return DOMElement|null the element stored by init(), or null before
 *                         init() has run
 */
public function getTitle()
{
    return $this->articleTitle;
}

/**
 * Get the article content as plain text.
 *
 * Tags are stripped from the extracted content and every run of
 * whitespace is collapsed into a single space.
 *
 * @return string the text, or '' when no content has been extracted yet
 */
public function getContent()
{
    if (!is_object($this->articleContent)) {
        return '';
    }

    $text = trim(strip_tags($this->articleContent->innerHTML));

    return preg_replace('/\s+/', ' ', $text);
}
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Build readability's DOM tree.
 *  3. Grab the article content from the current dom tree.
 *  4. Replace the current DOM tree with the new one.
 *  5. Read peacefully.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Assume successful outcome
    $this->success = true;

    $bodyElems = $this->dom->getElementsByTagName('body');
    if ($bodyElems->length > 0) {
        if ($this->bodyCache === null) {
            // kept so grabArticle() can restore the body before a retry
            $this->bodyCache = $bodyElems->item(0)->innerHTML;
        }
        if ($this->body === null) {
            $this->body = $bodyElems->item(0);
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $innerDiv = $this->dom->createElement('div');
    $articleTitle = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv->setAttribute('id', 'readInner');

    /* Glue the structure of our document together. */
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Set title and content instance variables
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}

/**
 * Debug: echo a message followed by an HTML line break when $this->debug is on.
 *
 * @param string $msg
 * @return void
 */
protected function dbg($msg)
{
    if ($this->debug) {
        echo '* ', $msg, '<br />', "\n";
    }
}

/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts links to footnotes, when that option is
 * enabled and the page URL does not contain wikipedia.org.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent)
{
    if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', (string) $this->url)) {
        $this->addFootnotes($articleContent);
    }
}

/**
 * Get the article title as an H1.
 *
 * Starts from the <title> text and tries to cut off a site name separated
 * by " | ", " - " or ": ". A very long or very short title is replaced by
 * the page's only <h1>, if there is exactly one. If the result has four
 * words or fewer, the original <title> text is used unchanged.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    // getInnerText() returns '' when the document has no <title>
    $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    $curTitle = $origTitle;

    if (preg_match('/ [\|\-] /', $curTitle)) {
        // keep what precedes the last separator ...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ... unless that leaves fewer than three words; then keep what follows the first one
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } else if (strpos($curTitle, ': ') !== false) {
        // keep what follows the last colon ...
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        // ... or, if too short, what follows the first colon
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
        }
    } else if (strlen($curTitle) > 150 || strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');
        if ($hOnes->length == 1) {
            $curTitle = $this->getInnerText($hOnes->item(0));
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    // The title is plain text, so add it as a text node; assigning it to
    // innerHTML would parse any "&" or "<" in it as markup.
    $articleTitle->appendChild($this->dom->createTextNode($curTitle));

    return $articleTitle;
}
/**
 * Prepare the HTML document for readability to scrape it.
 * This includes things like stripping javascript, CSS, and handling terrible markup.
 *
 * @return void
 **/
protected function prepDocument()
{
    /**
     * In some cases a body element can't be found (if the HTML is totally hosed for example)
     * so we create a new body node and append it to the document.
     */
    if ($this->body === null) {
        $this->body = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($this->body);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags (walk backwards: the node list is live) */
    $styleTags = $this->dom->getElementsByTagName('style');
    for ($i = $styleTags->length - 1; $i >= 0; $i--) {
        $styleTag = $styleTags->item($i);
        $styleTag->parentNode->removeChild($styleTag);
    }

    /* Turn all double br's into p's */
    // The JavaScript original does this here:
    //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
    // In PHP it is done in input(), where we still have the raw HTML - before parsing it into a DOM tree.
}

/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link gets a superscript "[n]" reference placed after it, and a
 * matching entry in a "References" list appended to $articleContent. Links
 * carrying the class readability-DoNotFootnote, or whose text matches the
 * 'skipFootnoteLink' pattern, are left alone.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below show up in it as well;
    // their readability-DoNotFootnote class makes the loop skip them.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++) {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $footnoteLink = $articleLink->cloneNode(true);
        $linkDomain = parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) {
            // relative link: fall back to the host of the page itself
            $linkDomain = parse_url($this->url, PHP_URL_HOST);
        }

        $linkCount++;

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');
        // insertBefore() with a null reference node appends, which covers
        // the case where the link is its parent's last child
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        /** Build the footnote: back-link, the link itself, then its domain */
        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Label the footnote link with its title (or text) as plain text.
        $label = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        while ($footnoteLink->firstChild !== null) {
            $footnoteLink->removeChild($footnoteLink->firstChild);
        }
        $footnoteLink->appendChild($this->dom->createTextNode($label));
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}

/**
 * Reverts P elements with class 'readability-styled'
 * to text nodes - which is what they were before.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function revertReadabilityStyledElements($articleContent)
{
    $document = $articleContent->ownerDocument;
    $xpath = new DOMXPath($document);
    $styled = $xpath->query('.//p[@class="readability-styled"]', $articleContent);

    for ($i = $styled->length - 1; $i >= 0; $i--) {
        $paragraph = $styled->item($i);
        $paragraph->parentNode->replaceChild($document->createTextNode($paragraph->textContent), $paragraph);
    }
}

/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function prepArticle($articleContent)
{
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if ($articleContent->getElementsByTagName('h2')->length == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: no text and no img/embed/object inside */
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);
        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;

        if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    /* Drop a <br> that sits directly in front of a paragraph */
    try {
        $cleaned = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        if ($cleaned !== null) {
            $articleContent->innerHTML = $cleaned;
        }
    } catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}

/**
 * Initialize a node with the readability object. Also checks the
 * className/id for special names to add to its score.
 *
 * The score is kept in a 'readability' attribute on the node itself.
 *
 * @param DOMElement $node
 * @return void
 **/
protected function initializeNode($node)
{
    $readability = $this->dom->createAttribute('readability');
    $readability->value = '0'; // this is our contentScore
    $node->setAttributeNode($readability);

    $score = 0;
    switch (strtoupper($node->tagName)) {
        case 'DIV':
            $score += 5;
            break;

        case 'PRE':
        case 'TD':
        case 'BLOCKQUOTE':
            $score += 3;
            break;

        case 'ADDRESS':
        case 'OL':
        case 'UL':
        case 'DL':
        case 'DD':
        case 'DT':
        case 'LI':
        case 'FORM':
            $score -= 3;
            break;

        case 'H1':
        case 'H2':
        case 'H3':
        case 'H4':
        case 'H5':
        case 'H6':
        case 'TH':
            $score -= 5;
            break;
    }
    $score += $this->getClassWeight($node);

    $readability->value = (string) $score;
}

/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * When the result holds fewer than 250 characters of text, the body is
 * restored from the cache and the method calls itself again with one more
 * heuristic flag switched off.
 *
 * @param DOMDocument|DOMElement|null $page where to look; defaults to the whole document
 * @return DOMElement|false the content wrapper, or false once every flag has been tried
 **/
protected function grabArticle($page = null)
{
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) {
        $page = $this->dom;
    }
    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list, so the index is stepped back whenever the
     * current node is removed or replaced.
     **/
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            ) {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Revisit this index: it now holds the new <p>, which the
                    // check above then queues for scoring. The detached <div>
                    // is not queued - it has no parent and could never score.
                    $nodeIndex--;
                } catch (Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            } else {
                /* EXPERIMENTAL */
                // Wrap loose text in <p class="readability-styled"> so it can be scored;
                // revertReadabilityStyledElements() turns these back into text nodes.
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == XML_TEXT_NODE) {
                        $p = $this->dom->createElement('p');
                        // plain text: add it as a text node so "&" and "<" are not parsed as markup
                        $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodeToScore);

        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if (strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability')) {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        /* Add a point for the paragraph itself as a base. */
        $contentScore = 1;

        /* Add points for any commas within this paragraph */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore / 2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    foreach ($candidates as $candidate) {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidate->getAttributeNode('readability');
        $readability->value = $readability->value * (1 - $this->getLinkDensity($candidate));

        $this->dbg('Candidate: ' . $candidate->tagName . ' (' . $candidate->getAttribute('class') . ':' . $candidate->getAttribute('id') . ') with score ' . $readability->value);

        if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidate;
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (isset($page->documentElement)) {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
            // else: we don't have a body either? what a mess! :)
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);

    // A fallback <div> that could not be attached anywhere has no parent,
    // and therefore no siblings to look at.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $sl = $siblingNodes ? $siblingNodes->length : 0;

    for ($s = 0; $s < $sl; $s++) {
        $siblingNode = $siblingNodes->item($s);
        $isElement = ($siblingNode->nodeType === XML_ELEMENT_NODE);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($isElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode->isSameNode($topCandidate)) {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the example same classname */
        if ($isElement && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($isElement && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25) {
                $append = true;
            } else if ($nodeLength < 80 && $linkDensity == 0 && preg_match('/\.( |$)/', $nodeContent)) {
                $append = true;
            }
        }

        if (!$append) {
            continue;
        }

        $this->dbg('Appending node: ' . $siblingNode->nodeName);

        $nodeToAppend = null;
        $sibNodeName = strtoupper($siblingNode->nodeName);
        if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
            /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
            $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
            $nodeToAppend = $this->dom->createElement('div');
            try {
                $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                $nodeToAppend->innerHTML = $siblingNode->innerHTML;
            } catch (Exception $e) {
                $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }
        } else {
            // Appending moves the node out of the live sibling list, so step
            // the index and the length back by one.
            $nodeToAppend = $siblingNode;
            $s--;
            $sl--;
        }

        /* To ensure a node does not interfere with readability styles, remove its classnames */
        $nodeToAppend->removeAttribute('class');

        /* Append sibling and subtract from our list because it removes the node when you append to another node */
        $articleContent->appendChild($nodeToAppend);
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250) {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) {
            $this->body = $this->dom->createElement('body');
        }
        $this->body->innerHTML = $this->bodyCache;

        // Switch off the first heuristic that is still active and try again.
        $retryFlags = array(self::FLAG_STRIP_UNLIKELYS, self::FLAG_WEIGHT_CLASSES, self::FLAG_CLEAN_CONDITIONALLY);
        foreach ($retryFlags as $flag) {
            if ($this->flagIsActive($flag)) {
                $this->removeFlag($flag);
                return $this->grabArticle($this->body);
            }
        }

        return false;
    }

    return $articleContent;
}

/**
 * Remove script tags from document
 *
 * @param DOMDocument|DOMElement $doc
 * @return void
 */
public function removeScripts($doc)
{
    $scripts = $doc->getElementsByTagName('script');
    // backwards, because the node list shrinks as scripts are removed
    for ($i = $scripts->length - 1; $i >= 0; $i--) {
        $script = $scripts->item($i);
        $script->parentNode->removeChild($script);
    }
}

/**
 * Get the inner text of a node.
 * This also strips out any excess whitespace to be found.
 *
 * @param DOMNode|null $e node to read; anything without text yields ''
 * @param boolean $normalizeSpaces (default: true) collapse runs of two or
 *                more whitespace characters into one space
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces = true)
{
    if (!isset($e->textContent) || $e->textContent === '') {
        return '';
    }

    $textContent = trim($e->textContent);

    if ($normalizeSpaces) {
        return preg_replace($this->regexps['normalize'], ' ', $textContent);
    }

    return $textContent;
}
Default is ","
     * @return int number of non-overlapping occurrences of $s in the node's inner text
     **/
    public function getCharCount($e, $s=',') {
        // Counting is done on the text returned by getInnerText() with its default arguments.
        return substr_count($this->getInnerText($e), $s);
    }

    /**
     * Remove the style attribute from $e itself and from every element beneath it.
     *
     * @param DOMElement $e root of the subtree to clean; non-object values are ignored
     * @return void
     */
    public function cleanStyles($e) {
        if (!is_object($e)) return;

        // getElementsByTagName('*') only yields descendants, so the root has to be
        // handled separately. The type check keeps a DOMDocument argument working,
        // since documents have no removeAttribute() method.
        if ($e instanceof DOMElement) {
            $e->removeAttribute('style');
        }

        foreach ($e->getElementsByTagName('*') as $descendant) {
            $descendant->removeAttribute('style');
        }
    }

    /**
     * Get the density of links as a fraction of the content: the length of the
     * text inside "a" descendants divided by the length of all text in the node.
     *
     * Both lengths are byte counts (strlen) of the text returned by getInnerText().
     *
     * @param DOMElement $e
     * @return number quotient of the two lengths, or integer 0 when the node has no text
     */
    public function getLinkDensity($e) {
        $textLength = strlen($this->getInnerText($e));
        if ($textLength === 0) {
            // Must stay an integer 0: grabArticle compares the result with "=== 0".
            return 0;
        }

        $linkLength = 0;
        foreach ($e->getElementsByTagName('a') as $link) {
            $linkLength += strlen($this->getInnerText($link));
        }

        // With two integer operands "/" yields an int when the division is exact
        // (e.g. 0 / n), and a float otherwise.
        return $linkLength / $textLength;
    }

    /**
     * Get an elements class/id weight. Uses regular expressions to tell if this
     * element looks good or bad.
*
     * The class and id attributes are each tested against the 'negative' and
     * 'positive' patterns; every match moves the weight by 25 in that direction.
     *
     * @param DOMElement $e
     * @return int resulting weight; always 0 when FLAG_WEIGHT_CLASSES is not active
     */
    public function getClassWeight($e) {
        if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            return 0;
        }

        $weight = 0;

        /* Look for a special classname, then for a special ID: both are scored the same way. */
        foreach (array('class', 'id') as $attribute) {
            if (!$e->hasAttribute($attribute)) {
                continue;
            }
            $value = $e->getAttribute($attribute);
            if ($value == '') {
                continue;
            }
            if (preg_match($this->regexps['negative'], $value)) {
                $weight -= 25;
            }
            if (preg_match($this->regexps['positive'], $value)) {
                $weight += 25;
            }
        }

        return $weight;
    }

    /**
     * Remove extraneous break tags from a node.
     *
     * Every match of the 'killBreaks' pattern in the node's markup is replaced by
     * a single break tag, and the result is written back through innerHTML.
     *
     * @param DOMElement $node
     * @return void
     */
    public function killBreaks($node) {
        // NOTE(review): the replacement literal was lost when this listing was
        // extracted (markup was stripped from it); '<br />' is what the method's
        // purpose calls for - confirm against the repository copy.
        $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
    }

    /**
     * Clean a node of all elements of type "tag".
     * (Unless it's a youtube/vimeo video. People love movies.)
     *
     * For 'object' and 'embed' tags an element is kept when either its attribute
     * values or its inner markup match the 'video' pattern.
     *
     * @param DOMElement $e
     * @param string $tag
     * @return void
     */
    public function clean($e, $tag) {
        $targetList = $e->getElementsByTagName($tag);
        $isEmbed = ($tag == 'object' || $tag == 'embed');

        // Walk the live list backwards so that removals never shift an index
        // that still has to be visited.
        for ($y = $targetList->length - 1; $y >= 0; $y--) {
            $target = $targetList->item($y);

            /* Allow youtube and vimeo videos through as people usually want to see those. */
            if ($isEmbed) {
                $attributeValues = '';
                foreach ($target->attributes as $attribute) {
                    $attributeValues .= $attribute->value . '|';
                }

                /* First check the element's own attributes, then the elements inside it. */
                if (preg_match($this->regexps['video'], $attributeValues)
                    || preg_match($this->regexps['video'], $target->innerHTML)) {
                    continue;
                }
            }

            $target->parentNode->removeChild($target);
        }
    }

    /**
     * Clean an element of all tags of type "tag" if they look fishy.
     * "Fishy" is an algorithm based on content length, classnames,
     * link density, number of images & embeds, etc.
     *
     * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active.
     *
     * @param DOMElement $e
     * @param string $tag
     * @return void
     */
    public function cleanConditionally($e, $tag) {
        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            return;
        }

        $tagsList = $e->getElementsByTagName($tag);

        /**
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
         *
         * TODO: Consider taking into account original contentScore here.
         */
        for ($i = $tagsList->length - 1; $i >= 0; $i--) {
            $node = $tagsList->item($i);
            $hasScore = $node->hasAttribute('readability');

            $weight = $this->getClassWeight($node);
            $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

            $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

            if ($weight + $contentScore < 0) {
                $node->parentNode->removeChild($node);
                continue;
            }

            if ($this->getCharCount($node, ',') >= 10) {
                // Plenty of commas: treated as real content and kept.
                continue;
            }

            /**
             * If there are not very many commas, and the number of
             * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
             **/
            $p = $node->getElementsByTagName('p')->length;
            $img = $node->getElementsByTagName('img')->length;
            // The list-item count is offset by 100 before it is compared with $p.
            $li = $node->getElementsByTagName('li')->length - 100;
            $input = $node->getElementsByTagName('input')->length;

            // Only embeds whose src matches the 'video' pattern are counted.
            $embedCount = 0;
            foreach ($node->getElementsByTagName('embed') as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }

            $linkDensity = $this->getLinkDensity($node);
            $contentLength = strlen($this->getInnerText($node));

            // Any single condition is enough to drop the element.
            $toRemove = ($img > $p)
                || ($li > $p && $tag != 'ul' && $tag != 'ol')
                || ($input > floor($p / 3))
                || ($contentLength < 25 && ($img === 0 || $img > 2))
                || ($weight < 25 && $linkDensity > 0.2)
                || ($weight >= 25 && $linkDensity > 0.5)
                || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);

            if ($toRemove) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    /**
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     *
     * Only h1 and h2 elements are examined. A header is removed when its class
     * weight is negative or its link density exceeds 0.33.
     *
     * @param DOMElement $e
     * @return void
     */
    public function cleanHeaders($e) {
        for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
            $headers = $e->getElementsByTagName('h' . $headerIndex);
            // Backwards, because removing from a live list shifts later indexes.
            for ($i = $headers->length - 1; $i >= 0; $i--) {
                $header = $headers->item($i);
                if ($this->getClassWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
                    $header->parentNode->removeChild($header);
                }
            }
        }
    }

    /**
     * Tell whether any bit of $flag is set in the current flags bitmask.
     *
     * @param int $flag
     * @return boolean
     */
    public function flagIsActive($flag) {
        return ($this->flags & $flag) > 0;
    }

    /**
     * Set the bit(s) of $flag in the flags bitmask.
     *
     * @param int $flag
     * @return void
     */
    public function addFlag($flag) {
        $this->flags = $this->flags | $flag;
    }

    /**
     * Clear the bit(s) of $flag in the flags bitmask.
     *
     * @param int $flag
     * @return void
     */
    public function removeFlag($flag) {
        $this->flags = $this->flags & ~$flag;
    }
}
?>
--------------------------------------------------------------------------------