├── JSLikeHTMLElement.php ├── README.md └── Readability.php /JSLikeHTMLElement.php: -------------------------------------------------------------------------------- 1 | registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 16 | * $doc->loadHTML('
Para 1
Para 2
Para 1
Para 2
' 21 | * echo "\n\n"; 22 | * 23 | * // set innerHTML 24 | * $elem->innerHTML = 'FiveFilters.org'; 25 | * echo $elem->innerHTML; // prints 'FiveFilters.org' 26 | * echo "\n\n"; 27 | * 28 | * // print document (with our changes) 29 | * echo $doc->saveXML(); 30 | * @endcode 31 | * 32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net 33 | * @see http://fivefilters.org (the project this was written for) 34 | */ 35 | class JSLikeHTMLElement extends DOMElement 36 | { 37 | /** 38 | * Used for setting innerHTML like it's done in JavaScript: 39 | * @code 40 | * $div->innerHTML = 'The story begins...
'; 41 | * @endcode 42 | */ 43 | public function __set($name, $value) { 44 | if ($name == 'innerHTML') { 45 | // first, empty the element 46 | for ($x=$this->childNodes->length-1; $x>=0; $x--) { 47 | $this->removeChild($this->childNodes->item($x)); 48 | } 49 | // $value holds our new inner HTML 50 | if ($value != '') { 51 | $f = $this->ownerDocument->createDocumentFragment(); 52 | // appendXML() expects well-formed markup (XHTML) 53 | $result = @$f->appendXML($value); // @ to suppress PHP warnings 54 | if ($result) { 55 | if ($f->hasChildNodes()) $this->appendChild($f); 56 | } else { 57 | // $value is probably ill-formed 58 | $f = new DOMDocument(); 59 | $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); 60 | // Using', $html); 161 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); 162 | $this->dom = new DOMDocument(); 163 | $this->dom->preserveWhiteSpace = false; 164 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 165 | if (trim($html) == '') $html = ''; 166 | @$this->dom->loadHTML($html); 167 | $this->url = $url; 168 | } 169 | 170 | /** 171 | * Get article title element 172 | * @return DOMElement 173 | */ 174 | public function getTitle() { 175 | return $this->articleTitle; 176 | } 177 | 178 | /** 179 | * Get article content element 180 | * @return DOMElement 181 | */ 182 | public function getContent() { 183 | return preg_replace('/\s+|\t+|\n+/i', ' ', trim(strip_tags($this->articleContent->innerHTML))); 184 | // var_dump(trim(strip_tags($this->articleContent->innerHTML))); 185 | // return trim(strip_tags($this->articleContent->innerHTML)); 186 | } 187 | 188 | /** 189 | * Runs readability. 190 | * 191 | * Workflow: 192 | * 1. Prep the document by removing script tags, css, etc. 193 | * 2. Build readability's DOM tree. 194 | * 3. Grab the article content from the current dom tree. 195 | * 4. Replace the current DOM tree with the new one. 196 | * 5. Read peacefully. 
197 | * 198 | * @return boolean true if we found content, false otherwise 199 | **/ 200 | public function init() 201 | { 202 | if (!isset($this->dom->documentElement)) return false; 203 | $this->removeScripts($this->dom); 204 | //die($this->getInnerHTML($this->dom->documentElement)); 205 | 206 | // Assume successful outcome 207 | $this->success = true; 208 | 209 | $bodyElems = $this->dom->getElementsByTagName('body'); 210 | if ($bodyElems->length > 0) { 211 | if ($this->bodyCache == null) { 212 | $this->bodyCache = $bodyElems->item(0)->innerHTML; 213 | } 214 | if ($this->body == null) { 215 | $this->body = $bodyElems->item(0); 216 | } 217 | } 218 | 219 | $this->prepDocument(); 220 | 221 | //die($this->dom->documentElement->parentNode->nodeType); 222 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); 223 | //die($this->getInnerHTML($this->dom->documentElement)); 224 | 225 | /* Build readability's DOM tree */ 226 | $overlay = $this->dom->createElement('div'); 227 | $innerDiv = $this->dom->createElement('div'); 228 | $articleTitle = $this->getArticleTitle(); 229 | $articleContent = $this->grabArticle(); 230 | 231 | if (!$articleContent) { 232 | $this->success = false; 233 | $articleContent = $this->dom->createElement('div'); 234 | $articleContent->setAttribute('id', 'readability-content'); 235 | $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
'; 236 | } 237 | 238 | $overlay->setAttribute('id', 'readOverlay'); 239 | $innerDiv->setAttribute('id', 'readInner'); 240 | 241 | /* Glue the structure of our document together. */ 242 | $innerDiv->appendChild($articleTitle); 243 | $innerDiv->appendChild($articleContent); 244 | $overlay->appendChild($innerDiv); 245 | 246 | /* Clear the old HTML, insert the new content. */ 247 | $this->body->innerHTML = ''; 248 | $this->body->appendChild($overlay); 249 | //document.body.insertBefore(overlay, document.body.firstChild); 250 | $this->body->removeAttribute('style'); 251 | 252 | $this->postProcessContent($articleContent); 253 | 254 | // Set title and content instance variables 255 | $this->articleTitle = $articleTitle; 256 | $this->articleContent = $articleContent; 257 | 258 | return $this->success; 259 | } 260 | 261 | /** 262 | * Debug 263 | */ 264 | protected function dbg($msg) { 265 | if ($this->debug) echo '* ',$msg, '').replace(readability.regexps.replaceFonts, '<$1span>'); 358 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. 359 | // Manipulating innerHTML as it's done in JS is not possible in PHP. 360 | } 361 | 362 | /** 363 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 364 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php 365 | * 366 | * @return void 367 | **/ 368 | public function addFootnotes($articleContent) { 369 | $footnotesWrapper = $this->dom->createElement('div'); 370 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); 371 | $footnotesWrapper->innerHTML = '
tags, etc.
452 | *
453 | * @param DOMElement
454 | * @return void
455 | */
public function prepArticle($articleContent) {
    // Strip inline styles and collapse runs of breaks before any structural cleaning.
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if ($articleContent->getElementsByTagName('h2')->length == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /*
     * Remove extra paragraphs: a <p> with no img/embed/object descendants and no text.
     * The node list is live, so walk it backwards while removing.
     */
    $paragraphs = $articleContent->getElementsByTagName('p');
    for ($i = $paragraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $paragraphs->item($i);

        $imgCount    = $paragraph->getElementsByTagName('img')->length;
        $embedCount  = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;

        if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    // Drop a break that sits directly in front of a paragraph opening tag.
    try {
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
506 |
/**
 * Give a node its "readability" attribute, which holds the content score.
 *
 * The starting score depends on the tag name, and the class/id weight
 * from getClassWeight() is added on top.
 *
 * @param DOMElement $node
 * @return void
 **/
protected function initializeNode($node) {
    // Base score per (upper-cased) tag name; tags not listed start at 0.
    $baseScores = array(
        'DIV'        => 5,
        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,
        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,
        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttr);

    $tag = strtoupper($node->tagName);
    $base = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;

    $scoreAttr->value = $base + $this->getClassWeight($node);
}
553 |
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * Steps:
 *  1. Walk every element: drop "unlikely" nodes (when FLAG_STRIP_UNLIKELYS is active), collect P/TD/PRE
 *     nodes for scoring, and turn DIVs without block-level children into P elements.
 *  2. Score each collected node and add the score to its parent (full) and grandparent (half).
 *  3. Scale candidate scores by link density and pick the highest.
 *  4. Collect the top candidate plus related siblings into a new div#readability-content.
 *  5. Clean the result with prepArticle(); if fewer than 250 characters of text remain, restore the
 *     cached body and retry with one flag removed, returning false once no flags are left.
 *
 * @param DOMDocument|DOMElement|null $page root to search; defaults to $this->dom
 * @return DOMElement|false
 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list, so the index is stepped back whenever the current node is removed or replaced.
     **/
    $node = null;
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Step back so the new <p> (now at this index) is visited next and queued
                    // for scoring by the P/TD/PRE check above. The detached <div> is not queued:
                    // it has no parent any more and would only be skipped by the scoring loop.
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                // TODO: change these p elements back to text nodes after processing
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        // nodeValue is already-decoded text. Insert it as a text node rather than
                        // through innerHTML, so characters such as "<" and "&" are not re-parsed
                        // as markup.
                        if ($childNode->nodeValue !== '') {
                            $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        }
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    for ($pt=0; $pt < count($nodesToScore); $pt++) {
        $parentNode = $nodesToScore[$pt]->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodesToScore[$pt]);

        // Skip nodes that are detached or whose parent is not an element.
        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if(strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add points for any commas within this paragraph */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c=0, $cl=count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        // Compare as floats: scores are fractional after the link-density scaling, and
        // truncating the current best would let a lower-scoring later candidate replace it.
        if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // we don't have a body either? what a mess! :)
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);

    // The top candidate has no parent when it was created above but never attached
    // (document without a root element); in that case there are no siblings to inspect.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $siblingCount = ($siblingNodes !== null) ? $siblingNodes->length : 0;

    for ($s=0, $sl=$siblingCount; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength  = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    // The sibling itself is about to be moved out of the live list.
                    $s--;
                    $sl--;
                }
            } else {
                $nodeToAppend = $siblingNode;
                // The sibling itself is about to be moved out of the live list.
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
        $this->body->innerHTML = $this->bodyCache;

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
858 |
/**
 * Detach every <script> element found under the given document or element.
 *
 * @param DOMDocument|DOMElement $doc
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');

    // The list is live, so take nodes from the end until none are left to visit.
    $remaining = $scriptNodes->length;
    while ($remaining > 0) {
        $remaining--;
        $script = $scriptNodes->item($remaining);
        $script->parentNode->removeChild($script);
    }
}
872 |
/**
 * Return the trimmed text content of a node.
 *
 * When $normalizeSpaces is true, matches of the 'normalize' regexp are
 * additionally replaced by a single space.
 *
 * @param DOMElement $e
 * @param boolean $normalizeSpaces (default: true)
 * @return string empty string when the node has no text content
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
896 |
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);
    $occurrences = substr_count($text, $s);
    return $occurrences;
}
907 |
/**
 * Remove the style attribute from every element below $e.
 * Does nothing when $e is not an object.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    if (is_object($e)) {
        $descendants = $e->getElementsByTagName('*');
        for ($idx = 0, $total = $descendants->length; $idx < $total; $idx++) {
            $descendants->item($idx)->removeAttribute('style');
        }
    }
}
921 |
/**
 * Get the density of links in a node: the length of the text inside <a>
 * elements divided by the length of all text in the node.
 *
 * @param DOMElement $e
 * @return number 0 when the node has no text, otherwise link length / text length
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    foreach ($anchors as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    // Guard against division by zero for nodes without text.
    if ($totalLength <= 0) {
        return 0;
    }
    return $anchorLength / $totalLength;
}
943 |
/**
 * Weigh an element by its class and id attributes.
 *
 * Each non-empty attribute loses 25 points when it matches the 'negative'
 * regexp and gains 25 points when it matches the 'positive' regexp.
 * Returns 0 when FLAG_WEIGHT_CLASSES is not active.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $score = 0;
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }
        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }
        if (preg_match($this->regexps['negative'], $attrValue)) {
            $score -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $score += 25;
        }
    }
    return $score;
}
981 |
/**
 * Remove extraneous break tags from a node.
 *
 * Every match of the 'killBreaks' regexp in the node's inner HTML is
 * replaced by a single break tag, and the result is written back.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $html = $node->innerHTML;
    $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
    $node->innerHTML = $html;
}
993 |
/**
 * Remove all descendants of $e with tag name $tag.
 *
 * For 'object' and 'embed', an element is kept when its attribute values
 * or its inner HTML match the 'video' regexp.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);
    $keepVideos = ($tag == 'object' || $tag == 'embed');

    // Live list: iterate from the end so removals do not shift unvisited entries.
    $idx = $matches->length;
    while (--$idx >= 0) {
        $candidate = $matches->item($idx);

        if ($keepVideos) {
            /* Join all attribute values so one regexp test covers them. */
            $joinedValues = '';
            foreach ($candidate->attributes as $attribute) {
                $joinedValues .= $attribute->value . '|';
            }

            /* Check the attributes first, then the markup inside the element. */
            if (preg_match($this->regexps['video'], $joinedValues)
                || preg_match($this->regexps['video'], $candidate->innerHTML)) {
                continue;
            }
        }

        $candidate->parentNode->removeChild($candidate);
    }
}
1027 |
/**
 * Remove descendants of $e with tag name $tag when they look "fishy".
 *
 * An element is removed when its class weight plus stored readability score
 * is negative. Otherwise, if it has fewer than 10 commas, it is removed when
 * one of the heuristics below (image/list/input counts, content length,
 * link density, video embeds) fires.
 * Does nothing when FLAG_CLEAN_CONDITIONALLY is not active.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so nodes can be removed from the live list during the walk.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($idx = $tagsList->length - 1; $idx >= 0; $idx--) {
        $node = $tagsList->item($idx);

        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Plenty of commas: treat it as real content and leave it alone.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /**
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
         **/
        $p     = $node->getElementsByTagName('p')->length;
        $img   = $node->getElementsByTagName('img')->length;
        $li    = $node->getElementsByTagName('li')->length-100;
        $input = $node->getElementsByTagName('input')->length;

        // Only embeds whose src matches the 'video' regexp are counted.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embed) {
            if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                $embedCount++;
            }
        }

        $linkDensity   = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        $toRemove =
            ($img > $p)
            || ($li > $p && $tag != 'ul' && $tag != 'ol')
            || ($input > floor($p/3))
            || ($contentLength < 25 && ($img === 0 || $img > 2))
            || ($weight < 25 && $linkDensity > 0.2)
            || ($weight >= 25 && $linkDensity > 0.5)
            || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);

        if ($toRemove) {
            $node->parentNode->removeChild($node);
        }
    }
}
1104 |
/**
 * Remove spurious h1 and h2 headers from an element: those with a negative
 * class weight or a link density above 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        // Live list: remove from the end.
        $idx = $headers->length;
        while (--$idx >= 0) {
            $header = $headers->item($idx);
            $isSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;
            if ($isSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1121 |
/**
 * Tell whether any bit of $flag is set in the current flags.
 *
 * @param int $flag bit mask to test
 * @return boolean
 */
public function flagIsActive($flag) {
    $masked = $this->flags & $flag;
    return $masked > 0;
}
1125 |
/**
 * Switch on the bits of $flag in the current flags.
 *
 * @param int $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1129 |
/**
 * Switch off the bits of $flag in the current flags.
 *
 * @param int $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1133 | }
1134 | ?>
--------------------------------------------------------------------------------