.
466 | * Whitespace between
elements are ignored. For example:
467 | *
abc
block.
477 | var replaced = false;
478 |
479 | // If we find a
chain, remove the
s until we hit another element
480 | // or non-whitespace. This leaves behind the first
in the chain
481 | // (which will be replaced with a
later).
482 | while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
483 | replaced = true;
484 | var brSibling = next.nextSibling;
485 | next.parentNode.removeChild(next);
486 | next = brSibling;
487 | }
488 |
489 | // If we removed a
chain, replace the remaining
with a
. Add 490 | // all sibling nodes as children of the
until we hit another
491 | // chain.
492 | if (replaced) {
493 | var p = this._doc.createElement("p");
494 | br.parentNode.replaceChild(p, br);
495 |
496 | next = p.nextSibling;
497 | while (next) {
498 | // If we've hit another
, we're done adding children to this
. 499 | if (next.tagName == "BR") { 500 | var nextElem = this._nextElement(next.nextSibling); 501 | if (nextElem && nextElem.tagName == "BR") 502 | break; 503 | } 504 | 505 | if (!this._isPhrasingContent(next)) 506 | break; 507 | 508 | // Otherwise, make this node a child of the new
. 509 | var sibling = next.nextSibling; 510 | p.appendChild(next); 511 | next = sibling; 512 | } 513 | 514 | while (p.lastChild && this._isWhitespace(p.lastChild)) { 515 | p.removeChild(p.lastChild); 516 | } 517 | 518 | if (p.parentNode.tagName === "P") 519 | this._setNodeTag(p.parentNode, "DIV"); 520 | } 521 | }); 522 | }, 523 | 524 | _setNodeTag: function (node, tag) { 525 | this.log("_setNodeTag", node, tag); 526 | if (node.__JSDOMParser__) { 527 | node.localName = tag.toLowerCase(); 528 | node.tagName = tag.toUpperCase(); 529 | return node; 530 | } 531 | 532 | var replacement = node.ownerDocument.createElement(tag); 533 | while (node.firstChild) { 534 | replacement.appendChild(node.firstChild); 535 | } 536 | node.parentNode.replaceChild(replacement, node); 537 | if (node.readability) 538 | replacement.readability = node.readability; 539 | 540 | for (var i = 0; i < node.attributes.length; i++) { 541 | replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); 542 | } 543 | return replacement; 544 | }, 545 | 546 | /** 547 | * Prepare the article node for display. Clean out any inline styles, 548 | * iframes, forms, strip extraneous
tags, etc. 549 | * 550 | * @param Element 551 | * @return void 552 | **/ 553 | _prepArticle: function (articleContent) { 554 | this._cleanStyles(articleContent); 555 | 556 | // Check for data tables before we continue, to avoid removing items in 557 | // those tables, which will often be isolated even though they're 558 | // visually linked to other content-ful elements (text, images, etc.). 559 | this._markDataTables(articleContent); 560 | 561 | // Clean out junk from the article content 562 | this._cleanConditionally(articleContent, "form"); 563 | this._cleanConditionally(articleContent, "fieldset"); 564 | this._clean(articleContent, "object"); 565 | this._clean(articleContent, "embed"); 566 | this._clean(articleContent, "h1"); 567 | this._clean(articleContent, "footer"); 568 | this._clean(articleContent, "link"); 569 | this._clean(articleContent, "aside"); 570 | 571 | // Clean out elements have "share" in their id/class combinations from final top candidates, 572 | // which means we don't remove the top candidates even they have "share". 573 | this._forEachNode(articleContent.children, function (topCandidate) { 574 | this._cleanMatchedNodes(topCandidate, /share/); 575 | }); 576 | 577 | // If there is only one h2 and its text content substantially equals article title, 578 | // they are probably using it as a header and not a subheader, 579 | // so remove it since we already extract the title separately. 580 | var h2 = articleContent.getElementsByTagName("h2"); 581 | if (h2.length === 1) { 582 | var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; 583 | if (Math.abs(lengthSimilarRate) < 0.5) { 584 | var titlesMatch = false; 585 | if (lengthSimilarRate > 0) { 586 | titlesMatch = h2[0].textContent.includes(this._articleTitle); 587 | } else { 588 | titlesMatch = this._articleTitle.includes(h2[0].textContent); 589 | } 590 | if (titlesMatch) { 591 | this._clean(articleContent, "h2"); 592 | } 593 | } 594 | } 595 | 596 | this._clean(articleContent, "iframe"); 597 | this._clean(articleContent, "input"); 598 | this._clean(articleContent, "textarea"); 599 | this._clean(articleContent, "select"); 600 | this._clean(articleContent, "button"); 601 | this._cleanHeaders(articleContent); 602 | 603 | // Do these last as the previous stuff may have removed junk 604 | // that will affect these 605 | this._cleanConditionally(articleContent, "table"); 606 | this._cleanConditionally(articleContent, "ul"); 607 | this._cleanConditionally(articleContent, "div"); 608 | 609 | // Remove extra paragraphs 610 | this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { 611 | var imgCount = paragraph.getElementsByTagName("img").length; 612 | var embedCount = paragraph.getElementsByTagName("embed").length; 613 | var objectCount = paragraph.getElementsByTagName("object").length; 614 | // At this point, nasty iframes have been removed, only remain embedded video ones. 615 | var iframeCount = paragraph.getElementsByTagName("iframe").length; 616 | var totalCount = imgCount + embedCount + objectCount + iframeCount; 617 | 618 | return totalCount === 0 && !this._getInnerText(paragraph, false); 619 | }); 620 | 621 | this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function (br) { 622 | var next = this._nextElement(br.nextSibling); 623 | if (next && next.tagName == "P") 624 | br.parentNode.removeChild(br); 625 | }); 626 | 627 | // Remove single-cell tables 628 | this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function (table) { 629 | var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; 630 | if (this._hasSingleTagInsideElement(tbody, "TR")) { 631 | var row = tbody.firstElementChild; 632 | if (this._hasSingleTagInsideElement(row, "TD")) { 633 | var cell = row.firstElementChild; 634 | cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); 635 | table.parentNode.replaceChild(cell, table); 636 | } 637 | } 638 | }); 639 | }, 640 | 641 | /** 642 | * Initialize a node with the readability object. Also checks the 643 | * className/id for special names to add to its score. 644 | * 645 | * @param Element 646 | * @return void 647 | **/ 648 | _initializeNode: function (node) { 649 | node.readability = {"contentScore": 0}; 650 | 651 | switch (node.tagName) { 652 | case "DIV": 653 | node.readability.contentScore += 5; 654 | break; 655 | 656 | case "PRE": 657 | case "TD": 658 | case "BLOCKQUOTE": 659 | node.readability.contentScore += 3; 660 | break; 661 | 662 | case "ADDRESS": 663 | case "OL": 664 | case "UL": 665 | case "DL": 666 | case "DD": 667 | case "DT": 668 | case "LI": 669 | case "FORM": 670 | node.readability.contentScore -= 3; 671 | break; 672 | 673 | case "H1": 674 | case "H2": 675 | case "H3": 676 | case "H4": 677 | case "H5": 678 | case "H6": 679 | case "TH": 680 | node.readability.contentScore -= 5; 681 | break; 682 | } 683 | 684 | node.readability.contentScore += this._getClassWeight(node); 685 | }, 686 | 687 | _removeAndGetNext: function (node) { 688 | var nextNode = this._getNextNode(node, true); 689 | node.parentNode.removeChild(node); 690 | return nextNode; 691 | }, 692 | 693 | /** 694 | * Traverse the DOM from node to node, starting at the node passed in. 695 | * Pass true for the second parameter to indicate this node itself 696 | * (and its kids) are going away, and we want the next node over. 697 | * 698 | * Calling this in a loop will traverse the DOM depth-first. 699 | */ 700 | _getNextNode: function (node, ignoreSelfAndKids) { 701 | // First check for kids if those aren't being ignored 702 | if (!ignoreSelfAndKids && node.firstElementChild) { 703 | return node.firstElementChild; 704 | } 705 | // Then for siblings... 706 | if (node.nextElementSibling) { 707 | return node.nextElementSibling; 708 | } 709 | // And finally, move up the parent chain *and* find a sibling 710 | // (because this is depth-first traversal, we will have already 711 | // seen the parent nodes themselves). 712 | do { 713 | node = node.parentNode; 714 | } while (node && !node.nextElementSibling); 715 | return node && node.nextElementSibling; 716 | }, 717 | 718 | _checkByline: function (node, matchString) { 719 | if (this._articleByline) { 720 | return false; 721 | } 722 | 723 | if (node.getAttribute !== undefined) { 724 | var rel = node.getAttribute("rel"); 725 | } 726 | 727 | if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { 728 | this._articleByline = node.textContent.trim(); 729 | return true; 730 | } 731 | 732 | return false; 733 | }, 734 | 735 | _getNodeAncestors: function (node, maxDepth) { 736 | maxDepth = maxDepth || 0; 737 | var i = 0, ancestors = []; 738 | while (node.parentNode) { 739 | ancestors.push(node.parentNode); 740 | if (maxDepth && ++i === maxDepth) 741 | break; 742 | node = node.parentNode; 743 | } 744 | return ancestors; 745 | }, 746 | 747 | /*** 748 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 749 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 750 | * 751 | * @param page a document to run upon. Needs to be a full document, complete with body. 752 | * @return Element 753 | **/ 754 | _grabArticle: function (page) { 755 | this.log("**** grabArticle ****"); 756 | var doc = this._doc; 757 | var isPaging = (page !== null ? true : false); 758 | page = page ? page : this._doc.body; 759 | 760 | // We can't grab an article if we don't have a page! 761 | if (!page) { 762 | this.log("No body found in document. Abort."); 763 | return null; 764 | } 765 | 766 | var pageCacheHtml = page.innerHTML; 767 | 768 | while (true) { 769 | var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); 770 | 771 | // First, node prepping. Trash nodes that look cruddy (like ones with the 772 | // class name "comment", etc), and turn divs into P tags where they have been 773 | // used inappropriately (as in, where they contain no other block level elements.) 774 | var elementsToScore = []; 775 | var node = this._doc.documentElement; 776 | 777 | while (node) { 778 | var matchString = node.className + " " + node.id; 779 | 780 | if (!this._isProbablyVisible(node)) { 781 | this.log("Removing hidden node - " + matchString); 782 | node = this._removeAndGetNext(node); 783 | continue; 784 | } 785 | 786 | // Check to see if this node is a byline, and remove it if it is. 787 | if (this._checkByline(node, matchString)) { 788 | node = this._removeAndGetNext(node); 789 | continue; 790 | } 791 | 792 | // Remove unlikely candidates 793 | if (stripUnlikelyCandidates) { 794 | if (this.REGEXPS.unlikelyCandidates.test(matchString) && 795 | !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 796 | node.tagName !== "BODY" && 797 | node.tagName !== "A") { 798 | this.log("Removing unlikely candidate - " + matchString); 799 | node = this._removeAndGetNext(node); 800 | continue; 801 | } 802 | } 803 | 804 | // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). 805 | if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || 806 | node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || 807 | node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && 808 | this._isElementWithoutContent(node)) { 809 | node = this._removeAndGetNext(node); 810 | continue; 811 | } 812 | 813 | if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { 814 | elementsToScore.push(node); 815 | } 816 | 817 | // Turn all divs that don't have children block level elements into p's 818 | if (node.tagName === "DIV") { 819 | // Put phrasing content into paragraphs. 820 | var p = null; 821 | var childNode = node.firstChild; 822 | while (childNode) { 823 | var nextSibling = childNode.nextSibling; 824 | if (this._isPhrasingContent(childNode)) { 825 | if (p !== null) { 826 | p.appendChild(childNode); 827 | } else if (!this._isWhitespace(childNode)) { 828 | p = doc.createElement("p"); 829 | node.replaceChild(p, childNode); 830 | p.appendChild(childNode); 831 | } 832 | } else if (p !== null) { 833 | while (p.lastChild && this._isWhitespace(p.lastChild)) { 834 | p.removeChild(p.lastChild); 835 | } 836 | p = null; 837 | } 838 | childNode = nextSibling; 839 | } 840 | 841 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV 842 | // element. DIVs with only a P element inside and no text content can be 843 | // safely converted into plain P elements to avoid confusing the scoring 844 | // algorithm with DIVs with are, in practice, paragraphs. 845 | if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { 846 | var newNode = node.children[0]; 847 | node.parentNode.replaceChild(newNode, node); 848 | node = newNode; 849 | elementsToScore.push(node); 850 | } else if (!this._hasChildBlockElement(node)) { 851 | node = this._setNodeTag(node, "P"); 852 | elementsToScore.push(node); 853 | } 854 | } 855 | node = this._getNextNode(node); 856 | } 857 | 858 | /** 859 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. 860 | * Then add their score to their parent node. 861 | * 862 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 863 | **/ 864 | var candidates = []; 865 | this._forEachNode(elementsToScore, function (elementToScore) { 866 | if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") 867 | return; 868 | 869 | // If this paragraph is less than 25 characters, don't even count it. 870 | var innerText = this._getInnerText(elementToScore); 871 | if (innerText.length < 25) 872 | return; 873 | 874 | // Exclude nodes with no ancestor. 875 | var ancestors = this._getNodeAncestors(elementToScore, 3); 876 | if (ancestors.length === 0) 877 | return; 878 | 879 | var contentScore = 0; 880 | 881 | // Add a point for the paragraph itself as a base. 882 | contentScore += 1; 883 | 884 | // Add points for any commas within this paragraph. 885 | contentScore += innerText.split(",").length; 886 | 887 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 888 | contentScore += Math.min(Math.floor(innerText.length / 100), 3); 889 | 890 | // Initialize and score ancestors. 891 | this._forEachNode(ancestors, function (ancestor, level) { 892 | if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") 893 | return; 894 | 895 | if (typeof(ancestor.readability) === "undefined") { 896 | this._initializeNode(ancestor); 897 | candidates.push(ancestor); 898 | } 899 | 900 | // Node score divider: 901 | // - parent: 1 (no division) 902 | // - grandparent: 2 903 | // - great grandparent+: ancestor level * 3 904 | if (level === 0) 905 | var scoreDivider = 1; 906 | else if (level === 1) 907 | scoreDivider = 2; 908 | else 909 | scoreDivider = level * 3; 910 | ancestor.readability.contentScore += contentScore / scoreDivider; 911 | }); 912 | }); 913 | 914 | // After we've calculated scores, loop through all of the possible 915 | // candidate nodes we found and find the one with the highest score. 916 | var topCandidates = []; 917 | for (var c = 0, cl = candidates.length; c < cl; c += 1) { 918 | var candidate = candidates[c]; 919 | 920 | // Scale the final candidates score based on link density. Good content 921 | // should have a relatively small link density (5% or less) and be mostly 922 | // unaffected by this operation. 923 | var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); 924 | candidate.readability.contentScore = candidateScore; 925 | 926 | this.log("Candidate:", candidate, "with score " + candidateScore); 927 | 928 | for (var t = 0; t < this._nbTopCandidates; t++) { 929 | var aTopCandidate = topCandidates[t]; 930 | 931 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { 932 | topCandidates.splice(t, 0, candidate); 933 | if (topCandidates.length > this._nbTopCandidates) 934 | topCandidates.pop(); 935 | break; 936 | } 937 | } 938 | } 939 | 940 | var topCandidate = topCandidates[0] || null; 941 | var neededToCreateTopCandidate = false; 942 | var parentOfTopCandidate; 943 | 944 | // If we still have no top candidate, just use the body as a last resort. 945 | // We also have to copy the body node so it is something we can modify. 946 | if (topCandidate === null || topCandidate.tagName === "BODY") { 947 | // Move all of the page's children into topCandidate 948 | topCandidate = doc.createElement("DIV"); 949 | neededToCreateTopCandidate = true; 950 | // Move everything (not just elements, also text nodes etc.) into the container 951 | // so we even include text directly in the body: 952 | var kids = page.childNodes; 953 | while (kids.length) { 954 | this.log("Moving child out:", kids[0]); 955 | topCandidate.appendChild(kids[0]); 956 | } 957 | 958 | page.appendChild(topCandidate); 959 | 960 | this._initializeNode(topCandidate); 961 | } else if (topCandidate) { 962 | // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array 963 | // and whose scores are quite closed with current `topCandidate` node. 964 | var alternativeCandidateAncestors = []; 965 | for (var i = 1; i < topCandidates.length; i++) { 966 | if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { 967 | alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); 968 | } 969 | } 970 | var MINIMUM_TOPCANDIDATES = 3; 971 | if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { 972 | parentOfTopCandidate = topCandidate.parentNode; 973 | while (parentOfTopCandidate.tagName !== "BODY") { 974 | var listsContainingThisAncestor = 0; 975 | for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { 976 | listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); 977 | } 978 | if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { 979 | topCandidate = parentOfTopCandidate; 980 | break; 981 | } 982 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 983 | } 984 | } 985 | if (!topCandidate.readability) { 986 | this._initializeNode(topCandidate); 987 | } 988 | 989 | // Because of our bonus system, parents of candidates might have scores 990 | // themselves. They get half of the node. There won't be nodes with higher 991 | // scores than our topCandidate, but if we see the score going *up* in the first 992 | // few steps up the tree, that's a decent sign that there might be more content 993 | // lurking in other places that we want to unify in. The sibling stuff 994 | // below does some of that - but only if we've looked high enough up the DOM 995 | // tree. 996 | parentOfTopCandidate = topCandidate.parentNode; 997 | var lastScore = topCandidate.readability.contentScore; 998 | // The scores shouldn't get too low. 999 | var scoreThreshold = lastScore / 3; 1000 | while (parentOfTopCandidate.tagName !== "BODY") { 1001 | if (!parentOfTopCandidate.readability) { 1002 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 1003 | continue; 1004 | } 1005 | var parentScore = parentOfTopCandidate.readability.contentScore; 1006 | if (parentScore < scoreThreshold) 1007 | break; 1008 | if (parentScore > lastScore) { 1009 | // Alright! We found a better parent to use. 1010 | topCandidate = parentOfTopCandidate; 1011 | break; 1012 | } 1013 | lastScore = parentOfTopCandidate.readability.contentScore; 1014 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 1015 | } 1016 | 1017 | // If the top candidate is the only child, use parent instead. This will help sibling 1018 | // joining logic when adjacent content is actually located in parent's sibling node. 1019 | parentOfTopCandidate = topCandidate.parentNode; 1020 | while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { 1021 | topCandidate = parentOfTopCandidate; 1022 | parentOfTopCandidate = topCandidate.parentNode; 1023 | } 1024 | if (!topCandidate.readability) { 1025 | this._initializeNode(topCandidate); 1026 | } 1027 | } 1028 | 1029 | // Now that we have the top candidate, look through its siblings for content 1030 | // that might also be related. Things like preambles, content split by ads 1031 | // that we removed, etc. 1032 | var articleContent = doc.createElement("DIV"); 1033 | if (isPaging) 1034 | articleContent.id = "readability-content"; 1035 | 1036 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 1037 | // Keep potential top candidate's parent node to try to get text direction of it later. 1038 | parentOfTopCandidate = topCandidate.parentNode; 1039 | var siblings = parentOfTopCandidate.children; 1040 | 1041 | for (var s = 0, sl = siblings.length; s < sl; s++) { 1042 | var sibling = siblings[s]; 1043 | var append = false; 1044 | 1045 | this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); 1046 | this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); 1047 | 1048 | if (sibling === topCandidate) { 1049 | append = true; 1050 | } else { 1051 | var contentBonus = 0; 1052 | 1053 | // Give a bonus if sibling nodes and top candidates have the example same classname 1054 | if (sibling.className === topCandidate.className && topCandidate.className !== "") 1055 | contentBonus += topCandidate.readability.contentScore * 0.2; 1056 | 1057 | if (sibling.readability && 1058 | ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { 1059 | append = true; 1060 | } else if (sibling.nodeName === "P") { 1061 | var linkDensity = this._getLinkDensity(sibling); 1062 | var nodeContent = this._getInnerText(sibling); 1063 | var nodeLength = nodeContent.length; 1064 | 1065 | if (nodeLength > 80 && linkDensity < 0.25) { 1066 | append = true; 1067 | } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && 1068 | nodeContent.search(/\.( |$)/) !== -1) { 1069 | append = true; 1070 | } 1071 | } 1072 | } 1073 | 1074 | if (append) { 1075 | this.log("Appending node:", sibling); 1076 | 1077 | if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { 1078 | // We have a node that isn't a common block level element, like a form or td tag. 1079 | // Turn it into a div so it doesn't get filtered out later by accident. 1080 | this.log("Altering sibling:", sibling, "to div."); 1081 | 1082 | sibling = this._setNodeTag(sibling, "DIV"); 1083 | } 1084 | 1085 | articleContent.appendChild(sibling); 1086 | // siblings is a reference to the children array, and 1087 | // sibling is removed from the array when we call appendChild(). 1088 | // As a result, we must revisit this index since the nodes 1089 | // have been shifted. 1090 | s -= 1; 1091 | sl -= 1; 1092 | } 1093 | } 1094 | 1095 | if (this._debug) 1096 | this.log("Article content pre-prep: " + articleContent.innerHTML); 1097 | // So we have all of the content that we need. Now we clean it up for presentation. 1098 | this._prepArticle(articleContent); 1099 | if (this._debug) 1100 | this.log("Article content post-prep: " + articleContent.innerHTML); 1101 | 1102 | if (neededToCreateTopCandidate) { 1103 | // We already created a fake div thing, and there wouldn't have been any siblings left 1104 | // for the previous loop, so there's no point trying to create a new div, and then 1105 | // move all the children over. Just assign IDs and class names here. No need to append 1106 | // because that already happened anyway. 1107 | topCandidate.id = "readability-page-1"; 1108 | topCandidate.className = "page"; 1109 | } else { 1110 | var div = doc.createElement("DIV"); 1111 | div.id = "readability-page-1"; 1112 | div.className = "page"; 1113 | var children = articleContent.childNodes; 1114 | while (children.length) { 1115 | div.appendChild(children[0]); 1116 | } 1117 | articleContent.appendChild(div); 1118 | } 1119 | 1120 | if (this._debug) 1121 | this.log("Article content after paging: " + articleContent.innerHTML); 1122 | 1123 | var parseSuccessful = true; 1124 | 1125 | // Now that we've gone through the full algorithm, check to see if 1126 | // we got any meaningful content. If we didn't, we may need to re-run 1127 | // grabArticle with different flags set. This gives us a higher likelihood of 1128 | // finding the content, and the sieve approach gives us a higher likelihood of 1129 | // finding the -right- content. 1130 | var textLength = this._getInnerText(articleContent, true).length; 1131 | if (textLength < this._charThreshold) { 1132 | parseSuccessful = false; 1133 | page.innerHTML = pageCacheHtml; 1134 | 1135 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 1136 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 1137 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1138 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 1139 | this._removeFlag(this.FLAG_WEIGHT_CLASSES); 1140 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1141 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 1142 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 1143 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1144 | } else { 1145 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1146 | // No luck after removing flags, just return the longest text we found during the different loops 1147 | this._attempts.sort(function (a, b) { 1148 | return a.textLength < b.textLength; 1149 | }); 1150 | 1151 | // But first check if we actually have something 1152 | if (!this._attempts[0].textLength) { 1153 | return null; 1154 | } 1155 | 1156 | articleContent = this._attempts[0].articleContent; 1157 | parseSuccessful = true; 1158 | } 1159 | } 1160 | 1161 | if (parseSuccessful) { 1162 | // Find out text direction from ancestors of final top candidate. 1163 | var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); 1164 | this._someNode(ancestors, function (ancestor) { 1165 | if (!ancestor.tagName) 1166 | return false; 1167 | var articleDir = ancestor.getAttribute("dir"); 1168 | if (articleDir) { 1169 | this._articleDir = articleDir; 1170 | return true; 1171 | } 1172 | return false; 1173 | }); 1174 | return articleContent; 1175 | } 1176 | } 1177 | }, 1178 | 1179 | /** 1180 | * Check whether the input string could be a byline. 1181 | * This verifies that the input is a string, and that the length 1182 | * is less than 100 chars. 1183 | * 1184 | * @param possibleByline {string} - a string to check whether its a byline. 1185 | * @return Boolean - whether the input string is a byline. 1186 | */ 1187 | _isValidByline: function (byline) { 1188 | if (typeof byline == "string" || byline instanceof String) { 1189 | byline = byline.trim(); 1190 | return (byline.length > 0) && (byline.length < 100); 1191 | } 1192 | return false; 1193 | }, 1194 | 1195 | /** 1196 | * Attempts to get excerpt and byline metadata for the article. 1197 | * 1198 | * @return Object with optional "excerpt" and "byline" properties 1199 | */ 1200 | _getArticleMetadata: function () { 1201 | var metadata = {}; 1202 | var values = {}; 1203 | var metaElements = this._doc.getElementsByTagName("meta"); 1204 | 1205 | // Match "description", or Twitter's "twitter:description" (Cards) 1206 | // in name attribute. 1207 | var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; 1208 | 1209 | // Match Facebook's Open Graph title & description properties. 1210 | var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; 1211 | 1212 | // Find description tags. 1213 | this._forEachNode(metaElements, function (element) { 1214 | var elementName = element.getAttribute("name"); 1215 | var elementProperty = element.getAttribute("property"); 1216 | 1217 | if ([elementName, elementProperty].indexOf("author") !== -1) { 1218 | metadata.byline = element.getAttribute("content"); 1219 | return; 1220 | } 1221 | 1222 | var name = null; 1223 | if (namePattern.test(elementName)) { 1224 | name = elementName; 1225 | } else if (propertyPattern.test(elementProperty)) { 1226 | name = elementProperty; 1227 | } 1228 | 1229 | if (name) { 1230 | var content = element.getAttribute("content"); 1231 | if (content) { 1232 | // Convert to lowercase and remove any whitespace 1233 | // so we can match below. 1234 | name = name.toLowerCase().replace(/\s/g, ""); 1235 | values[name] = content.trim(); 1236 | } 1237 | } 1238 | }); 1239 | 1240 | if ("description" in values) { 1241 | metadata.excerpt = values["description"]; 1242 | } else if ("og:description" in values) { 1243 | // Use facebook open graph description. 1244 | metadata.excerpt = values["og:description"]; 1245 | } else if ("twitter:description" in values) { 1246 | // Use twitter cards description. 1247 | metadata.excerpt = values["twitter:description"]; 1248 | } 1249 | 1250 | metadata.title = this._getArticleTitle(); 1251 | if (!metadata.title) { 1252 | if ("og:title" in values) { 1253 | // Use facebook open graph title. 1254 | metadata.title = values["og:title"]; 1255 | } else if ("twitter:title" in values) { 1256 | // Use twitter cards title. 1257 | metadata.title = values["twitter:title"]; 1258 | } 1259 | } 1260 | 1261 | return metadata; 1262 | }, 1263 | 1264 | /** 1265 | * Removes script tags from the document. 1266 | * 1267 | * @param Element 1268 | **/ 1269 | _removeScripts: function (doc) { 1270 | this._removeNodes(doc.getElementsByTagName("script"), function (scriptNode) { 1271 | scriptNode.nodeValue = ""; 1272 | scriptNode.removeAttribute("src"); 1273 | return true; 1274 | }); 1275 | this._removeNodes(doc.getElementsByTagName("noscript")); 1276 | }, 1277 | 1278 | /** 1279 | * Check if this node has only whitespace and a single element with given tag 1280 | * Returns false if the DIV node contains non-empty text nodes 1281 | * or if it contains no element with given tag or more than 1 element. 1282 | * 1283 | * @param Element 1284 | * @param string tag of child element 1285 | **/ 1286 | _hasSingleTagInsideElement: function (element, tag) { 1287 | // There should be exactly 1 element child with given tag 1288 | if (element.children.length != 1 || element.children[0].tagName !== tag) { 1289 | return false; 1290 | } 1291 | 1292 | // And there should be no text nodes with real content 1293 | return !this._someNode(element.childNodes, function (node) { 1294 | return node.nodeType === this.TEXT_NODE && 1295 | this.REGEXPS.hasContent.test(node.textContent); 1296 | }); 1297 | }, 1298 | 1299 | _isElementWithoutContent: function (node) { 1300 | return node.nodeType === this.ELEMENT_NODE && 1301 | node.textContent.trim().length == 0 && 1302 | (node.children.length == 0 || 1303 | node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); 1304 | }, 1305 | 1306 | /** 1307 | * Determine whether element has any children block level elements. 1308 | * 1309 | * @param Element 1310 | */ 1311 | _hasChildBlockElement: function (element) { 1312 | return this._someNode(element.childNodes, function (node) { 1313 | return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || 1314 | this._hasChildBlockElement(node); 1315 | }); 1316 | }, 1317 | 1318 | /*** 1319 | * Determine if a node qualifies as phrasing content. 1320 | * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content 1321 | **/ 1322 | _isPhrasingContent: function (node) { 1323 | return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || 1324 | ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && 1325 | this._everyNode(node.childNodes, this._isPhrasingContent)); 1326 | }, 1327 | 1328 | _isWhitespace: function (node) { 1329 | return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || 1330 | (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); 1331 | }, 1332 | 1333 | /** 1334 | * Get the inner text of a node - cross browser compatibly. 1335 | * This also strips out any excess whitespace to be found. 1336 | * 1337 | * @param Element 1338 | * @param Boolean normalizeSpaces (default: true) 1339 | * @return string 1340 | **/ 1341 | _getInnerText: function (e, normalizeSpaces) { 1342 | normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; 1343 | var textContent = e.textContent.trim(); 1344 | 1345 | if (normalizeSpaces) { 1346 | return textContent.replace(this.REGEXPS.normalize, " "); 1347 | } 1348 | return textContent; 1349 | }, 1350 | 1351 | /** 1352 | * Get the number of times a string s appears in the node e. 1353 | * 1354 | * @param Element 1355 | * @param string - what to split on. Default is "," 1356 | * @return number (integer) 1357 | **/ 1358 | _getCharCount: function (e, s) { 1359 | s = s || ","; 1360 | return this._getInnerText(e).split(s).length - 1; 1361 | }, 1362 | 1363 | /** 1364 | * Remove the style attribute on every e and under. 1365 | * TODO: Test if getElementsByTagName(*) is faster. 1366 | * 1367 | * @param Element 1368 | * @return void 1369 | **/ 1370 | _cleanStyles: function (e) { 1371 | if (!e || e.tagName.toLowerCase() === "svg") 1372 | return; 1373 | 1374 | // Remove `style` and deprecated presentational attributes 1375 | for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { 1376 | e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); 1377 | } 1378 | 1379 | if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { 1380 | e.removeAttribute("width"); 1381 | e.removeAttribute("height"); 1382 | } 1383 | 1384 | var cur = e.firstElementChild; 1385 | while (cur !== null) { 1386 | this._cleanStyles(cur); 1387 | cur = cur.nextElementSibling; 1388 | } 1389 | }, 1390 | 1391 | /** 1392 | * Get the density of links as a percentage of the content 1393 | * This is the amount of text that is inside a link divided by the total text in the node. 1394 | * 1395 | * @param Element 1396 | * @return number (float) 1397 | **/ 1398 | _getLinkDensity: function (element) { 1399 | var textLength = this._getInnerText(element).length; 1400 | if (textLength === 0) 1401 | return 0; 1402 | 1403 | var linkLength = 0; 1404 | 1405 | // XXX implement _reduceNodeList? 1406 | this._forEachNode(element.getElementsByTagName("a"), function (linkNode) { 1407 | linkLength += this._getInnerText(linkNode).length; 1408 | }); 1409 | 1410 | return linkLength / textLength; 1411 | }, 1412 | 1413 | /** 1414 | * Get an elements class/id weight. Uses regular expressions to tell if this 1415 | * element looks good or bad. 1416 | * 1417 | * @param Element 1418 | * @return number (Integer) 1419 | **/ 1420 | _getClassWeight: function (e) { 1421 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) 1422 | return 0; 1423 | 1424 | var weight = 0; 1425 | 1426 | // Look for a special classname 1427 | if (typeof(e.className) === "string" && e.className !== "") { 1428 | if (this.REGEXPS.negative.test(e.className)) 1429 | weight -= 25; 1430 | 1431 | if (this.REGEXPS.positive.test(e.className)) 1432 | weight += 25; 1433 | } 1434 | 1435 | // Look for a special ID 1436 | if (typeof(e.id) === "string" && e.id !== "") { 1437 | if (this.REGEXPS.negative.test(e.id)) 1438 | weight -= 25; 1439 | 1440 | if (this.REGEXPS.positive.test(e.id)) 1441 | weight += 25; 1442 | } 1443 | 1444 | return weight; 1445 | }, 1446 | 1447 | /** 1448 | * Clean a node of all elements of type "tag". 1449 | * (Unless it's a youtube/vimeo video. People love movies.) 1450 | * 1451 | * @param Element 1452 | * @param string tag to clean 1453 | * @return void 1454 | **/ 1455 | _clean: function (e, tag) { 1456 | var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; 1457 | 1458 | this._removeNodes(e.getElementsByTagName(tag), function (element) { 1459 | // Allow youtube and vimeo videos through as people usually want to see those. 1460 | if (isEmbed) { 1461 | var attributeValues = [].map.call(element.attributes, function (attr) { 1462 | return attr.value; 1463 | }).join("|"); 1464 | 1465 | // First, check the elements attributes to see if any of them contain youtube or vimeo 1466 | if (this.REGEXPS.videos.test(attributeValues)) 1467 | return false; 1468 | 1469 | // Then check the elements inside this element for the same. 1470 | if (this.REGEXPS.videos.test(element.innerHTML)) 1471 | return false; 1472 | } 1473 | 1474 | return true; 1475 | }); 1476 | }, 1477 | 1478 | /** 1479 | * Check if a given node has one of its ancestor tag name matching the 1480 | * provided one. 1481 | * @param HTMLElement node 1482 | * @param String tagName 1483 | * @param Number maxDepth 1484 | * @param Function filterFn a filter to invoke to determine whether this node 'counts' 1485 | * @return Boolean 1486 | */ 1487 | _hasAncestorTag: function (node, tagName, maxDepth, filterFn) { 1488 | maxDepth = maxDepth || 3; 1489 | tagName = tagName.toUpperCase(); 1490 | var depth = 0; 1491 | while (node.parentNode) { 1492 | if (maxDepth > 0 && depth > maxDepth) 1493 | return false; 1494 | if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) 1495 | return true; 1496 | node = node.parentNode; 1497 | depth++; 1498 | } 1499 | return false; 1500 | }, 1501 | 1502 | /** 1503 | * Return an object indicating how many rows and columns this table has. 1504 | */ 1505 | _getRowAndColumnCount: function (table) { 1506 | var rows = 0; 1507 | var columns = 0; 1508 | var trs = table.getElementsByTagName("tr"); 1509 | for (var i = 0; i < trs.length; i++) { 1510 | var rowspan = trs[i].getAttribute("rowspan") || 0; 1511 | if (rowspan) { 1512 | rowspan = parseInt(rowspan, 10); 1513 | } 1514 | rows += (rowspan || 1); 1515 | 1516 | // Now look for column-related info 1517 | var columnsInThisRow = 0; 1518 | var cells = trs[i].getElementsByTagName("td"); 1519 | for (var j = 0; j < cells.length; j++) { 1520 | var colspan = cells[j].getAttribute("colspan") || 0; 1521 | if (colspan) { 1522 | colspan = parseInt(colspan, 10); 1523 | } 1524 | columnsInThisRow += (colspan || 1); 1525 | } 1526 | columns = Math.max(columns, columnsInThisRow); 1527 | } 1528 | return {rows: rows, columns: columns}; 1529 | }, 1530 | 1531 | /** 1532 | * Look for 'data' (as opposed to 'layout') tables, for which we use 1533 | * similar checks as 1534 | * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920 1535 | */ 1536 | _markDataTables: function (root) { 1537 | var tables = root.getElementsByTagName("table"); 1538 | for (var i = 0; i < tables.length; i++) { 1539 | var table = tables[i]; 1540 | var role = table.getAttribute("role"); 1541 | if (role == "presentation") { 1542 | table._readabilityDataTable = false; 1543 | continue; 1544 | } 1545 | var datatable = table.getAttribute("datatable"); 1546 | if (datatable == "0") { 1547 | table._readabilityDataTable = false; 1548 | continue; 1549 | } 1550 | var summary = table.getAttribute("summary"); 1551 | if (summary) { 1552 | table._readabilityDataTable = true; 1553 | continue; 1554 | } 1555 | 1556 | var caption = table.getElementsByTagName("caption")[0]; 1557 | if (caption && caption.childNodes.length > 0) { 1558 | table._readabilityDataTable = true; 1559 | continue; 1560 | } 1561 | 1562 | // If the table has a descendant with any of these tags, consider a data table: 1563 | var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; 1564 | var descendantExists = function (tag) { 1565 | return !!table.getElementsByTagName(tag)[0]; 1566 | }; 1567 | if (dataTableDescendants.some(descendantExists)) { 1568 | this.log("Data table because found data-y descendant"); 1569 | table._readabilityDataTable = true; 1570 | continue; 1571 | } 1572 | 1573 | // Nested tables indicate a layout table: 1574 | if (table.getElementsByTagName("table")[0]) { 1575 | table._readabilityDataTable = false; 1576 | continue; 1577 | } 1578 | 1579 | var sizeInfo = this._getRowAndColumnCount(table); 1580 | if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { 1581 | table._readabilityDataTable = true; 1582 | continue; 1583 | } 1584 | // Now just go by size entirely: 1585 | table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; 1586 | } 1587 | }, 1588 | 1589 | /** 1590 | * Clean an element of all tags of type "tag" if they look fishy. 1591 | * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 1592 | * 1593 | * @return void 1594 | **/ 1595 | _cleanConditionally: function (e, tag) { 1596 | if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) 1597 | return; 1598 | 1599 | var isList = tag === "ul" || tag === "ol"; 1600 | 1601 | // Gather counts for other typical elements embedded within. 1602 | // Traverse backwards so we can remove nodes at the same time 1603 | // without effecting the traversal. 1604 | // 1605 | // TODO: Consider taking into account original contentScore here. 1606 | this._removeNodes(e.getElementsByTagName(tag), function (node) { 1607 | // First check if we're in a data table, in which case don't remove us. 1608 | var isDataTable = function (t) { 1609 | return t._readabilityDataTable; 1610 | }; 1611 | 1612 | if (this._hasAncestorTag(node, "table", -1, isDataTable)) { 1613 | return false; 1614 | } 1615 | 1616 | var weight = this._getClassWeight(node); 1617 | var contentScore = 0; 1618 | 1619 | this.log("Cleaning Conditionally", node); 1620 | 1621 | if (weight + contentScore < 0) { 1622 | return true; 1623 | } 1624 | 1625 | if (this._getCharCount(node, ",") < 10) { 1626 | // If there are not very many commas, and the number of 1627 | // non-paragraph elements is more than paragraphs or other 1628 | // ominous signs, remove the element. 1629 | var p = node.getElementsByTagName("p").length; 1630 | var img = node.getElementsByTagName("img").length; 1631 | var li = node.getElementsByTagName("li").length - 100; 1632 | var input = node.getElementsByTagName("input").length; 1633 | 1634 | var embedCount = 0; 1635 | var embeds = node.getElementsByTagName("embed"); 1636 | for (var ei = 0, il = embeds.length; ei < il; ei += 1) { 1637 | if (!this.REGEXPS.videos.test(embeds[ei].src)) 1638 | embedCount += 1; 1639 | } 1640 | 1641 | var linkDensity = this._getLinkDensity(node); 1642 | var contentLength = this._getInnerText(node).length; 1643 | 1644 | var haveToRemove = 1645 | (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || 1646 | (!isList && li > p) || 1647 | (input > Math.floor(p / 3)) || 1648 | (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || 1649 | (!isList && weight < 25 && linkDensity > 0.2) || 1650 | (weight >= 25 && linkDensity > 0.5) || 1651 | ((embedCount === 1 && contentLength < 75) || embedCount > 1); 1652 | return haveToRemove; 1653 | } 1654 | return false; 1655 | }); 1656 | }, 1657 | 1658 | /** 1659 | * Clean out elements whose id/class combinations match specific string. 1660 | * 1661 | * @param Element 1662 | * @param RegExp match id/class combination. 1663 | * @return void 1664 | **/ 1665 | _cleanMatchedNodes: function (e, regex) { 1666 | var endOfSearchMarkerNode = this._getNextNode(e, true); 1667 | var next = this._getNextNode(e); 1668 | while (next && next != endOfSearchMarkerNode) { 1669 | if (regex.test(next.className + " " + next.id)) { 1670 | next = this._removeAndGetNext(next); 1671 | } else { 1672 | next = this._getNextNode(next); 1673 | } 1674 | } 1675 | }, 1676 | 1677 | /** 1678 | * Clean out spurious headers from an Element. Checks things like classnames and link density. 1679 | * 1680 | * @param Element 1681 | * @return void 1682 | **/ 1683 | _cleanHeaders: function (e) { 1684 | for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { 1685 | this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { 1686 | return this._getClassWeight(header) < 0; 1687 | }); 1688 | } 1689 | }, 1690 | 1691 | _flagIsActive: function (flag) { 1692 | return (this._flags & flag) > 0; 1693 | }, 1694 | 1695 | _removeFlag: function (flag) { 1696 | this._flags = this._flags & ~flag; 1697 | }, 1698 | 1699 | _isProbablyVisible: function (node) { 1700 | return node.style.display != "none" && !node.hasAttribute("hidden"); 1701 | }, 1702 | 1703 | /** 1704 | * Decides whether or not the document is reader-able without parsing the whole thing. 1705 | * 1706 | * @return boolean Whether or not we suspect parse() will suceeed at returning an article object. 1707 | */ 1708 | isProbablyReaderable: function (helperIsVisible) { 1709 | var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); 1710 | 1711 | // Get