├── .gitignore ├── package.sh ├── misc ├── readability-module.js ├── readability-feedflock.js └── readability-ori.js ├── test.txt ├── LICENSE.txt ├── test ├── bugs │ ├── htmlparser.js │ └── jsdom-bug.js ├── clean-file.js ├── grab-pages.rb ├── clean-proxy.js ├── weird-pages │ └── w3c-css-no-closing-head.html └── nytime.html ├── README.md ├── package.json ├── notes.txt └── lib └── sprintf.js /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | log/*.log 3 | dist/* 4 | *.tmproj 5 | 6 | -------------------------------------------------------------------------------- /package.sh: -------------------------------------------------------------------------------- 1 | NAME=node-readability 2 | tar -zcf ./dist/readability.tgz -C .. --exclude=".*" --exclude="test*" $NAME/lib $NAME/LICENSE.txt $NAME/README.md $NAME/package.json -------------------------------------------------------------------------------- /misc/readability-module.js: -------------------------------------------------------------------------------- 1 | exports.parse = parse; 2 | var jsdom = require('jsdom'); 3 | var rdom = require('./readability-my2.js'); 4 | var util = require('util'); 5 | 6 | function parse(html, url, callback) { 7 | //util.debug(html); 8 | var doc = jsdom.jsdom(html, null, {url: url}); 9 | util.log('---DOM created'); 10 | var win = doc.parentWindow; 11 | if (!doc.body) { 12 | console.log('empty body'); 13 | return callback({title: '', content: ''}); 14 | } 15 | 16 | rdom.start(win, function(html) { 17 | //console.log(html); 18 | callback({title: document.title, content: html}); 19 | }); 20 | } -------------------------------------------------------------------------------- /test.txt: -------------------------------------------------------------------------------- 1 | http://127.0.0.1:3000/?url=http%3A%2F%2Fwww.bbc.co.uk%2Fukchina%2Fsimp%2Fentertainment%2F2010%2F11%2F101103_ent_harrypotter.shtml 2 | 3 | 
http://127.0.0.1:3000/?url=http://en.wikipedia.org/wiki/Ruby 4 | http://127.0.0.1:3000/?url=http://buzz.blogger.com/2010/10/safe-browsing-on-blogger.html 5 | http://127.0.0.1:3000/?url=http://www.ifanr.com/24614 6 | http://127.0.0.1:3000/?url=http://www.boston.com/news/politics/articles/2010/11/03/patrick_roars_to_a_2d_term/ 7 | 8 | 9 | problems: 10 | slow 11 | http://127.0.0.1:3000/?url=http://www.gazeta.ru/news/lastnews/ 12 | http://127.0.0.1:3000/?url=http://www.sqlite.org/fts3.html 13 | http://127.0.0.1:3000/?url=http://news.google.com.hk/nwshp?hl=zh-tw&tab=in 14 | 15 | returned html cannot be parsed by browser 16 | http://blog.zacharyvoase.com/2010/11/11/sockets-and-nodes-i/ 17 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Arrix Zhou 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /test/bugs/htmlparser.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var jsdom = require('jsdom'); 3 | 4 | var url = 'http://www.w3.org/TR/css3-2d-transforms/'; 5 | request({uri:url}, function (error, response, body) { 6 | var html = body; 7 | var doc = jsdom.jsdom(html, null, {url: url}); 8 | console.log(doc.head+''); //[ HEAD ] 9 | console.log(doc.body === null); //true 10 | console.log(doc.head.childNodes[9].tagName); //BODY 11 | }); 12 | 13 | var doc = jsdom.jsdom(html, null, {url: ''}); 14 | 15 | 16 | var HTML5 = require('html5'); 17 | var fs = require('fs'); 18 | var content = fs.readFileSync('test/css.html', 'utf-8'); 19 | var html = content; 20 | var jsdom = require('jsdom'); 21 | var browser = jsdom.browserAugmentation(jsdom.defaultLevel); 22 | 23 | var doc = new browser.HTMLDocument(); 24 | var parser = new HTML5.Parser({document: doc}); 25 | parser.parse(html); 26 | 27 | var doc2 = jsdom.jsdom(html, null, {parser: HTML5}); 28 | 29 | 30 | 31 | 32 | var htmlparser = require("htmlparser"); 33 | var handler = new htmlparser.DefaultHandler(function (error, dom) { 34 | 35 | }); 36 | var parser = new htmlparser.Parser(handler); 37 | parser.parseComplete(html); 38 | sys.puts(sys.inspect(handler.dom, false, null)); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # node-readability 2 | [Readability.js by Arc90](http://lab.arc90.com/experiments/readability/) ported to node.js. 
3 | 4 | Blog post: [Server side readability with node.js](http://arrix.blogspot.com/2010/11/server-side-readability-with-nodejs.html) 5 | ## Requirements 6 | * [node.js](http://nodejs.org/) 7 | * [jsdom](https://github.com/tmpvar/jsdom) 8 | * [htmlparser](https://github.com/tautologistics/node-htmlparser) 9 | 10 | ## Live demo 11 | I'm working on it... 12 | ## Example 13 | 14 | var readability = require('readability'); 15 | //... 16 | // This is a very early example. The API is subject to change. 17 | readability.parse(html, url, function(result) { 18 | console.log(result.title, result.content); 19 | }); 20 | 21 | ## Performance 22 | In my testing of 140 pages with an average size of **58KB** collected from [digg](http://digg.com/news.rss), [delicious](http://feeds.delicious.com/v2/rss/?count=50) and [hacker news](http://news.ycombinator.com/rss), the average time taken for each page is about **1.1 seconds** on a Mac Mini (2.4G Intel Core 2 Duo). 23 | ## Limitation 24 | * no fetching next pages 25 | * no support for frames 26 | 27 | ## Plan 28 | * Performance optimization 29 | * Better API, more options 30 | * Support more readability features -------------------------------------------------------------------------------- /test/bugs/jsdom-bug.js: -------------------------------------------------------------------------------- 1 | // jsdom bug: Live NodeList isn't updated after DOM manipulation 2 | // node.js v0.2.4 3 | // jsdom@0.1.20 4 | // https://github.com/tmpvar/jsdom/issues/#issue/77 5 | 6 | var jsdom = require('jsdom'); 7 | var html = ' 

'; 8 | var window = jsdom.jsdom(html).createWindow(); 9 | var document = window.document; 10 | 11 | var all = document.getElementsByTagName('*'); 12 | var i = 2; 13 | var node = all[i]; 14 | console.log(''+node); //P#p1 15 | node.parentNode.removeChild(node); 16 | 17 | console.log(''+all[i]); //still P#p1. the live NodeList wasn't updated properly 18 | all.length; //trigger a refresh. the length getter calls update() 19 | console.log(''+all[i]); //P#p2 OK 20 | 21 | 22 | // innerHTML = '' doesn't removed all children 23 | // https://github.com/tmpvar/jsdom/issues/#issue/80 24 | (function() { 25 | var jsdom = require('jsdom'); 26 | var html = '

'; 27 | var doc = jsdom.jsdom(html); 28 | var win = doc.createWindow(); 29 | var b = doc.body; 30 | b.innerHTML = ''; 31 | console.log(b.innerHTML); //

32 | 33 | var arr = [0, 1, 2, 3, 4, 5]; 34 | arr.forEach(function(v, i) { 35 | console.log('[', i, '] ==', v); 36 | arr.splice(i, 1); 37 | }); 38 | // output 39 | // [ 0 ] == 0 40 | // [ 1 ] == 2 41 | // [ 2 ] == 4 42 | 43 | })(); 44 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "readability", 3 | "version": "0.1.0", 4 | "description": "Arc90's readability.js adapted to node.js", 5 | "keywords": [ 6 | "readability" 7 | ], 8 | "maintainers": [ 9 | { 10 | "name": "Arrix", 11 | "email": "arrixzhou@gmail.com", 12 | "web": "http://arrix.blogspot.com" 13 | } 14 | ], 15 | "contributors": [ 16 | { 17 | "name": "Arrix", 18 | "email": "arrixzhou@gmail.com", 19 | "web": "http://arrix.blogspot.com" 20 | }, 21 | { 22 | "name": "Vincent Cao", 23 | "email": "caojunvincent@gmail.com" 24 | } 25 | ], 26 | "bugs": { 27 | "mail": "arrixzhou@gmail.com", 28 | "web": "http://github.com/arrix/node-readability/issues" 29 | }, 30 | "licenses": [ 31 | { 32 | "type": "MIT", 33 | "url": "http://github.com/arrix/node-readability/LICENSE.txt" 34 | } 35 | ], 36 | "repositories": [ 37 | { 38 | "type": "git", 39 | "url": "http://github.com/arrix/node-readability.git" 40 | } 41 | ], 42 | "dependencies": { 43 | "mjsunit.runner": ">=0.1.0", 44 | "jsdom": ">=0.1.21", 45 | "htmlparser": ">=1.7.3" 46 | }, 47 | "engines" : { "node" : ">=0.2.5" }, 48 | "directories": { 49 | "lib": "lib" 50 | }, 51 | "main": "./lib/readability" 52 | } 53 | 54 | -------------------------------------------------------------------------------- /test/clean-file.js: -------------------------------------------------------------------------------- 1 | var http = require('http'), 2 | url_mod = require('url'), 3 | fs = require('fs'); 4 | 5 | var readability = require('../lib/readability.js'), 6 | sprintf = readability.sprintf; 7 | 8 | function cleanFile(path, url, cb) { 9 | var 
content = fs.readFileSync(path, 'utf-8'); 10 | readability.parse(content, url, {removeReadabilityArtifacts: false, removeClassNames: false, debug: true, profile: 1}, cb); 11 | } 12 | if (1) { 13 | cleanFile(__dirname + '/weird-pages/w3c-css-no-closing-head.html', '', function(info) { 14 | //console.log(info.content); 15 | }); 16 | 17 | return; 18 | } 19 | 20 | function batch_run() { 21 | var dir = __dirname + '/pages/'; 22 | var files = fs.readdirSync(dir); 23 | var results = []; 24 | //files.length = 10; 25 | files.forEach(function(f) { 26 | if (!/\.html/i.test(f)) return; 27 | console.log('######## Processing file...', f); 28 | cleanFile(dir + f, '', function(result) { 29 | results.push({time: result.time, file: f, inputLength: result.inputLength, error: result.error}); 30 | }); 31 | }); 32 | 33 | var total = 0, totalTime = 0; 34 | results.filter(function(v) {return !v.error}).sort(function(a, b) {return a.time - b.time;}).forEach(function(r) { 35 | total++; 36 | totalTime += r.time; 37 | console.log(sprintf('%5.2f\t%8d\t%10s', r.time, r.inputLength, r.file)); 38 | }); 39 | console.log('total:', total, "avg time:", totalTime/total); 40 | } 41 | 42 | batch_run(); 43 | -------------------------------------------------------------------------------- /test/grab-pages.rb: -------------------------------------------------------------------------------- 1 | require 'open-uri' 2 | require 'rexml/document' 3 | require 'fileutils' 4 | 5 | module Program 6 | class << self 7 | def fetch_feed(url) 8 | content = nil 9 | open(url) do |f| 10 | content = f.read 11 | end 12 | content 13 | end 14 | 15 | def fetch_digg_feed 16 | url = 'http://services.digg.com/2.0/story.getTopNews?type=rss' 17 | content = fetch_feed(url) 18 | content.force_encoding('iso-8859-1'); 19 | content.encode('utf-8') 20 | end 21 | 22 | def fetch_hackernews_feed 23 | url = 'http://news.ycombinator.com/rss' 24 | fetch_feed(url) 25 | end 26 | 27 | def fetch_delicious_feed 28 | url = 
'http://feeds.delicious.com/v2/rss/?count=30' 29 | fetch_feed(url) 30 | end 31 | 32 | def parse_rss(feed) 33 | xml = REXML::Document.new(feed) 34 | xml.elements.each("//item") do |item| 35 | link = item.get_elements('link')[0].text.strip 36 | title = item.get_elements('title')[0].text.strip 37 | yield link, title 38 | end 39 | end 40 | 41 | def run 42 | dir = File.expand_path('../pages', __FILE__) 43 | FileUtils.mkdir(dir) unless File.exists? dir 44 | 45 | [fetch_digg_feed, fetch_hackernews_feed, fetch_delicious_feed].each do |feed| 46 | parse_rss(feed) do |url, title| 47 | filename = title.gsub(/\W/, '_') + '.html' 48 | filepath = File.join(dir, filename) 49 | puts "fetching #{url} as #{filepath}" 50 | puts `curl --connect-timeout=5 #{url} > #{filepath} &` 51 | sleep 1 52 | end 53 | end 54 | end 55 | 56 | end 57 | end 58 | 59 | if __FILE__ == $0 60 | Program.run 61 | end 62 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | # live NodeList 2 | NodeLists returned by node.childNodes and getElementsByXxx() apis are live which means changes to the DOM tree will be reflected in the NodeList when accessed. 3 | 4 | In jsdom's implementation, a live NodeList is updated when item() or length is accessed but not when the [index] is accessed. 5 | In a live NodeList iteration, you must carefully call list.update() (or just list.length) to trigger an update. 6 | Beware that NodeList update is very expensive! When possible, prefer DOM traversal over getElementsByXxx(); 7 | 8 | If no changes will be made to the subtree, it is a good idea to iterate over an Array. 9 | var arr = nodeList.toArray(); //toArray() is not in the standards 10 | var arr = Array.prototype.slice.call(nodeList); 11 | 12 | # nodeList._length 13 | WRONG: In jsdom the length getter property of a NodeList calls .update() which re-queries against the DOM tree. 
In a read only loop, it is more efficient to access ._length instead of .length. 14 | var nodes = ele.getElementsByTagName('div'), i, len; 15 | for (i = 0, len = nodes._length; i < len; i++) { 16 | //does not change the dom structure 17 | } 18 | childNodes._length may not be up to date!!!!! 19 | 20 | # .textContent 21 | readability.getInnerText is a very frequently used function. My optimization for it reduced the total running time by half. 22 | // hundredfold faster 23 | // use native string.trim 24 | // jsdom's implementation of textContent is innerHTML + strip tags + HTMLDecode 25 | // here we replace it with an optimized tree walker 26 | 27 | # cleanStyles 28 | cleanStyles is recursive, it accounts for most of the running time of prepArticle 29 | 30 | # security 31 | arbitrary js 32 | frames 33 | 34 | # performance 35 | grep TOTAL clean.log|cut -d ' ' -f5|sort -n 36 | 37 | irb> 38 | s = <' + 134 | title + '' + 135 | content + 136 | ''; 137 | } 138 | }); -------------------------------------------------------------------------------- /misc/readability-feedflock.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | //require.paths.unshift('./vendor'); 3 | var sys = require('sys'); 4 | var jsdom = require('jsdom'); 5 | //var htmlparser = require('./htmlparser'); 6 | //var level = jsdom.defaultLevel; 7 | // var doc = new (level.Document)(); 8 | // doc.createWindow = function() { 9 | // window = jsdom.windowAugmentation(level, { document: doc, parser: htmlparser }) 10 | // delete window.document.createWindow 11 | // return window 12 | // }; 13 | // var document = doc.createWindow().document; 14 | 15 | var document; 16 | var Client = { 17 | parse: function(content, callback) { 18 | document = jsdom.jsdom(content).createWindow().document; 19 | //document.innerHTML = content; 20 | //console.log(document.body); 21 | if (!document.body) { 22 | callback({content:'',title:''}); 23 | return; 24 | } 25 | 26 | // Replace all 
doubled-up
tags with

tags, and remove fonts. 27 | var pattern = new RegExp ("
[ \r\n\s]*
", "g"); 28 | document.body.innerHTML = document.body.innerHTML.replace(pattern, "

").replace(/<\/?font[^>]*>/g, ''); 29 | 30 | var allParagraphs = document.getElementsByTagName("p"); 31 | var contentDiv = null; 32 | var topDivParas =[]; 33 | 34 | var articleContent = document.createElement("DIV"); 35 | var articleTitle = document.title 36 | 37 | if (articleTitle) 38 | articleTitle = articleTitle.replace(/^\s+|\s+$/g, ''); 39 | 40 | // Study all the paragraphs and find the chunk that has the best score. 41 | // A score is determined by things like: Number of

's, commas, special classes, etc. 42 | for (var j=0; j < allParagraphs.length; j++) { 43 | var parentNode = allParagraphs[j].parentNode; 44 | 45 | if(typeof(parentNode) != 'undefined') { 46 | // Initialize readability data 47 | if(typeof parentNode.readability == 'undefined') 48 | { 49 | parentNode.readability = {"contentScore": 0}; 50 | 51 | // Look for a special classname 52 | if(parentNode.className.match(/(comment|meta|footer|footnote)/)) 53 | parentNode.readability.contentScore -= 50; 54 | else if(parentNode.className.match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/)) 55 | parentNode.readability.contentScore += 25; 56 | 57 | // Look for a special ID 58 | if(parentNode.id.match(/(comment|meta|footer|footnote)/)) 59 | parentNode.readability.contentScore -= 50; 60 | else if(parentNode.id.match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/)) 61 | parentNode.readability.contentScore += 25; 62 | } 63 | 64 | // Add a point for the paragraph found 65 | if(this.getInnerText(allParagraphs[j]).length > 10) 66 | parentNode.readability.contentScore++; 67 | 68 | // Add points for any commas within this paragraph 69 | parentNode.readability.contentScore += this.getCharCount(allParagraphs[j]); 70 | 71 | topDivParas.push({ 'node': parentNode, 'score': parentNode.readability.contentScore }); 72 | } 73 | } 74 | 75 | for (var i=0; i < topDivParas.length; i++) { 76 | var score = topDivParas[i].score; 77 | if (contentDiv == null || score > contentDiv.score) { 78 | contentDiv = { 'node': topDivParas[i].node, 'score': score } 79 | } 80 | } 81 | 82 | if (contentDiv == null) 83 | return callback({ content: '', title: '' }); 84 | 85 | var topDiv = contentDiv.node 86 | 87 | this.cleanStyles(topDiv); // Removes all style attributes 88 | topDiv = this.killDivs(topDiv); // Goes in and removes DIV's that have more non

stuff than

stuff 89 | topDiv = this.killBreaks(topDiv); // Removes any consecutive
's into just one
90 | 91 | // Cleans out junk from the topDiv just in case: 92 | topDiv = this.clean(topDiv, "form"); 93 | topDiv = this.clean(topDiv, "object"); 94 | topDiv = this.clean(topDiv, "table", 250); 95 | topDiv = this.clean(topDiv, "h1"); 96 | topDiv = this.clean(topDiv, "h2"); 97 | topDiv = this.clean(topDiv, "iframe"); 98 | 99 | articleContent.appendChild(topDiv); 100 | 101 | return callback({ content: articleContent.innerHTML, title: articleTitle }); 102 | }, 103 | getInnerText: function(e) { 104 | return e.textContent; 105 | }, 106 | getCharCount: function( e,s ) { 107 | s = s || ","; 108 | return this.getInnerText(e).split(s).length; 109 | }, 110 | cleanStyles: function( e ) { 111 | e = e || document; 112 | var cur = e.firstChild; 113 | 114 | // If we had a bad node, there's not much we can do. 115 | if(!e) 116 | return; 117 | 118 | // Remove any root styles, if we're able. 119 | if(typeof e.removeAttribute == 'function') 120 | e.removeAttribute('style'); 121 | 122 | // Go until there are no more child nodes 123 | while ( cur != null ) { 124 | if ( cur.nodeType == 1 ) { 125 | // Remove style attribute(s) : 126 | cur.removeAttribute("style"); 127 | this.cleanStyles( cur ); 128 | } 129 | cur = cur.nextSibling; 130 | } 131 | }, 132 | killDivs: function ( e ) { 133 | var divsList = e.getElementsByTagName( "div" ); 134 | var curDivLength = divsList.length; 135 | 136 | // Gather counts for other typical elements embedded within. 137 | // Traverse backwards so we can remove nodes at the same time without effecting the traversal. 138 | for (var i=curDivLength-1; i >= 0; i--) { 139 | var p = divsList[i].getElementsByTagName("p").length; 140 | var img = divsList[i].getElementsByTagName("img").length; 141 | var li = divsList[i].getElementsByTagName("li").length; 142 | var a = divsList[i].getElementsByTagName("a").length; 143 | var embed = divsList[i].getElementsByTagName("embed").length; 144 | 145 | // If the number of commas is less than 10 (bad sign) ... 
146 | if ( this.getCharCount(divsList[i]) < 10) { 147 | // And the number of non-paragraph elements is more than paragraphs 148 | // or other ominous signs : 149 | if ( img > p || li > p || a > p || p == 0 || embed > 0) { 150 | divsList[i].parentNode.removeChild(divsList[i]); 151 | } 152 | } 153 | } 154 | return e; 155 | }, 156 | killBreaks: function ( e ) { 157 | e.innerHTML = e.innerHTML.replace(/((\s| ?)*){1,}/g,'
'); 158 | return e; 159 | }, 160 | clean: function(e, tags, minWords) { 161 | var targetList = e.getElementsByTagName( tags ); 162 | minWords = minWords || 1000000; 163 | 164 | for (var y=0; y < targetList.length; y++) { 165 | // If the text content isn't laden with words, remove the child: 166 | if (this.getCharCount(targetList[y], " ") < minWords) { 167 | targetList[y].parentNode.removeChild(targetList[y]); 168 | } 169 | } 170 | return e; 171 | } 172 | }; 173 | exports.Client = Client; 174 | })(); -------------------------------------------------------------------------------- /lib/sprintf.js: -------------------------------------------------------------------------------- 1 | /** 2 | sprintf() for JavaScript 0.7-beta1 3 | http://www.diveintojavascript.com/projects/javascript-sprintf 4 | 5 | Copyright (c) Alexandru Marasteanu 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | * Neither the name of sprintf() for JavaScript nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL Alexandru Marasteanu BE LIABLE FOR ANY 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | Changelog: 32 | 2010.09.06 - 0.7-beta1 33 | - features: vsprintf, support for named placeholders 34 | - enhancements: format cache, reduced global namespace pollution 35 | 36 | 2010.05.22 - 0.6: 37 | - reverted to 0.4 and fixed the bug regarding the sign of the number 0 38 | Note: 39 | Thanks to Raphael Pigulla (http://www.n3rd.org/) 40 | who warned me about a bug in 0.5, I discovered that the last update was 41 | a regress. I appologize for that. 42 | 43 | 2010.05.09 - 0.5: 44 | - bug fix: 0 is now preceeded with a + sign 45 | - bug fix: the sign was not at the right position on padded results (Kamal Abdali) 46 | - switched from GPL to BSD license 47 | 48 | 2007.10.21 - 0.4: 49 | - unit test and patch (David Baird) 50 | 51 | 2007.09.17 - 0.3: 52 | - bug fix: no longer throws exception on empty paramenters (Hans Pufal) 53 | 54 | 2007.09.11 - 0.2: 55 | - feature: added argument swapping 56 | 57 | 2007.04.03 - 0.1: 58 | - initial release 59 | **/ 60 | 61 | var sprintf = (function() { 62 | function get_type(variable) { 63 | return Object.prototype.toString.call(variable).slice(8, -1).toLowerCase(); 64 | } 65 | function str_repeat(input, multiplier) { 66 | for (var output = []; multiplier > 0; output[--multiplier] = input) {/* do nothing */} 67 | return output.join(''); 68 | } 69 | 70 | var str_format = function() { 71 | if (!str_format.cache.hasOwnProperty(arguments[0])) { 72 | str_format.cache[arguments[0]] = 
str_format.parse(arguments[0]); 73 | } 74 | return str_format.format.call(null, str_format.cache[arguments[0]], arguments); 75 | }; 76 | 77 | str_format.format = function(parse_tree, argv) { 78 | var cursor = 1, tree_length = parse_tree.length, node_type = '', arg, output = [], i, k, match, pad, pad_character, pad_length; 79 | for (i = 0; i < tree_length; i++) { 80 | node_type = get_type(parse_tree[i]); 81 | if (node_type === 'string') { 82 | output.push(parse_tree[i]); 83 | } 84 | else if (node_type === 'array') { 85 | match = parse_tree[i]; // convenience purposes only 86 | if (match[2]) { // keyword argument 87 | arg = argv[cursor]; 88 | for (k = 0; k < match[2].length; k++) { 89 | if (!arg.hasOwnProperty(match[2][k])) { 90 | throw(sprintf('[sprintf] property "%s" does not exist', match[2][k])); 91 | } 92 | arg = arg[match[2][k]]; 93 | } 94 | } 95 | else if (match[1]) { // positional argument (explicit) 96 | arg = argv[match[1]]; 97 | } 98 | else { // positional argument (implicit) 99 | arg = argv[cursor++]; 100 | } 101 | 102 | if (/[^s]/.test(match[8]) && (get_type(arg) != 'number')) { 103 | throw(sprintf('[sprintf] expecting number but found %s', get_type(arg))); 104 | } 105 | switch (match[8]) { 106 | case 'b': arg = arg.toString(2); break; 107 | case 'c': arg = String.fromCharCode(arg); break; 108 | case 'd': arg = parseInt(arg, 10); break; 109 | case 'e': arg = match[7] ? arg.toExponential(match[7]) : arg.toExponential(); break; 110 | case 'f': arg = match[7] ? parseFloat(arg).toFixed(match[7]) : parseFloat(arg); break; 111 | case 'o': arg = arg.toString(8); break; 112 | case 's': arg = ((arg = String(arg)) && match[7] ? arg.substring(0, match[7]) : arg); break; 113 | case 'u': arg = Math.abs(arg); break; 114 | case 'x': arg = arg.toString(16); break; 115 | case 'X': arg = arg.toString(16).toUpperCase(); break; 116 | } 117 | arg = (/[def]/.test(match[8]) && match[3] && arg >= 0 ? '+'+ arg : arg); 118 | pad_character = match[4] ? match[4] == '0' ? 
'0' : match[4].charAt(1) : ' '; 119 | pad_length = match[6] - String(arg).length; 120 | pad = match[6] ? str_repeat(pad_character, pad_length) : ''; 121 | output.push(match[5] ? arg + pad : pad + arg); 122 | } 123 | } 124 | return output.join(''); 125 | }; 126 | 127 | str_format.cache = {}; 128 | 129 | str_format.parse = function(fmt) { 130 | var _fmt = fmt, match = [], parse_tree = [], arg_names = 0; 131 | while (_fmt) { 132 | if ((match = /^[^\x25]+/.exec(_fmt)) !== null) { 133 | parse_tree.push(match[0]); 134 | } 135 | else if ((match = /^\x25{2}/.exec(_fmt)) !== null) { 136 | parse_tree.push('%'); 137 | } 138 | else if ((match = /^\x25(?:([1-9]\d*)\$|\(([^\)]+)\))?(\+)?(0|'[^$])?(-)?(\d+)?(?:\.(\d+))?([b-fosuxX])/.exec(_fmt)) !== null) { 139 | if (match[2]) { 140 | arg_names |= 1; 141 | var field_list = [], replacement_field = match[2], field_match = []; 142 | if ((field_match = /^([a-z_][a-z_\d]*)/i.exec(replacement_field)) !== null) { 143 | field_list.push(field_match[1]); 144 | while ((replacement_field = replacement_field.substring(field_match[0].length)) !== '') { 145 | if ((field_match = /^\.([a-z_][a-z_\d]*)/i.exec(replacement_field)) !== null) { 146 | field_list.push(field_match[1]); 147 | } 148 | else if ((field_match = /^\[(\d+)\]/.exec(replacement_field)) !== null) { 149 | field_list.push(field_match[1]); 150 | } 151 | else { 152 | throw('[sprintf] huh?'); 153 | } 154 | } 155 | } 156 | else { 157 | throw('[sprintf] huh?'); 158 | } 159 | match[2] = field_list; 160 | } 161 | else { 162 | arg_names |= 2; 163 | } 164 | if (arg_names === 3) { 165 | throw('[sprintf] mixing positional and named placeholders is not (yet) supported'); 166 | } 167 | parse_tree.push(match); 168 | } 169 | else { 170 | throw('[sprintf] huh?'); 171 | } 172 | _fmt = _fmt.substring(match[0].length); 173 | } 174 | return parse_tree; 175 | }; 176 | 177 | return str_format; 178 | })(); 179 | 180 | var vsprintf = function(fmt, argv) { 181 | argv.unshift(fmt); 182 | return 
sprintf.apply(null, argv); 183 | }; 184 | 185 | exports.sprintf = sprintf; 186 | exports.vsprintf = vsprintf; -------------------------------------------------------------------------------- /test/weird-pages/w3c-css-no-closing-head.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | CSS 2D Transforms Module Level 3 7 | 8 | 9 | 14 | 16 | 17 | 18 |

19 |

W3C 21 | 22 |

CSS 2D Transforms Module Level 3

23 | 24 |

W3C Working Draft 01 December 25 | 2009

26 | 27 |
28 |
This version: 29 | 30 |
32 | http://www.w3.org/TR/2009/WD-css3-2d-transforms-20091201 33 | 34 |
Latest version: 35 | 36 |
http://www.w3.org/TR/css3-2d-transforms 38 | 39 | 40 |
Previous version: 41 | 42 |
43 | http://www.w3.org/TR/2009/WD-css3-2d-transforms-20090320 44 | 45 |
Editors: 46 | 47 |
Dean Jackson (Apple Inc) 49 | 50 |
David Hyatt (Apple Inc) 52 | 53 |
Chris Marrin (Apple Inc) 55 |
56 | 57 | 73 | 74 |
75 |
76 | 77 |

Abstract

78 | 79 |

CSS 2D Transforms allows elements rendered by CSS to be transformed in 80 | two-dimensional space. 81 | 82 |

Status of this document

83 | 84 | 85 |

This section describes the status of this document at the time of 86 | its publication. Other documents may supersede this document. A list of 87 | current W3C publications and the latest revision of this technical report 88 | can be found in the W3C technical reports 89 | index at http://www.w3.org/TR/. 90 | 91 |

Publication as a Working Draft does not imply endorsement by the W3C 92 | Membership. This is a draft document and may be updated, replaced or 93 | obsoleted by other documents at any time. It is inappropriate to cite this 94 | document as other than work in progress. 95 | 96 |

The (archived) public 98 | mailing list www-style@w3.org (see 99 | instructions) is preferred 100 | for discussion of this specification. When sending e-mail, please put the 101 | text “css3-2d-transforms” in the subject, preferably like 102 | this: “[css3-2d-transforms] …summary of 103 | comment…” 104 | 105 |

This document was produced by the CSS Working Group (part of 107 | the Style Activity). 108 | 109 |

This document was produced by a group operating under the 5 February 111 | 2004 W3C Patent Policy. W3C maintains a public list of any patent disclosures made in 114 | connection with the deliverables of the group; that page also includes 115 | instructions for disclosing a patent. An individual who has actual 116 | knowledge of a patent which the individual believes contains Essential 118 | Claim(s) must disclose the information in accordance with section 120 | 6 of the W3C Patent Policy.

121 | 122 | 123 |

The list of changes made to this specification is 124 | available. 125 | 126 |

Table of contents

127 | 128 | 129 | 171 | 172 | 173 |

1. Introduction

174 | 175 |

This section is not normative. 176 | 177 |

The CSS visual 178 | formatting model describes a coordinate system within which each 179 | element is positioned. Positions and sizes in this coordinate space can be 180 | thought of as being expressed in pixels, starting in the upper left corner 181 | of the parent with positive values proceeding to the right and down. 182 | 183 |

This coordinate space can be modified with the transform property. Using 186 | transform, elements can be translated, rotated and scaled in two 187 | dimensional space. The coordinate space behaves as described in the coordinate 189 | system transformations section of the SVG 1.1 specification. This is a 190 | coordinate system with two axes: the X axis increases horizontally to the 191 | right; the Y axis increases vertically downwards. 192 | 193 |

Specifying a value other than ‘none’ for the transform property establishes a 197 | new local coordinate system at the element that it is applied to. 198 | Transformations are cumulative. That is, elements establish their local 199 | coordinate system within the coordinate system of their parent. In this 200 | way, a transform property effectively 202 | accumulates all the transform properties of its 204 | ancestors. The accumulation of these transforms defines a current 205 | transformation matrix (CTM) for the element. 206 | 207 |

The transform property does not affect the flow of the content 208 | surrounding the transformed element. However, the value of the overflow 209 | area takes into account transformed elements. This behavior is similar to 210 | what happens when elements are translated via relative positioning. 211 | Therefore, if the value of the overflow property is scroll 214 | or auto, scrollbars will appear as needed 216 | to see content that is transformed outside the visible area. 217 | 218 |

Any value other than ‘none’ for 219 | the transform results in the creation of both a stacking context and a 220 | containing block. The object acts as a containing block for fixed 221 | positioned descendants. 222 | 223 |

Need to go into more detail here about why fixed 224 | positioned objects should do this, i.e., that it's much harder to 225 | implement otherwise.
226 | 227 |
There are two roles for transformations in layout: (1) 228 | transformations that adjust the position of the affected content without 229 | changing the normal layout of that content (much like relative 230 | positioning) and (2) transformation of the content prior to layout that 231 | affects the layout of that content. See http://lists.w3.org/Archives/Public/www-style/2007Oct/0209 233 | for examples of both cases. The "transform" property (as defined in this 234 | document) is equally useful for both roles. This document is focused on 235 | satisfying the first role. There is, however, an architectural question 236 | that arises because there needs to be a way to distinguish which role an 237 | author of a stylesheet wants. The key question is which is the default 238 | behavior/role for the "transform" property and how is the other 239 | behavior/role indicated by a stylesheet author. If you have an opinion on 240 | this topic, please send feedback.
241 | 242 |
What do fixed backgrounds do in transforms? They should 243 | probably ignore the transform completely, since - even transformed - the 244 | object should be acting as "porthole" through which the fixed background 245 | can be viewed in its original form.
246 | 247 |
This property should also be applicable to SVG elements.
248 | 249 |
We also need to specify that SVG transforms *do* combine 250 | with this transform, e.g., if a <foreignObject> is inside 251 | transformed SVG and then defines a transform of its own. This means we may 252 | potentially have to examine the current SVG transform and combine with it 253 | to set the correct transform.
254 | 255 | 256 |

2. The transform Property

259 | 260 |

A two-dimensional transformation is applied to an element through the 261 | transform property. This property 263 | contains a list of transform functions. 264 | The final transformation value for an element is obtained by performing a 265 | matrix concatenation of each entry in the list. The set of transform 266 | functions is similar to those allowed by SVG. 267 | 268 | 269 | 270 | 271 | 276 | 281 | 286 | 291 | 296 | 301 | 306 |
Name: 272 | 273 | transform 274 | 275 |
Value: 277 | 278 | none | <transform-function> [ <transform-function> ]* 279 | 280 |
Initial: 282 | 283 | none 284 | 285 |
Applies to: 287 | 288 | block-level and inline-level elements 289 | 290 |
Inherited: 292 | 293 | no 294 | 295 |
Percentages: 297 | 298 | refer to the size of the element's box 299 | 300 |
Media: 302 | 303 | visual 304 | 305 |
Computed value: 307 | 308 | Same as specified value. 309 |
310 | 311 | 312 |

3. The transform-origin 315 | Property

316 | 317 |

The transform-origin 319 | property establishes the origin of transformation for an element. This 320 | property is applied by first translating the element by the negated value 321 | of the property, then applying the element's transform, then translating 322 | by the property value. This effectively moves the desired transformation 323 | origin of the element to (0,0) in the local coordinate system, then 324 | applies the element's transform, then moves the element back to its 325 | original position. 326 | 327 |

If only one value is specified, the second value is assumed to be 328 | ‘center’. If at least one value is 329 | not a keyword, then the first value represents the horizontal position and 330 | the second represents the vertical position. Negative <percentage> 331 | and <length> values are allowed. 332 | 333 | 334 | 335 | 336 | 341 | 348 | 353 | 358 | 363 | 368 | 373 |
Name: 337 | 338 | transform-origin 339 | 340 |
Value: 342 | 343 | [ [ <percentage> | <length> | left | center | right ] [ 344 | <percentage> | <length> | top | center | bottom ]? ] | [ [ 345 | left | center | right ] || [ top | center | bottom ] ] 346 | 347 |
Initial: 349 | 350 | 50% 50% 351 | 352 |
Applies to: 354 | 355 | block-level and inline-level elements 356 | 357 |
Inherited: 359 | 360 | no 361 | 362 |
Percentages: 364 | 365 | refer to the size of the element's box 366 | 367 |
Media: 369 | 370 | visual 371 | 372 |
Computed value: 374 | 375 | For <length> the absolute value, otherwise a percentage 376 |
377 | 378 | 379 |

4. The Transformation 380 | Functions

381 | 382 |

The value of the transform 383 | property is a list of <transform-functions> applied in the order 384 | provided. The individual transform functions are separated by whitespace. 385 | The set of allowed transform functions is given below. In this list the 386 | type <translation-value> is defined as a <length> or 387 | <percentage> value, and the <angle> type is defined by CSS Values and Units. 389 | 390 |

391 |
matrix(<number>, <number>, 392 | <number>, <number>, <number>, <number>) 393 | 394 |
specifies a 2D transformation in the form of a transformation 396 | matrix of six values. matrix(a,b,c,d,e,f) is equivalent to applying the 398 | transformation matrix [a b c d e f]. 399 | 400 |
translate(<translation-value>[, 401 | <translation-value>]) 402 | 403 |
specifies a 2D 405 | translation by the vector [tx, ty], where tx is the first 406 | translation-value parameter and ty is the optional second 407 | translation-value parameter. If <ty> is not provided, ty 408 | has zero as a value. 409 | 410 |
translateX(<translation-value>) 411 | 412 |
specifies a translation 414 | by the given amount in the X direction. 415 | 416 |
translateY(<translation-value>) 417 | 418 |
specifies a translation 420 | by the given amount in the Y direction. 421 | 422 |
scale(<number>[, <number>]) 423 | 424 | 425 |
specifies a 2D scale 427 | operation by the [sx,sy] scaling vector described by the 2 parameters. If 428 | the second parameter is not provided, it takes a value equal to the 429 | first. 430 | 431 |
scaleX(<number>) 432 | 433 |
specifies a scale operation using the [sx,1] scaling vector, where sx 434 | is given as the parameter. 435 | 436 |
scaleY(<number>) 437 | 438 |
specifies a scale operation using the [1,sy] scaling vector, where sy 439 | is given as the parameter. 440 | 441 |
rotate(<angle>) 442 | 443 |
specifies a 2D 445 | rotation by the angle specified in the parameter about the origin of 446 | the element, as defined by the transform-origin property. 448 | 449 |
skewX(<angle>) 450 | 451 |
specifies a skew 453 | transformation along the X axis by the given angle. 454 | 455 |
skewY(<angle>) 456 | 457 |
specifies a skew 459 | transformation along the Y axis by the given angle. 460 | 461 |
skew(<angle> [, <angle>]) 462 | 463 |
specifies a skew 465 | transformation along the X and Y axes. The first angle parameter 466 | specifies the skew on the X axis. The second angle parameter specifies 467 | the skew on the Y axis. If the second parameter is not given then a value 468 | of 0 is used for the Y angle (i.e., no skew on the Y axis). 469 |
470 | 471 |

5. Transform Values and 472 | Lists

473 | 474 |

The <translation-value> values are defined as [<percentage> 475 | | <length>]. All other value types are described as CSS types. 477 | If a list of transforms is provided, then the net effect is as if each 478 | transform had been specified separately in the order provided. For 479 | example, 480 | 481 |

 482 |   <div style="transform:translate(-10px,-20px) scale(2) rotate(45deg) translate(5px,10px)"/>
 483 |   
484 | 485 |

is functionally equivalent to: 486 | 487 |

 488 |   <div style="transform:translate(-10px,-20px)">
 489 |     <div style="transform:scale(2)">
 490 |       <div style="transform:rotate(45deg)">
 491 |         <div style="transform:translate(5px,10px)">
 492 |         </div>
 493 |       </div>
 494 |     </div>
 495 |   </div>
 496 |   
497 | 498 |
499 |
 500 |   div {
 501 |       transform: translate(100px, 100px);
 502 |   }
 503 |   
504 | Move the element by 100 pixels in both the X and Y directions. 505 |
The 100px translation in X and Y
507 |
508 | 509 |
510 |
 511 |   div {
 512 |       height: 100px; width: 100px;
 513 |       transform: translate(80px, 80px) scale(1.5, 1.5) rotate(45deg);
 514 |   }
 515 |   
516 | Move the element by 80 pixels in both the X and Y directions, then scale 517 | the element by 150%, then rotate it 45 degrees clockwise about the Z axis. 518 | Note that the scale and rotate operate about the center of the element, 519 | since the element has the default transform-origin of 50% 50%. 520 |
The transform specified above
522 |
523 | 524 | 525 |

6. Transitions and animations 526 | between transform values

527 | 528 |

When animating or transitioning the value of a transform property the 529 | rules described below are applied. The ‘from’ transform is the transform at the start 531 | of the transition or current keyframe. The ‘end’ transform is the transform at the end of 533 | the transition or current keyframe. 534 | 535 |

605 | 606 |

In some cases, an animation might cause a transformation matrix to be 607 | singular or non-invertible. For example, an animation in which scale moves 608 | from 1 to -1. At the time when the matrix is in such a state, the 609 | transformed element is not rendered. 610 | 611 |

7. Matrix 612 | decomposition for animation

613 | 614 |

When interpolating between 2 matrices, each is decomposed into the 615 | corresponding translation, rotation, scale, skew, and perspective values. 616 | Not all matrices can be accurately described by these values. Those that 617 | can't are decomposed into the most accurate representation possible, using 618 | the technique below. This technique is taken from the "unmatrix" method in 619 | "Graphics Gems II", edited by Jim Arvo. The pseudocode below works on a 620 | 4x4 homogeneous matrix. A 3x2 2D matrix is therefore first converted to 621 | 4x4 homogeneous form. 622 | 623 |

 624 |   Input: matrix       ; a 4x4 matrix
 625 |   Output: translation ; a 3 component vector
 626 |           rotation    ; Euler angles, represented as a 3 component vector
 627 |           scale       ; a 3 component vector
 628 |           skew        ; skew factors XY,XZ,YZ represented as a 3 component vector
 629 |           perspective ; a 4 component vector
 630 |   Returns false if the matrix cannot be decomposed, true if it can
 631 | 
 632 |     Supporting functions (point is a 3 component vector, matrix is a 4x4 matrix):
 633 |       float  determinant(matrix)          returns the 4x4 determinant of the matrix
 634 |       matrix inverse(matrix)              returns the inverse of the passed matrix
 635 |       matrix transpose(matrix)            returns the transpose of the passed matrix
 636 |       point  multVecMatrix(point, matrix) multiplies the passed point by the passed matrix 
 637 |                                           and returns the transformed point
 638 |       float  length(point)                returns the length of the passed vector
 639 |       point  normalize(point)             normalizes the length of the passed point to 1
 640 |       float  dot(point, point)            returns the dot product of the passed points
 641 |       float  cos(float)                   returns the cosine of the passed angle in radians
 642 |       float  asin(float)                  returns the arcsine in radians of the passed value
 643 |       float  atan2(float y, float x)      returns the principal value of the arc tangent of 
 644 |                                           y/x, using the signs of both arguments to determine 
 645 |                                           the quadrant of the return value
 646 | 
 647 |     Decomposition also makes use of the following function:
 648 |       point combine(point a, point b, float ascl, float bscl)
 649 |           result[0] = (ascl * a[0]) + (bscl * b[0])
 650 |           result[1] = (ascl * a[1]) + (bscl * b[1])
 651 |           result[2] = (ascl * a[2]) + (bscl * b[2])
 652 |           return result
 653 | 
 654 | 
 655 |     // Normalize the matrix.
 656 |     if (matrix[3][3] == 0)
 657 |         return false
 658 | 
 659 |     for (i = 0; i < 4; i++)
 660 |         for (j = 0; j < 4; j++)
 661 |             matrix[i][j] /= matrix[3][3]
 662 | 
 663 |     // perspectiveMatrix is used to solve for perspective, but it also provides
 664 |     // an easy way to test for singularity of the upper 3x3 component.
 665 |     perspectiveMatrix = matrix
 666 | 
 667 |     for (i = 0; i < 3; i++)
 668 |         perspectiveMatrix[i][3] = 0
 669 | 
 670 |     perspectiveMatrix[3][3] = 1
 671 | 
 672 |     if (determinant(perspectiveMatrix) == 0)
 673 |         return false
 674 | 
 675 |     // First, isolate perspective.
 676 |     if (matrix[0][3] != 0 || matrix[1][3] != 0 || matrix[2][3] != 0)
 677 |         // rightHandSide is the right hand side of the equation.
 678 |         rightHandSide[0] = matrix[0][3];
 679 |         rightHandSide[1] = matrix[1][3];
 680 |         rightHandSide[2] = matrix[2][3];
 681 |         rightHandSide[3] = matrix[3][3];
 682 | 
 683 |         // Solve the equation by inverting perspectiveMatrix and multiplying
 684 |         // rightHandSide by the inverse.
 685 |         inversePerspectiveMatrix = inverse(perspectiveMatrix)
 686 |         transposedInversePerspectiveMatrix = transpose(inversePerspectiveMatrix)
 687 |         perspective = multVecMatrix(rightHandSide, transposedInversePerspectiveMatrix)
 688 | 
 689 |          // Clear the perspective partition
 690 |         matrix[0][3] = matrix[1][3] = matrix[2][3] = 0
 691 |         matrix[3][3] = 1
 692 |     else
 693 |         // No perspective.
 694 |         perspective[0] = perspective[1] = perspective[2] = 0
 695 |         perspective[3] = 1
 696 | 
 697 |     // Next take care of translation
 698 |     translation[0] = matrix[3][0]
 699 |     matrix[3][0] = 0
 700 |     translation[1] = matrix[3][1]
 701 |     matrix[3][1] = 0
 702 |     translation[2] = matrix[3][2]
 703 |     matrix[3][2] = 0
 704 | 
 705 |     // Now get scale and shear. 'row' is a 3 element array of 3 component vectors
 706 |     for (i = 0; i < 3; i++)
 707 |         row[i][0] = matrix[i][0]
 708 |         row[i][1] = matrix[i][1]
 709 |         row[i][2] = matrix[i][2]
 710 | 
 711 |     // Compute X scale factor and normalize first row.
 712 |     scale[0] = length(row[0])
 713 |     row[0] = normalize(row[0])
 714 | 
 715 |     // Compute XY shear factor and make 2nd row orthogonal to 1st.
 716 |     skew[0] = dot(row[0], row[1])
 717 |     row[1] = combine(row[1], row[0], 1.0, -skew[0])
 718 | 
 719 |     // Now, compute Y scale and normalize 2nd row.
 720 |     scale[1] = length(row[1])
 721 |     row[1] = normalize(row[1])
 722 |     skew[0] /= scale[1];
 723 | 
 724 |     // Compute XZ and YZ shears, orthogonalize 3rd row
 725 |     skew[1] = dot(row[0], row[2])
 726 |     row[2] = combine(row[2], row[0], 1.0, -skew[1])
 727 |     skew[2] = dot(row[1], row[2])
 728 |     row[2] = combine(row[2], row[1], 1.0, -skew[2])
 729 | 
 730 |     // Next, get Z scale and normalize 3rd row.
 731 |     scale[2] = length(row[2])
 732 |     row[2] = normalize(row[2])
 733 |     skew[1] /= scale[2]
 734 |     skew[2] /= scale[2]
 735 | 
 736 |     // At this point, the matrix (in rows) is orthonormal.
 737 |     // Check for a coordinate system flip.  If the determinant
 738 |     // is -1, then negate the matrix and the scaling factors.
 739 |     pdum3 = cross(row[1], row[2])
 740 |     if (dot(row[0], pdum3) < 0)
 741 |         for (i = 0; i < 3; i++)
 742 |             scale[i] *= -1;
 743 |             row[i][0] *= -1
 744 |             row[i][1] *= -1
 745 |             row[i][2] *= -1
 746 | 
 747 |     // Now, get the rotations out
 748 |     rotation[1] = asin(-row[0][2]);
 749 |     if (cos(rotation[1]) != 0)
 750 |        rotation[0] = atan2(row[1][2], row[2][2]);
 751 |        rotation[2] = atan2(row[0][1], row[0][0]);
 752 |     else
 753 |        rotation[0] = atan2(-row[2][0], row[1][1]);
 754 |        rotation[2] = 0;
 755 | 
 756 |     return true;
 757 |     
758 | 759 |

Each component of each returned value is linearly interpolated with the 760 | corresponding component of the other matrix. The resulting components are 761 | then recomposed into a final matrix as though combining the following 762 | transform functions: 763 | 764 |

 765 |         matrix3d(1,0,0,0, 0,1,0,0, 0,0,1,0, perspective[0], perspective[1], perspective[2], perspective[3])
 766 |         translate3d(translation[0], translation[1], translation[2])
 767 |         rotateX(rotation[0]) rotateY(rotation[1]) rotateZ(rotation[2])
 768 |         matrix3d(1,0,0,0, 0,1,0,0, 0,skew[2],1,0, 0,0,0,1)
 769 |         matrix3d(1,0,0,0, 0,1,0,0, skew[1],0,1,0, 0,0,0,1)
 770 |         matrix3d(1,0,0,0, skew[0],1,0,0, 0,0,1,0, 0,0,0,1)
 771 |         scale3d(scale[0], scale[1], scale[2])
 772 |       
773 | 774 |

8. DOM Interfaces

775 | 776 |

This section describes the interfaces and functionality added to the 777 | DOM to support runtime access to the functionality described above. 778 | 779 |

8.1. CSSMatrix

780 | 781 |
782 |
Interface CSSMatrix 784 | 785 |
786 |

The CSSMatrix interface represents a 4x4 homogeneous 787 | matrix.

788 | 789 |
790 |
IDL Definition 791 | 792 |
793 |
794 |
 795 |   interface CSSMatrix {
 796 |       attribute float a;
 797 |       attribute float b;
 798 |       attribute float c;
 799 |       attribute float d;
 800 |       attribute float e;
 801 |       attribute float f;
 802 | 
 803 |       void        setMatrixValue(in DOMString string) raises(DOMException);
 804 |       CSSMatrix   multiply(in CSSMatrix secondMatrix);
 805 |       CSSMatrix   multiplyLeft(in CSSMatrix secondMatrix);
 806 |       CSSMatrix   inverse() raises(DOMException);
 807 |       CSSMatrix   translate(in float x, in float y);
 808 |       CSSMatrix   scale(in float scaleX, in float scaleY);
 809 |       CSSMatrix   skew(in float angleX, in float angleY);
 810 |       CSSMatrix   rotate(in float angle);
 811 |   };
812 |
813 |
814 |
815 | 816 | 817 |
Attributes 818 | 819 |
820 |
821 |
a-f of type float 823 | 824 |
Each of these attributes represents one of the values in the 3x2 825 | matrix.
826 |
827 |
828 | 829 | 830 |
Methods 831 | 832 |
833 |
834 | 835 |
setMatrixValue 837 | 838 |
839 |
The setMatrixValue method replaces 840 | the existing matrix with one computed from parsing the passed string 841 | as though it had been assigned to the transform property in a CSS 842 | style rule. 843 |
Parameters 844 |
845 |
846 |
string of type 847 | DOMString 848 | 849 |
The string to parse.
850 |
851 |
852 |
853 | 854 |
No Return Value
855 | 856 |
Exceptions 857 |
858 |
859 |
DOMException SYNTAX_ERR 860 | 861 |
Thrown when the provided string can not be parsed into a 862 | CSSMatrix. 863 |
864 |
865 |
866 |
867 |
868 | 869 | 870 | 871 |
multiply 873 | 874 |
875 |
The multiply method returns a new 876 | CSSMatrix which is the result of this matrix multiplied by the 877 | passed matrix, with the passed matrix to the right. This matrix is 878 | not modified. 879 |
Parameters 880 |
881 |
882 |
secondMatrix of type 883 | CSSMatrix 884 | 885 |
The matrix to multiply.
886 |
887 |
888 |
889 | 890 |
Return Value 891 |
892 |
893 |
CSSMatrix 894 | 895 |
The result matrix.
896 |
897 |
898 |
899 | 900 |
No Exceptions
901 |
902 |
903 | 904 | 905 | 906 |
multiplyLeft 908 | 909 |
910 |
The multiplyLeft method returns a new 911 | CSSMatrix which is the result of this matrix multiplied by the 912 | passed matrix, with the passed matrix to the left. This matrix is 913 | not modified. 914 |
Parameters 915 |
916 |
917 |
secondMatrix of type 918 | CSSMatrix 919 | 920 |
The matrix to multiply.
921 |
922 |
923 |
924 | 925 |
Return Value 926 |
927 |
928 |
CSSMatrix 929 | 930 |
The result matrix.
931 |
932 |
933 |
934 | 935 |
No Exceptions
936 |
937 |
938 | 939 | 940 | 941 |
inverse 943 | 944 |
945 |
The inverse method returns a new 946 | matrix which is the inverse of this matrix. This matrix is not 947 | modified. 948 |
No Parameters
949 | 950 |
Return Value 951 |
952 |
953 |
CSSMatrix 954 | 955 |
The inverted matrix.
956 |
957 |
958 |
959 | 960 |
Exceptions 961 |
962 |
963 |
DOMException NOT_SUPPORTED_ERR 964 | 965 |
Thrown when the CSSMatrix can not be inverted. 966 |
967 |
968 |
969 |
970 | 971 |
972 | 973 | 974 | 975 |
translate 977 | 978 |
979 |
The translate method returns a new 980 | matrix which is this matrix post multiplied by a translation matrix 981 | containing the passed values. This matrix is not modified. 982 |
Parameters 983 |
984 |
985 |
x of type 986 | float 987 | 988 |
The X component of the translation value.
989 | 990 |
y of type 991 | float 992 | 993 |
The Y component of the translation value.
994 |
995 |
996 |
997 | 998 |
Return Value 999 |
1000 |
1001 |
CSSMatrix 1002 | 1003 |
The result matrix.
1004 |
1005 |
1006 |
1007 | 1008 |
No Exceptions
1009 |
1010 | 1011 |
1012 | 1013 | 1014 | 1015 |
scale 1017 | 1018 |
1019 |
The scale method returns a new matrix 1020 | which is this matrix post multiplied by a scale matrix containing 1021 | the passed values. If the y component is undefined, the x component 1022 | value is used in its place. This matrix is not modified. 1023 |
Parameters 1024 |
1025 |
1026 |
scaleX of type 1027 | float 1028 | 1029 |
The X component of the scale value.
1030 | 1031 |
scaleY of type 1032 | float 1033 | 1034 |
The (optional) Y component of the scale value.
1035 |
1036 |
1037 |
1038 | 1039 |
Return Value 1040 |
1041 |
1042 |
CSSMatrix 1043 | 1044 |
The result matrix.
1045 |
1046 |
1047 |
1048 | 1049 |
No Exceptions
1050 |
1051 | 1052 |
1053 | 1054 | 1055 | 1056 |
rotate 1058 | 1059 |
1060 |
The rotate method returns a new 1061 | matrix which is this matrix post multiplied by a rotation matrix. 1062 | The rotation value is in degrees. This matrix is not modified. 1063 |
Parameters 1064 |
1065 |
1066 |
angle of type 1067 | float 1068 | 1069 |
The angle of rotation.
1070 |
1071 |
1072 |
1073 | 1074 |
Return Value 1075 |
1076 |
1077 |
CSSMatrix 1078 | 1079 |
The result matrix.
1080 |
1081 |
1082 |
1083 | 1084 |
No Exceptions
1085 |
1086 | 1087 |
1088 | 1089 | 1090 | 1091 |
skew 1093 | 1094 |
1095 |
The skew method returns a new matrix 1096 | which is this matrix post multiplied by a skew matrix. The skew 1097 | angles are in degrees. This matrix is not modified. 1098 |
Parameters 1099 |
1100 |
1101 |
angleX of type 1102 | float 1103 | 1104 |
The angle of skew along the X axis.
1105 | 1106 |
angleY of type 1107 | float 1108 | 1109 |
The angle of skew along the Y axis.
1110 |
1111 |
1112 |
1113 | 1114 |
Return Value 1115 |
1116 |
1117 |
CSSMatrix 1118 | 1119 |
The result matrix.
1120 |
1121 |
1122 |
1123 | 1124 |
No Exceptions
1125 |
1126 | 1127 |
1128 | 1129 |
1130 | 1131 |
1132 |
1133 | 1134 |
1135 | 1136 |

In addition to the interface listed above, the 1137 | getComputedStyle method of the Window object has 1138 | been updated. The transform property 1139 | of the style object returned by getComputedStyle contains a 1140 | DOMString of the form "matrix(a, b, c, d, e, f)" representing the 3x2 1141 | matrix that is the result of applying the individual functions listed in 1142 | the transform property. 1143 | 1144 |

9. References

1145 | 1146 |

Normative references

1147 | 1148 | 1149 | 1150 |
1151 |
1152 | 1153 |
1154 | 1155 | 1156 |

Other references

1157 | 1158 | 1159 | 1160 |
1161 |
1162 | 1163 |
1164 | 1165 | 1166 |

Property index

1167 | 1168 | 1169 | 1170 | 1171 | 1172 | 1187 | 1188 | 1203 |
Property 1173 | 1174 | Values 1175 | 1176 | Initial 1177 | 1178 | Applies to 1179 | 1180 | Inh. 1181 | 1182 | Percentages 1183 | 1184 | Media 1185 | 1186 |
transform 1189 | 1190 | none | <transform-function> [ <transform-function> ]* 1191 | 1192 | none 1193 | 1194 | block-level and inline-level elements 1195 | 1196 | no 1197 | 1198 | refer to the size of the element's box 1199 | 1200 | visual 1201 | 1202 |
transform-origin 1204 | 1205 | [ [ <percentage> | <length> | left | center | right ] [ 1206 | <percentage> | <length> | top | center | bottom ]? ] | [ [ 1207 | left | center | right ] || [ top | center | bottom ] ] 1208 | 1209 | 50% 50% 1210 | 1211 | block-level and inline-level elements 1212 | 1213 | no 1214 | 1215 | refer to the size of the element's box 1216 | 1217 | visual 1218 |
1219 | 1220 | 1221 |

Index

1222 | 1223 | 1224 | 1230 | 1231 | 1232 | 1239 | -------------------------------------------------------------------------------- /test/nytime.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | ‘Decision Points’ Tour Puts Bush in Spotlight - NYTimes.com 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 64 | 69 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 |
86 | 98 |
99 | 106 |
107 |
108 |
109 | 110 |
111 |
112 |
113 | 114 | 115 | 116 | 117 | 118 |
119 |
120 |
121 |
122 | 124 | 125 |
126 | 127 |

128 | 129 | 130 | Business Day 131 | 132 | Media & Advertising 133 |

134 | 135 |
136 | 170 | 198 | 199 | 200 |
201 |
202 |
203 |
204 | 205 |
206 |

With Book, Bush Is Back in Spotlight

207 | 208 | 209 | 245 |
246 |
247 |
248 |
    249 | 254 |
  • 255 | Print 256 |
  • 257 |
  • 258 | Single Page 259 |
  • 260 | 261 | 262 | 268 |
  • 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 |
    278 | Reprints 279 |
  • 280 |
    281 |
282 |
283 |
284 |
285 |
286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 |

296 | George W. Bush will end a self-imposed silence about his presidency in an NBC prime-time special on Monday, the eve of the release of his memoir, “Decision Points.” That the interviewer will be Matt Lauer, the co-host of the “Today” show, reveals calculations by Mr. Bush and his advisers, as well as a campaign by NBC.

297 |
298 |
299 | 300 | 301 |
302 | 308 |
Peter Kramer/NBC
309 |

Matt Lauer of “Today” interviewing George W. Bush in his first one-on-one interview since leaving the White House. The special will be shown on Monday at 8 p.m. Eastern time.

310 |
311 | 312 |
313 |

Related

314 | 322 |
323 |
324 |

Add to Portfolio

325 |
    326 | 327 | 328 |
  • CBS Corp
  • 329 | 330 |
331 |

Go to your Portfolio »

332 |
333 | 334 |
335 |
336 |

337 | In the past, the first interview of a controversial ex-president would be expected to go to the nation’s top evening news anchor, currently NBC’s Brian Williams. By choosing the top morning anchor instead, both sides are essentially endorsing the soft power of Matt Lauer.

338 | “He’s an extraordinarily fair interviewer,” said Jim Bell, the executive producer of “Today” and of the prime-time special. “We’re living in a time when some of television news is partisan, and Matt and the ‘Today’ show are decidedly not so.”

339 | That was a selling point for Mr. Bush and his advisers, who decided that “the first interview should be in a news context, with a network news anchor,” said David Drake, a senior vice president of Crown, the publisher of “Decision Points.”

340 | For NBC, the interview — which was taped over the course of two days in Texas late last month — is a major coup. “They talked about every subject under the sun,” said Steve Capus, the president of NBC News, who observed that Mr. Bush “has things he wants to get off his chest.”

341 | But critics of Mr. Bush — and there are many, with polls showing that most Americans still hold an unfavorable view of him — who would like to see a televised confrontation over issues like the Iraq war may come away disappointed. The tone of the prime-time special is conversational, not prosecutorial, and for that reason, “Lauer/Bush” is not likely to join “Frost/Nixon” in the public imagination.

342 | Mr. Bell pointedly called the special “a conversation with President Bush about his book,” not just his presidency. Many tough questions are asked, and the word “torture” is used, Mr. Bell emphasized, but it comes down to tone.

343 | Dana Perino, who was a White House press secretary while Mr. Bush was in office, said that tone was an important consideration for the TV book tour.

344 | “He’s not interested in having a debate about the policies,” Ms. Perino said of Mr. Bush. She elaborated later: “There’s been plenty of debates about the decisions he has made. Now he’s trying to explain what he was going through, and the conditions he was working under.”

345 | Doris Kearns Goodwin, the presidential historian, said Mr. Bush’s televised interview was not likely to deviate from the words in his memoir. But “there is some value in seeing his mood,” she said, including his attitude about the memoir itself. (Ms. Goodwin was a paid contributor to NBC until 2008.)

346 | To get the first shot at the Bush interview, each major television network pieced together its best proposal — a “package,” Mr. Drake said — and at least one other offered a prime-time special like NBC’s. He declined to share specifics, but said “it was a close decision.”

347 | The NBC interview is the start of a book tour like almost no other. Mr. Bush will sit down with Oprah Winfrey and Rush Limbaugh, as well as with all three prime-time hosts on the Fox News Channel this week. There will be print interviews, too, but the only one announced so far is with AARP The Magazine.

348 | Andrew Tyndall, who publishes a newsletter about the television news business, The Tyndall Report, said he suspected that Mr. Bush and his aides were striking a balance by selecting Mr. Lauer for the first interview. “On the one hand, you’re looking for comfort,” Mr. Tyndall said. “On the other hand, you don’t want the interview to be perceived as a series of softballs.”

349 | NBC executives privately agreed with that assessment, and said they thought that Mr. Bush would not have felt as comfortable with the network’s other top interviewers.

350 | Asked whether Mr. Williams or the “Meet the Press” moderator David Gregory pursued the interview, Mr. Capus said “I’m sure they did,” but that “Matt was the official push from NBC News and I’m thrilled that we got it.” Mr. Capus said that Mr. Lauer had a “rapport” with Mr. Bush in prior interviews.

351 | Along with comfort, audience size was important. “Today” is both the top-rated morning show and a highly sought-after outlet for authors.

354 |
355 |
356 | 357 | 358 |
359 | 360 |
361 |
370 | 371 |
372 |
373 |
374 |
375 |
376 |
377 |
    378 | 382 |
  • 383 | Print 384 |
  • 385 |
  • 386 | Single Page 387 |
  • 388 | 389 | 390 | 396 |
  • Reprints 397 |
  • 398 |
    399 |
400 |
401 | 402 |
403 |
404 |
405 | 408 |
409 |
410 | 411 | 417 | 418 | 419 |
420 |
Get Free E-mail Alerts on These Topics
421 |
422 | 423 | 424 | 425 | 426 | 448 |
449 |
450 |
451 |
452 |
453 |
454 | 455 | 456 | 457 |
458 |
459 | 461 | 467 |
468 |
469 |
470 | 471 |
472 |
473 |
474 | 475 | 476 |

MOST POPULAR - BUSINESS DAY

477 |
478 | 485 |
486 | 500 | 501 | 516 | 517 | 518 | 531 | 532 | 533 | 534 | 535 | 536 | 537 |
538 |
539 | 540 | 541 | 542 | 543 |
544 |
545 | 546 | 547 |
548 | 549 |
550 | 551 |

552 | Inside NYTimes.com

553 |
554 | 555 | 556 |
557 | 558 | 559 | 560 | 571 | 582 | 593 | 604 | 611 | 622 | 633 | 644 | 655 | 666 | 673 | 684 | 685 | 686 |
561 |
562 |
563 | Movies » 564 |
565 |
566 | Clayburgh’s Memorable ‘Unmarried Woman’ 567 |
568 |
Clayburgh’s Memorable ‘Unmarried Woman’
569 |
570 |
572 |
573 |
574 | Opinion » 575 |
576 |
577 | Disunion: Jamie Malanowski 578 |
579 |
Disunion: Jamie Malanowski
580 |
581 |
583 |
584 |
585 | Week in Review » 586 |
587 |
588 | Rightward, March: The Midterm Exit Polls 589 |
590 |
Rightward, March: The Midterm Exit Polls
591 |
592 |
594 |
595 |
596 | U.S. » 597 |
598 |
599 | Kindness of a Stranger That Still Resonates 600 |
601 |
Kindness of a Stranger That Still Resonates
602 |
603 |
605 |
606 |
Opinion »
607 |

The Stone: Speech and Harm

608 |

What is at the root of the power of slurs to cause unease, shock and pain?

609 |
610 |
612 |
613 |
614 | Business » 615 |
616 |
617 | High Hopes for Conan O’Brien’s Debut 618 |
619 |
High Hopes for Conan O’Brien’s Debut
620 |
621 |
687 |
688 | 689 |
690 | 691 |
692 | 756 |
757 |
758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 775 | 776 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | -------------------------------------------------------------------------------- /misc/readability-ori.js: -------------------------------------------------------------------------------- 1 | /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */ 2 | /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */ 3 | 4 | var dbg = (typeof console !== 'undefined') ? function(s) { 5 | console.log("Readability: " + s); 6 | } : function() {}; 7 | 8 | /* 9 | * Readability. An Arc90 Lab Experiment. 10 | * Website: http://lab.arc90.com/experiments/readability 11 | * Source: http://code.google.com/p/arc90labs-readability 12 | * 13 | * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 14 | * 15 | * Copyright (c) 2010 Arc90 Inc 16 | * Readability is licensed under the Apache License, Version 2.0. 17 | **/ 18 | var readability = { 19 | version: '1.7.1', 20 | emailSrc: 'http://lab.arc90.com/experiments/readability/email.php', 21 | iframeLoads: 0, 22 | convertLinksToFootnotes: false, 23 | reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 24 | frameHack: false, /** 25 | * The frame hack is to workaround a firefox bug where if you 26 | * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 27 | * So we fake a scrollbar in the wrapping div. 28 | **/ 29 | biggestFrame: false, 30 | bodyCache: null, /* Cache the body HTML in case we need to re-use it later */ 31 | flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. 
*/ 32 | 33 | /* constants */ 34 | FLAG_STRIP_UNLIKELYS: 0x1, 35 | FLAG_WEIGHT_CLASSES: 0x2, 36 | FLAG_CLEAN_CONDITIONALLY: 0x4, 37 | 38 | maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 39 | parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 40 | pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 41 | 42 | /** 43 | * All of the regular expressions in use within readability. 44 | * Defined up here so we don't instantiate them repeatedly in loops. 45 | **/ 46 | regexps: { 47 | unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, 48 | okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 49 | positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 50 | negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, 51 | extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, 52 | divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 53 | replaceBrs: /(]*>[ \n\r\t]*){2,}/gi, 54 | replaceFonts: /<(\/?)font[^>]*>/gi, 55 | trim: /^\s+|\s+$/g, 56 | normalize: /\s{2,}/g, 57 | killBreaks: /((\s| ?)*){1,}/g, 58 | videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 59 | skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, 60 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 61 | prevLink: /(prev|earl|old|new|<|«)/i 62 | }, 63 | 64 | /** 65 | * Runs readability. 66 | * 67 | * Workflow: 68 | * 1. Prep the document by removing script tags, css, etc. 
69 | * 2. Build readability's DOM tree. 70 | * 3. Grab the article content from the current dom tree. 71 | * 4. Replace the current DOM tree with the new one. 72 | * 5. Read peacefully. 73 | * 74 | * @return void 75 | **/ 76 | init: function() { 77 | /* Before we do anything, remove all scripts that are not readability. */ 78 | window.onload = window.onunload = function() {}; 79 | 80 | readability.removeScripts(document); 81 | 82 | if(document.body && !readability.bodyCache) { 83 | readability.bodyCache = document.body.innerHTML; 84 | 85 | } 86 | /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 87 | readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 88 | 89 | /* Pull out any possible next page link first */ 90 | var nextPageLink = readability.findNextPageLink(document.body); 91 | 92 | readability.prepDocument(); 93 | 94 | /* Build readability's DOM tree */ 95 | var overlay = document.createElement("DIV"); 96 | var innerDiv = document.createElement("DIV"); 97 | var articleTools = readability.getArticleTools(); 98 | var articleTitle = readability.getArticleTitle(); 99 | var articleContent = readability.grabArticle(); 100 | var articleFooter = readability.getArticleFooter(); 101 | 102 | if(!articleContent) { 103 | articleContent = document.createElement("DIV"); 104 | articleContent.id = "readability-content"; 105 | articleContent.innerHTML = [ 106 | "

Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.

", 107 | (readability.frameHack ? "

It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: " + readability.biggestFrame.src + "

" : ""), 108 | "

Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.

" 109 | ].join(''); 110 | 111 | nextPageLink = null; 112 | } 113 | 114 | overlay.id = "readOverlay"; 115 | innerDiv.id = "readInner"; 116 | 117 | /* Apply user-selected styling */ 118 | document.body.className = readStyle; 119 | document.dir = readability.getSuggestedDirection(articleTitle.innerHTML); 120 | 121 | if (readStyle === "style-athelas" || readStyle === "style-apertura"){ 122 | overlay.className = readStyle + " rdbTypekit"; 123 | } 124 | else { 125 | overlay.className = readStyle; 126 | } 127 | innerDiv.className = readMargin + " " + readSize; 128 | 129 | if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { 130 | readability.convertLinksToFootnotes = true; 131 | } 132 | 133 | /* Glue the structure of our document together. */ 134 | innerDiv.appendChild( articleTitle ); 135 | innerDiv.appendChild( articleContent ); 136 | innerDiv.appendChild( articleFooter ); 137 | overlay.appendChild( articleTools ); 138 | overlay.appendChild( innerDiv ); 139 | 140 | /* Clear the old HTML, insert the new content. */ 141 | document.body.innerHTML = ""; 142 | document.body.insertBefore(overlay, document.body.firstChild); 143 | document.body.removeAttribute('style'); 144 | 145 | if(readability.frameHack) 146 | { 147 | var readOverlay = document.getElementById('readOverlay'); 148 | readOverlay.style.height = '100%'; 149 | readOverlay.style.overflow = 'auto'; 150 | } 151 | 152 | /** 153 | * If someone tries to use Readability on a site's root page, give them a warning about usage. 154 | **/ 155 | if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) 156 | { 157 | articleContent.style.display = "none"; 158 | var rootWarning = document.createElement('p'); 159 | rootWarning.id = "readability-warning"; 160 | rootWarning.innerHTML = "Readability was intended for use on individual articles and not home pages. 
" + 161 | "If you'd like to try rendering this page anyway, click here to continue."; 162 | 163 | innerDiv.insertBefore( rootWarning, articleContent ); 164 | } 165 | 166 | readability.postProcessContent(articleContent); 167 | 168 | window.scrollTo(0, 0); 169 | 170 | /* If we're using the Typekit library, select the font */ 171 | if (readStyle === "style-athelas" || readStyle === "style-apertura") { 172 | readability.useRdbTypekit(); 173 | } 174 | 175 | if (nextPageLink) { 176 | /** 177 | * Append any additional pages after a small timeout so that people 178 | * can start reading without having to wait for this to finish processing. 179 | **/ 180 | window.setTimeout(function() { 181 | readability.appendNextPage(nextPageLink); 182 | }, 500); 183 | } 184 | 185 | /** Smooth scrolling **/ 186 | document.onkeydown = function(e) { 187 | var code = (window.event) ? event.keyCode : e.keyCode; 188 | if (code === 16) { 189 | readability.reversePageScroll = true; 190 | return; 191 | } 192 | 193 | if (code === 32) { 194 | readability.curScrollStep = 0; 195 | var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); 196 | 197 | if(readability.reversePageScroll) { 198 | readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); 199 | } 200 | else { 201 | readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); 202 | } 203 | 204 | return false; 205 | } 206 | }; 207 | 208 | document.onkeyup = function(e) { 209 | var code = (window.event) ? event.keyCode : e.keyCode; 210 | if (code === 16) { 211 | readability.reversePageScroll = false; 212 | return; 213 | } 214 | }; 215 | }, 216 | 217 | /** 218 | * Run any post-process modifications to article content as necessary. 
219 | * 220 | * @param Element 221 | * @return void 222 | **/ 223 | postProcessContent: function(articleContent) { 224 | if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { 225 | readability.addFootnotes(articleContent); 226 | } 227 | 228 | readability.fixImageFloats(articleContent); 229 | }, 230 | 231 | /** 232 | * Some content ends up looking ugly if the image is too large to be floated. 233 | * If the image is wider than a threshold (currently 55%), no longer float it, 234 | * center it instead. 235 | * 236 | * @param Element 237 | * @return void 238 | **/ 239 | fixImageFloats: function (articleContent) { 240 | var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, 241 | images = articleContent.getElementsByTagName('img'); 242 | 243 | for(var i=0, il = images.length; i < il; i+=1) { 244 | var image = images[i]; 245 | 246 | if(image.offsetWidth > imageWidthThreshold) { 247 | image.className += " blockImage"; 248 | } 249 | } 250 | }, 251 | 252 | /** 253 | * Get the article tools Element that has buttons like reload, print, email. 254 | * 255 | * @return void 256 | **/ 257 | getArticleTools: function () { 258 | var articleTools = document.createElement("DIV"); 259 | 260 | articleTools.id = "readTools"; 261 | articleTools.innerHTML = 262 | "Reload Original Page" + 263 | "Print Page" + 264 | "Email Page"; 265 | 266 | return articleTools; 267 | }, 268 | 269 | /** 270 | * retuns the suggested direction of the string 271 | * 272 | * @return "rtl" || "ltr" 273 | **/ 274 | getSuggestedDirection: function(text) { 275 | function sanitizeText() { 276 | return text.replace(/@\w+/, ""); 277 | } 278 | 279 | function countMatches(match) { 280 | var matches = text.match(new RegExp(match, "g")); 281 | return matches !== null ? 
matches.length : 0; 282 | } 283 | 284 | function isRTL() { 285 | var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 286 | var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 287 | 288 | // if 20% of chars are Hebrew or Arbic then direction is rtl 289 | return (count_heb + count_arb) * 100 / text.length > 20; 290 | } 291 | 292 | text = sanitizeText(text); 293 | return isRTL() ? "rtl" : "ltr"; 294 | }, 295 | 296 | 297 | /** 298 | * Get the article title as an H1. 299 | * 300 | * @return void 301 | **/ 302 | getArticleTitle: function () { 303 | var curTitle = "", 304 | origTitle = ""; 305 | 306 | try { 307 | curTitle = origTitle = document.title; 308 | 309 | if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 310 | curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); 311 | } 312 | } 313 | catch(e) {} 314 | 315 | if(curTitle.match(/ [\|\-] /)) 316 | { 317 | curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 318 | 319 | if(curTitle.split(' ').length < 3) { 320 | curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 321 | } 322 | } 323 | else if(curTitle.indexOf(': ') !== -1) 324 | { 325 | curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 326 | 327 | if(curTitle.split(' ').length < 3) { 328 | curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 329 | } 330 | } 331 | else if(curTitle.length > 150 || curTitle.length < 15) 332 | { 333 | var hOnes = document.getElementsByTagName('h1'); 334 | if(hOnes.length === 1) 335 | { 336 | curTitle = readability.getInnerText(hOnes[0]); 337 | } 338 | } 339 | 340 | curTitle = curTitle.replace( readability.regexps.trim, "" ); 341 | 342 | if(curTitle.split(' ').length <= 4) { 343 | curTitle = origTitle; 344 | } 345 | 346 | var articleTitle = document.createElement("H1"); 347 | articleTitle.innerHTML = curTitle; 348 | 349 | return articleTitle; 350 | }, 351 | 352 | /** 353 | * Get the footer with the readability mark etc. 
354 | * 355 | * @return void 356 | **/ 357 | getArticleFooter: function () { 358 | var articleFooter = document.createElement("DIV"); 359 | 360 | /** 361 | * For research purposes, generate an img src that contains the chosen readstyle etc, 362 | * so we can generate aggregate stats and change styles based on them in the future 363 | **/ 364 | // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize); 365 | /* TODO: attach this to an image */ 366 | 367 | articleFooter.id = "readFooter"; 368 | articleFooter.innerHTML = [ 369 | "", 370 | ""].join(''); 381 | 382 | return articleFooter; 383 | }, 384 | 385 | /** 386 | * Prepare the HTML document for readability to scrape it. 387 | * This includes things like stripping javascript, CSS, and handling terrible markup. 388 | * 389 | * @return void 390 | **/ 391 | prepDocument: function () { 392 | /** 393 | * In some cases a body element can't be found (if the HTML is totally hosed for example) 394 | * so we create a new body node and append it to the document. 395 | */ 396 | if(document.body === null) 397 | { 398 | var body = document.createElement("body"); 399 | try { 400 | document.body = body; 401 | } 402 | catch(e) { 403 | document.documentElement.appendChild(body); 404 | dbg(e); 405 | } 406 | } 407 | 408 | document.body.id = "readabilityBody"; 409 | 410 | var frames = document.getElementsByTagName('frame'); 411 | if(frames.length > 0) 412 | { 413 | var bestFrame = null; 414 | var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ 415 | var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. 
*/ 416 | for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) 417 | { 418 | var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; 419 | var canAccessFrame = false; 420 | try { 421 | var frameBody = frames[frameIndex].contentWindow.document.body; 422 | canAccessFrame = true; 423 | } 424 | catch(eFrames) { 425 | dbg(eFrames); 426 | } 427 | 428 | if(frameSize > biggestFrameSize) { 429 | biggestFrameSize = frameSize; 430 | readability.biggestFrame = frames[frameIndex]; 431 | } 432 | 433 | if(canAccessFrame && frameSize > bestFrameSize) 434 | { 435 | readability.frameHack = true; 436 | 437 | bestFrame = frames[frameIndex]; 438 | bestFrameSize = frameSize; 439 | } 440 | } 441 | 442 | if(bestFrame) 443 | { 444 | var newBody = document.createElement('body'); 445 | newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML; 446 | newBody.style.overflow = 'scroll'; 447 | document.body = newBody; 448 | 449 | var frameset = document.getElementsByTagName('frameset')[0]; 450 | if(frameset) { 451 | frameset.parentNode.removeChild(frameset); } 452 | } 453 | } 454 | 455 | /* Remove all stylesheets */ 456 | for (var k=0;k < document.styleSheets.length; k+=1) { 457 | if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { 458 | document.styleSheets[k].disabled = true; 459 | } 460 | } 461 | 462 | /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ 463 | var styleTags = document.getElementsByTagName("style"); 464 | for (var st=0;st < styleTags.length; st+=1) { 465 | styleTags[st].textContent = ""; 466 | } 467 | 468 | /* Turn all double br's into p's */ 469 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 470 | document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '

').replace(readability.regexps.replaceFonts, '<$1span>'); 471 | }, 472 | 473 | /** 474 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 475 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php 476 | * 477 | * @return void 478 | **/ 479 | addFootnotes: function(articleContent) { 480 | var footnotesWrapper = document.getElementById('readability-footnotes'), 481 | articleFootnotes = document.getElementById('readability-footnotes-list'); 482 | 483 | if(!footnotesWrapper) { 484 | footnotesWrapper = document.createElement("DIV"); 485 | footnotesWrapper.id = 'readability-footnotes'; 486 | footnotesWrapper.innerHTML = '

References

'; 487 | footnotesWrapper.style.display = 'none'; /* Until we know we have footnotes, don't show the references block. */ 488 | 489 | articleFootnotes = document.createElement('ol'); 490 | articleFootnotes.id = 'readability-footnotes-list'; 491 | 492 | footnotesWrapper.appendChild(articleFootnotes); 493 | 494 | var readFooter = document.getElementById('readFooter'); 495 | 496 | if(readFooter) { 497 | readFooter.parentNode.insertBefore(footnotesWrapper, readFooter); 498 | } 499 | } 500 | 501 | var articleLinks = articleContent.getElementsByTagName('a'); 502 | var linkCount = articleFootnotes.getElementsByTagName('li').length; 503 | for (var i = 0; i < articleLinks.length; i+=1) 504 | { 505 | var articleLink = articleLinks[i], 506 | footnoteLink = articleLink.cloneNode(true), 507 | refLink = document.createElement('a'), 508 | footnote = document.createElement('li'), 509 | linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, 510 | linkText = readability.getInnerText(articleLink); 511 | 512 | if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLink)) { 513 | continue; 514 | } 515 | 516 | linkCount+=1; 517 | 518 | /** Add a superscript reference after the article link */ 519 | refLink.href = '#readabilityFootnoteLink-' + linkCount; 520 | refLink.innerHTML = '[' + linkCount + ']'; 521 | refLink.className = 'readability-DoNotFootnote'; 522 | try { refLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */ 523 | 524 | if(articleLink.parentNode.lastChild === articleLink) { 525 | articleLink.parentNode.appendChild(refLink); 526 | } else { 527 | articleLink.parentNode.insertBefore(refLink, articleLink.nextSibling); 528 | } 529 | 530 | articleLink.name = 'readabilityLink-' + linkCount; 531 | try { articleLink.style.color = 'inherit'; } catch(err) {} /* IE7 doesn't like inherit. 
*/ 532 | 533 | footnote.innerHTML = "^ "; 534 | 535 | footnoteLink.innerHTML = (footnoteLink.title ? footnoteLink.title : linkText); 536 | footnoteLink.name = 'readabilityFootnoteLink-' + linkCount; 537 | 538 | footnote.appendChild(footnoteLink); 539 | footnote.innerHTML = footnote.innerHTML + " (" + linkDomain + ")"; 540 | 541 | articleFootnotes.appendChild(footnote); 542 | } 543 | 544 | if(linkCount > 0) { 545 | footnotesWrapper.style.display = 'block'; 546 | } 547 | }, 548 | 549 | useRdbTypekit: function () { 550 | var rdbHead = document.getElementsByTagName('head')[0]; 551 | var rdbTKScript = document.createElement('script'); 552 | var rdbTKCode = null; 553 | 554 | var rdbTKLink = document.createElement('a'); 555 | rdbTKLink.setAttribute('class','rdbTK-powered'); 556 | rdbTKLink.setAttribute('title','Fonts by Typekit'); 557 | rdbTKLink.innerHTML = "Fonts by Typekit"; 558 | 559 | if (readStyle === "style-athelas") { 560 | rdbTKCode = "sxt6vzy"; 561 | dbg("Using Athelas Theme"); 562 | 563 | rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas'); 564 | rdbTKLink.setAttribute('id','rdb-athelas'); 565 | document.getElementById("rdb-footer-right").appendChild(rdbTKLink); 566 | } 567 | if (readStyle === "style-apertura") { 568 | rdbTKCode = "bae8ybu"; 569 | dbg("Using Inverse Theme"); 570 | 571 | rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse'); 572 | rdbTKLink.setAttribute('id','rdb-inverse'); 573 | document.getElementById("rdb-footer-right").appendChild(rdbTKLink); 574 | } 575 | 576 | /** 577 | * Setting new script tag attributes to pull Typekits libraries 578 | **/ 579 | rdbTKScript.setAttribute('type','text/javascript'); 580 | rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js"); 581 | rdbTKScript.setAttribute('charset','UTF-8'); 582 | rdbHead.appendChild(rdbTKScript); 583 | 584 | /** 585 | * In the future, 
maybe try using the following experimental Callback function?: 586 | * http://gist.github.com/192350 587 | * & 588 | * http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function 589 | **/ 590 | var typekitLoader = function() { 591 | dbg("Looking for Typekit."); 592 | if(typeof Typekit !== "undefined") { 593 | try { 594 | dbg("Caught typekit"); 595 | Typekit.load(); 596 | clearInterval(window.typekitInterval); 597 | } catch(e) { 598 | dbg("Typekit error: " + e); 599 | } 600 | } 601 | }; 602 | 603 | window.typekitInterval = window.setInterval(typekitLoader, 100); 604 | }, 605 | 606 | /** 607 | * Prepare the article node for display. Clean out any inline styles, 608 | * iframes, forms, strip extraneous

tags, etc. 609 | * 610 | * @param Element 611 | * @return void 612 | **/ 613 | prepArticle: function (articleContent) { 614 | readability.cleanStyles(articleContent); 615 | readability.killBreaks(articleContent); 616 | 617 | /* Clean out junk from the article content */ 618 | readability.cleanConditionally(articleContent, "form"); 619 | readability.clean(articleContent, "object"); 620 | readability.clean(articleContent, "h1"); 621 | 622 | /** 623 | * If there is only one h2, they are probably using it 624 | * as a header and not a subheader, so remove it since we already have a header. 625 | ***/ 626 | if(articleContent.getElementsByTagName('h2').length === 1) { 627 | readability.clean(articleContent, "h2"); 628 | } 629 | readability.clean(articleContent, "iframe"); 630 | 631 | readability.cleanHeaders(articleContent); 632 | 633 | /* Do these last as the previous stuff may have removed junk that will affect these */ 634 | readability.cleanConditionally(articleContent, "table"); 635 | readability.cleanConditionally(articleContent, "ul"); 636 | readability.cleanConditionally(articleContent, "div"); 637 | 638 | /* Remove extra paragraphs */ 639 | var articleParagraphs = articleContent.getElementsByTagName('p'); 640 | for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 641 | var imgCount = articleParagraphs[i].getElementsByTagName('img').length; 642 | var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; 643 | var objectCount = articleParagraphs[i].getElementsByTagName('object').length; 644 | 645 | if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { 646 | articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); 647 | } 648 | } 649 | 650 | try { 651 | articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

topCandidate.readability.contentScore) { 852 | topCandidate = candidates[c]; } 853 | } 854 | 855 | /** 856 | * If we still have no top candidate, just use the body as a last resort. 857 | * We also have to copy the body node so it is something we can modify. 858 | **/ 859 | if (topCandidate === null || topCandidate.tagName === "BODY") 860 | { 861 | topCandidate = document.createElement("DIV"); 862 | topCandidate.innerHTML = page.innerHTML; 863 | page.innerHTML = ""; 864 | page.appendChild(topCandidate); 865 | readability.initializeNode(topCandidate); 866 | } 867 | 868 | /** 869 | * Now that we have the top candidate, look through its siblings for content that might also be related. 870 | * Things like preambles, content split by ads that we removed, etc. 871 | **/ 872 | var articleContent = document.createElement("DIV"); 873 | if (isPaging) { 874 | articleContent.id = "readability-content"; 875 | } 876 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 877 | var siblingNodes = topCandidate.parentNode.childNodes; 878 | 879 | 880 | for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { 881 | var siblingNode = siblingNodes[s]; 882 | var append = false; 883 | 884 | /** 885 | * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. 886 | * Example of error visible here: http://www.esquire.com/features/honesty0707 887 | **/ 888 | if(!siblingNode) { 889 | continue; 890 | } 891 | 892 | dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); 893 | dbg("Sibling has score " + (siblingNode.readability ? 
siblingNode.readability.contentScore : 'Unknown')); 894 | 895 | if(siblingNode === topCandidate) 896 | { 897 | append = true; 898 | } 899 | 900 | var contentBonus = 0; 901 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ 902 | if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { 903 | contentBonus += topCandidate.readability.contentScore * 0.2; 904 | } 905 | 906 | if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) 907 | { 908 | append = true; 909 | } 910 | 911 | if(siblingNode.nodeName === "P") { 912 | var linkDensity = readability.getLinkDensity(siblingNode); 913 | var nodeContent = readability.getInnerText(siblingNode); 914 | var nodeLength = nodeContent.length; 915 | 916 | if(nodeLength > 80 && linkDensity < 0.25) 917 | { 918 | append = true; 919 | } 920 | else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) 921 | { 922 | append = true; 923 | } 924 | } 925 | 926 | if(append) { 927 | dbg("Appending node: " + siblingNode); 928 | 929 | var nodeToAppend = null; 930 | if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { 931 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. 
*/ 932 | 933 | dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 934 | nodeToAppend = document.createElement("DIV"); 935 | try { 936 | nodeToAppend.id = siblingNode.id; 937 | nodeToAppend.innerHTML = siblingNode.innerHTML; 938 | } 939 | catch(er) { 940 | dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 941 | nodeToAppend = siblingNode; 942 | s-=1; 943 | sl-=1; 944 | } 945 | } else { 946 | nodeToAppend = siblingNode; 947 | s-=1; 948 | sl-=1; 949 | } 950 | 951 | /* To ensure a node does not interfere with readability styles, remove its classnames */ 952 | nodeToAppend.className = ""; 953 | 954 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ 955 | articleContent.appendChild(nodeToAppend); 956 | } 957 | } 958 | 959 | /** 960 | * So we have all of the content that we need. Now we clean it up for presentation. 961 | **/ 962 | readability.prepArticle(articleContent); 963 | 964 | if (readability.curPageNum === 1) { 965 | articleContent.innerHTML = '

' + articleContent.innerHTML + '
'; 966 | } 967 | 968 | /** 969 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. 970 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 971 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 972 | * finding the -right- content. 973 | **/ 974 | if(readability.getInnerText(articleContent, false).length < 250) { 975 | page.innerHTML = pageCacheHtml; 976 | 977 | if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 978 | readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 979 | return readability.grabArticle(page); 980 | } 981 | else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 982 | readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 983 | return readability.grabArticle(page); 984 | } 985 | else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 986 | readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 987 | return readability.grabArticle(page); 988 | } else { 989 | return null; 990 | } 991 | } 992 | 993 | return articleContent; 994 | }, 995 | 996 | /** 997 | * Removes script tags from the document. 998 | * 999 | * @param Element 1000 | **/ 1001 | removeScripts: function (doc) { 1002 | var scripts = doc.getElementsByTagName('script'); 1003 | for(var i = scripts.length-1; i >= 0; i-=1) 1004 | { 1005 | if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 1006 | { 1007 | scripts[i].nodeValue=""; 1008 | scripts[i].removeAttribute('src'); 1009 | if (scripts[i].parentNode) { 1010 | scripts[i].parentNode.removeChild(scripts[i]); 1011 | } 1012 | } 1013 | } 1014 | }, 1015 | 1016 | /** 1017 | * Get the inner text of a node - cross browser compatibly. 1018 | * This also strips out any excess whitespace to be found. 
1019 | * 1020 | * @param Element 1021 | * @return string 1022 | **/ 1023 | getInnerText: function (e, normalizeSpaces) { 1024 | var textContent = ""; 1025 | 1026 | if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") { 1027 | return ""; 1028 | } 1029 | 1030 | normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; 1031 | 1032 | if (navigator.appName === "Microsoft Internet Explorer") { 1033 | textContent = e.innerText.replace( readability.regexps.trim, "" ); } 1034 | else { 1035 | textContent = e.textContent.replace( readability.regexps.trim, "" ); } 1036 | 1037 | if(normalizeSpaces) { 1038 | return textContent.replace( readability.regexps.normalize, " "); } 1039 | else { 1040 | return textContent; } 1041 | }, 1042 | 1043 | /** 1044 | * Get the number of times a string s appears in the node e. 1045 | * 1046 | * @param Element 1047 | * @param string - what to split on. Default is "," 1048 | * @return number (integer) 1049 | **/ 1050 | getCharCount: function (e,s) { 1051 | s = s || ","; 1052 | return readability.getInnerText(e).split(s).length-1; 1053 | }, 1054 | 1055 | /** 1056 | * Remove the style attribute on every e and under. 1057 | * TODO: Test if getElementsByTagName(*) is faster. 1058 | * 1059 | * @param Element 1060 | * @return void 1061 | **/ 1062 | cleanStyles: function (e) { 1063 | e = e || document; 1064 | var cur = e.firstChild; 1065 | 1066 | if(!e) { 1067 | return; } 1068 | 1069 | // Remove any root styles, if we're able. 
1070 | if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') { 1071 | e.removeAttribute('style'); } 1072 | 1073 | // Go until there are no more child nodes 1074 | while ( cur !== null ) { 1075 | if ( cur.nodeType === 1 ) { 1076 | // Remove style attribute(s) : 1077 | if(cur.className !== "readability-styled") { 1078 | cur.removeAttribute("style"); 1079 | } 1080 | readability.cleanStyles( cur ); 1081 | } 1082 | cur = cur.nextSibling; 1083 | } 1084 | }, 1085 | 1086 | /** 1087 | * Get the density of links as a percentage of the content 1088 | * This is the amount of text that is inside a link divided by the total text in the node. 1089 | * 1090 | * @param Element 1091 | * @return number (float) 1092 | **/ 1093 | getLinkDensity: function (e) { 1094 | var links = e.getElementsByTagName("a"); 1095 | var textLength = readability.getInnerText(e).length; 1096 | var linkLength = 0; 1097 | for(var i=0, il=links.length; i 25) { 1209 | continue; 1210 | } 1211 | 1212 | /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ 1213 | var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1214 | if(!linkHrefLeftover.match(/\d/)) { 1215 | continue; 1216 | } 1217 | 1218 | if(!(linkHref in possiblePages)) { 1219 | possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; 1220 | } else { 1221 | possiblePages[linkHref].linkText += ' | ' + linkText; 1222 | } 1223 | 1224 | var linkObj = possiblePages[linkHref]; 1225 | 1226 | /** 1227 | * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 
1228 | * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1229 | **/ 1230 | if(linkHref.indexOf(articleBaseUrl) !== 0) { 1231 | linkObj.score -= 25; 1232 | } 1233 | 1234 | var linkData = linkText + ' ' + link.className + ' ' + link.id; 1235 | if(linkData.match(readability.regexps.nextLink)) { 1236 | linkObj.score += 50; 1237 | } 1238 | if(linkData.match(/pag(e|ing|inat)/i)) { 1239 | linkObj.score += 25; 1240 | } 1241 | if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1242 | /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ 1243 | if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1244 | linkObj.score -= 65; 1245 | } 1246 | } 1247 | if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) { 1248 | linkObj.score -= 50; 1249 | } 1250 | if(linkData.match(readability.regexps.prevLink)) { 1251 | linkObj.score -= 200; 1252 | } 1253 | 1254 | /* If a parentNode contains page or paging or paginat */ 1255 | var parentNode = link.parentNode, 1256 | positiveNodeMatch = false, 1257 | negativeNodeMatch = false; 1258 | while(parentNode) { 1259 | var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; 1260 | if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { 1261 | positiveNodeMatch = true; 1262 | linkObj.score += 25; 1263 | } 1264 | if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) { 1265 | /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. 
*/ 1266 | if(!parentNodeClassAndId.match(readability.regexps.positive)) { 1267 | linkObj.score -= 25; 1268 | negativeNodeMatch = true; 1269 | } 1270 | } 1271 | 1272 | parentNode = parentNode.parentNode; 1273 | } 1274 | 1275 | /** 1276 | * If the URL looks like it has paging in it, add to the score. 1277 | * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1278 | **/ 1279 | if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1280 | linkObj.score += 25; 1281 | } 1282 | 1283 | /* If the URL contains negative values, give a slight decrease. */ 1284 | if (linkHref.match(readability.regexps.extraneous)) { 1285 | linkObj.score -= 15; 1286 | } 1287 | 1288 | /** 1289 | * Minor punishment to anything that doesn't match our current URL. 1290 | * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. 1291 | * Dan, can you show me a counterexample where this is necessary? 1292 | * if (linkHref.indexOf(window.location.href) !== 0) { 1293 | * linkObj.score -= 1; 1294 | * } 1295 | **/ 1296 | 1297 | /** 1298 | * If the link text can be parsed as a number, give it a minor bonus, with a slight 1299 | * bias towards lower numbered pages. This is so that pages that might not have 'next' 1300 | * in their text can still get scored, and sorted properly by score. 1301 | **/ 1302 | var linkTextAsNumber = parseInt(linkText, 10); 1303 | if(linkTextAsNumber) { 1304 | // Punish 1 since we're either already there, or it's probably before what we want anyways. 1305 | if (linkTextAsNumber === 1) { 1306 | linkObj.score -= 10; 1307 | } 1308 | else { 1309 | // Todo: Describe this better 1310 | linkObj.score += Math.max(0, 10 - linkTextAsNumber); 1311 | } 1312 | } 1313 | } 1314 | 1315 | /** 1316 | * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. 
1317 | * Require at least a score of 50, which is a relatively high confidence that this page is the next link. 1318 | **/ 1319 | var topPage = null; 1320 | for(var page in possiblePages) { 1321 | if(possiblePages.hasOwnProperty(page)) { 1322 | if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { 1323 | topPage = possiblePages[page]; 1324 | } 1325 | } 1326 | } 1327 | 1328 | if(topPage) { 1329 | var nextHref = topPage.href.replace(/\/$/,''); 1330 | 1331 | dbg('NEXT PAGE IS ' + nextHref); 1332 | readability.parsedPages[nextHref] = true; 1333 | return nextHref; 1334 | } 1335 | else { 1336 | return null; 1337 | } 1338 | }, 1339 | 1340 | /** 1341 | * Build a simple cross browser compatible XHR. 1342 | * 1343 | * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk. 1344 | **/ 1345 | xhr: function () { 1346 | if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { 1347 | return new XMLHttpRequest(); 1348 | } 1349 | else { 1350 | try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } 1351 | try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } 1352 | try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } 1353 | } 1354 | 1355 | return false; 1356 | }, 1357 | 1358 | successfulRequest: function (request) { 1359 | return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); 1360 | }, 1361 | 1362 | ajax: function (url, options) { 1363 | var request = readability.xhr(); 1364 | 1365 | function respondToReadyState(readyState) { 1366 | if (request.readyState === 4) { 1367 | if (readability.successfulRequest(request)) { 1368 | if (options.success) { options.success(request); } 1369 | } 1370 | else { 1371 | if (options.error) { options.error(request); } 1372 | } 1373 | } 1374 | } 1375 | 1376 | if (typeof 
options === 'undefined') { options = {}; } 1377 | 1378 | request.onreadystatechange = respondToReadyState; 1379 | 1380 | request.open('get', url, true); 1381 | request.setRequestHeader('Accept', 'text/html'); 1382 | 1383 | try { 1384 | request.send(options.postBody); 1385 | } 1386 | catch (e) { 1387 | if (options.error) { options.error(); } 1388 | } 1389 | 1390 | return request; 1391 | }, 1392 | 1393 | /** 1394 | * Make an AJAX request for each page and append it to the document. 1395 | **/ 1396 | curPageNum: 1, 1397 | 1398 | appendNextPage: function (nextPageLink) { 1399 | readability.curPageNum+=1; 1400 | 1401 | var articlePage = document.createElement("DIV"); 1402 | articlePage.id = 'readability-page-' + readability.curPageNum; 1403 | articlePage.className = 'page'; 1404 | articlePage.innerHTML = '

§

'; 1405 | 1406 | document.getElementById("readability-content").appendChild(articlePage); 1407 | 1408 | if(readability.curPageNum > readability.maxPages) { 1409 | var nextPageMarkup = "
View Next Page
"; 1410 | 1411 | articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; 1412 | return; 1413 | } 1414 | 1415 | /** 1416 | * Now that we've built the article page DOM element, get the page content 1417 | * asynchronously and load the cleaned content into the div we created for it. 1418 | **/ 1419 | (function(pageUrl, thisPage) { 1420 | readability.ajax(pageUrl, { 1421 | success: function(r) { 1422 | 1423 | /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1424 | var eTag = r.getResponseHeader('ETag'); 1425 | if(eTag) { 1426 | if(eTag in readability.pageETags) { 1427 | dbg("Exact duplicate page found via ETag. Aborting."); 1428 | articlePage.style.display = 'none'; 1429 | return; 1430 | } else { 1431 | readability.pageETags[eTag] = 1; 1432 | } 1433 | } 1434 | 1435 | // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 1436 | var page = document.createElement("DIV"); 1437 | 1438 | /** 1439 | * Do some preprocessing to our HTML to make it ready for appending. 1440 | * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1441 | * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. 1442 | * • Turn all double br's into p's - was handled by prepDocument in the original view. 1443 | * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. 1444 | **/ 1445 | var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1446 | responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1447 | responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); 1448 | responseHtml = responseHtml.replace(readability.regexps.replaceBrs, '

'); 1449 | responseHtml = responseHtml.replace(readability.regexps.replaceFonts, '<$1span>'); 1450 | 1451 | page.innerHTML = responseHtml; 1452 | 1453 | /** 1454 | * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. 1455 | **/ 1456 | readability.flags = 0x1 | 0x2 | 0x4; 1457 | 1458 | var nextPageLink = readability.findNextPageLink(page), 1459 | content = readability.grabArticle(page); 1460 | 1461 | if(!content) { 1462 | dbg("No content found in page to append. Aborting."); 1463 | return; 1464 | } 1465 | 1466 | /** 1467 | * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1468 | * Compare it against all of the the previous document's we've gotten. If the previous 1469 | * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1470 | **/ 1471 | var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; 1472 | if(firstP && firstP.innerHTML.length > 100) { 1473 | for(var i=1; i <= readability.curPageNum; i+=1) { 1474 | var rPage = document.getElementById('readability-page-' + i); 1475 | if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1476 | dbg('Duplicate of page ' + i + ' - skipping.'); 1477 | articlePage.style.display = 'none'; 1478 | readability.parsedPages[pageUrl] = true; 1479 | return; 1480 | } 1481 | } 1482 | } 1483 | 1484 | readability.removeScripts(content); 1485 | 1486 | thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; 1487 | 1488 | /** 1489 | * After the page has rendered, post process the content. This delay is necessary because, 1490 | * in webkit at least, offsetWidth is not set in time to determine image width. We have to 1491 | * wait a little bit for reflow to finish before we can fix floating images. 
1492 | **/ 1493 | window.setTimeout( 1494 | function() { readability.postProcessContent(thisPage); }, 1495 | 500 1496 | ); 1497 | 1498 | if(nextPageLink) { 1499 | readability.appendNextPage(nextPageLink); 1500 | } 1501 | } 1502 | }); 1503 | }(nextPageLink, articlePage)); 1504 | }, 1505 | 1506 | /** 1507 | * Get an elements class/id weight. Uses regular expressions to tell if this 1508 | * element looks good or bad. 1509 | * 1510 | * @param Element 1511 | * @return number (Integer) 1512 | **/ 1513 | getClassWeight: function (e) { 1514 | if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1515 | return 0; 1516 | } 1517 | 1518 | var weight = 0; 1519 | 1520 | /* Look for a special classname */ 1521 | if (typeof(e.className) === 'string' && e.className !== '') 1522 | { 1523 | if(e.className.search(readability.regexps.negative) !== -1) { 1524 | weight -= 25; } 1525 | 1526 | if(e.className.search(readability.regexps.positive) !== -1) { 1527 | weight += 25; } 1528 | } 1529 | 1530 | /* Look for a special ID */ 1531 | if (typeof(e.id) === 'string' && e.id !== '') 1532 | { 1533 | if(e.id.search(readability.regexps.negative) !== -1) { 1534 | weight -= 25; } 1535 | 1536 | if(e.id.search(readability.regexps.positive) !== -1) { 1537 | weight += 25; } 1538 | } 1539 | 1540 | return weight; 1541 | }, 1542 | 1543 | nodeIsVisible: function (node) { 1544 | return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; 1545 | }, 1546 | 1547 | /** 1548 | * Remove extraneous break tags from a node. 1549 | * 1550 | * @param Element 1551 | * @return void 1552 | **/ 1553 | killBreaks: function (e) { 1554 | try { 1555 | e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'
'); 1556 | } 1557 | catch (eBreaks) { 1558 | dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks); 1559 | } 1560 | }, 1561 | 1562 | /** 1563 | * Clean a node of all elements of type "tag". 1564 | * (Unless it's a youtube/vimeo video. People love movies.) 1565 | * 1566 | * @param Element 1567 | * @param string tag to clean 1568 | * @return void 1569 | **/ 1570 | clean: function (e, tag) { 1571 | var targetList = e.getElementsByTagName( tag ); 1572 | var isEmbed = (tag === 'object' || tag === 'embed'); 1573 | 1574 | for (var y=targetList.length-1; y >= 0; y-=1) { 1575 | /* Allow youtube and vimeo videos through as people usually want to see those. */ 1576 | if(isEmbed) { 1577 | var attributeValues = ""; 1578 | for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1579 | attributeValues += targetList[y].attributes[i].value + '|'; 1580 | } 1581 | 1582 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 1583 | if (attributeValues.search(readability.regexps.videos) !== -1) { 1584 | continue; 1585 | } 1586 | 1587 | /* Then check the elements inside this element for the same. */ 1588 | if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { 1589 | continue; 1590 | } 1591 | 1592 | } 1593 | 1594 | targetList[y].parentNode.removeChild(targetList[y]); 1595 | } 1596 | }, 1597 | 1598 | /** 1599 | * Clean an element of all tags of type "tag" if they look fishy. 1600 | * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 1601 | * 1602 | * @return void 1603 | **/ 1604 | cleanConditionally: function (e, tag) { 1605 | 1606 | if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1607 | return; 1608 | } 1609 | 1610 | var tagsList = e.getElementsByTagName(tag); 1611 | var curTagsLength = tagsList.length; 1612 | 1613 | /** 1614 | * Gather counts for other typical elements embedded within. 
1615 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. 1616 | * 1617 | * TODO: Consider taking into account original contentScore here. 1618 | **/ 1619 | for (var i=curTagsLength-1; i >= 0; i-=1) { 1620 | var weight = readability.getClassWeight(tagsList[i]); 1621 | var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1622 | 1623 | dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1624 | 1625 | if(weight+contentScore < 0) 1626 | { 1627 | tagsList[i].parentNode.removeChild(tagsList[i]); 1628 | } 1629 | else if ( readability.getCharCount(tagsList[i],',') < 10) { 1630 | /** 1631 | * If there are not very many commas, and the number of 1632 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 
1633 | **/ 1634 | var p = tagsList[i].getElementsByTagName("p").length; 1635 | var img = tagsList[i].getElementsByTagName("img").length; 1636 | var li = tagsList[i].getElementsByTagName("li").length-100; 1637 | var input = tagsList[i].getElementsByTagName("input").length; 1638 | 1639 | var embedCount = 0; 1640 | var embeds = tagsList[i].getElementsByTagName("embed"); 1641 | for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1642 | if (embeds[ei].src.search(readability.regexps.videos) === -1) { 1643 | embedCount+=1; 1644 | } 1645 | } 1646 | 1647 | var linkDensity = readability.getLinkDensity(tagsList[i]); 1648 | var contentLength = readability.getInnerText(tagsList[i]).length; 1649 | var toRemove = false; 1650 | 1651 | if ( img > p ) { 1652 | toRemove = true; 1653 | } else if(li > p && tag !== "ul" && tag !== "ol") { 1654 | toRemove = true; 1655 | } else if( input > Math.floor(p/3) ) { 1656 | toRemove = true; 1657 | } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1658 | toRemove = true; 1659 | } else if(weight < 25 && linkDensity > 0.2) { 1660 | toRemove = true; 1661 | } else if(weight >= 25 && linkDensity > 0.5) { 1662 | toRemove = true; 1663 | } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1664 | toRemove = true; 1665 | } 1666 | 1667 | if(toRemove) { 1668 | tagsList[i].parentNode.removeChild(tagsList[i]); 1669 | } 1670 | } 1671 | } 1672 | }, 1673 | 1674 | /** 1675 | * Clean out spurious headers from an Element. Checks things like classnames and link density. 
1676 | * 1677 | * @param Element 1678 | * @return void 1679 | **/ 1680 | cleanHeaders: function (e) { 1681 | for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) { 1682 | var headers = e.getElementsByTagName('h' + headerIndex); 1683 | for (var i=headers.length-1; i >=0; i-=1) { 1684 | if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) { 1685 | headers[i].parentNode.removeChild(headers[i]); 1686 | } 1687 | } 1688 | } 1689 | }, 1690 | 1691 | /*** Smooth scrolling logic ***/ 1692 | 1693 | /** 1694 | * easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation. 1695 | * Borrowed from jQuery's easing library. 1696 | * @return integer 1697 | **/ 1698 | easeInOut: function(start,end,totalSteps,actualStep) { 1699 | var delta = end - start; 1700 | 1701 | if ((actualStep/=totalSteps/2) < 1) { 1702 | return delta/2*actualStep*actualStep + start; 1703 | } 1704 | actualStep -=1; 1705 | return -delta/2 * ((actualStep)*(actualStep-2) - 1) + start; 1706 | }, 1707 | 1708 | /** 1709 | * Helper function to, in a cross compatible way, get or set the current scroll offset of the document. 1710 | * @return mixed integer on get, the result of window.scrollTo on set 1711 | **/ 1712 | scrollTop: function(scroll){ 1713 | var setScroll = typeof scroll !== 'undefined'; 1714 | 1715 | if(setScroll) { 1716 | return window.scrollTo(0, scroll); 1717 | } 1718 | if(typeof window.pageYOffset !== 'undefined') { 1719 | return window.pageYOffset; 1720 | } 1721 | else if(document.documentElement.clientHeight) { 1722 | return document.documentElement.scrollTop; 1723 | } 1724 | else { 1725 | return document.body.scrollTop; 1726 | } 1727 | }, 1728 | 1729 | /** 1730 | * scrollTo - Smooth scroll to the point of scrollEnd in the document. 
1731 | * @return void 1732 | **/ 1733 | curScrollStep: 0, 1734 | scrollTo: function (scrollStart, scrollEnd, steps, interval) { 1735 | if( 1736 | (scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) || 1737 | (scrollStart > scrollEnd && readability.scrollTop() > scrollEnd) 1738 | ) { 1739 | readability.curScrollStep+=1; 1740 | if(readability.curScrollStep > steps) { 1741 | return; 1742 | } 1743 | 1744 | var oldScrollTop = readability.scrollTop(); 1745 | 1746 | readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep)); 1747 | 1748 | // We're at the end of the window. 1749 | if(oldScrollTop === readability.scrollTop()) { 1750 | return; 1751 | } 1752 | 1753 | window.setTimeout(function() { 1754 | readability.scrollTo(scrollStart, scrollEnd, steps, interval); 1755 | }, interval); 1756 | } 1757 | }, 1758 | 1759 | 1760 | /** 1761 | * Show the email popup. 1762 | * 1763 | * @return void 1764 | **/ 1765 | emailBox: function () { 1766 | var emailContainerExists = document.getElementById('email-container'); 1767 | if(null !== emailContainerExists) 1768 | { 1769 | return; 1770 | } 1771 | 1772 | var emailContainer = document.createElement("DIV"); 1773 | emailContainer.setAttribute('id', 'email-container'); 1774 | emailContainer.innerHTML = ''; 1775 | 1776 | document.body.appendChild(emailContainer); 1777 | }, 1778 | 1779 | /** 1780 | * Close the email popup. This is a hacktackular way to check if we're in a "close loop". 1781 | * Since we don't have crossdomain access to the frame, we can only know when it has 1782 | * loaded again. If it's loaded over 3 times, we know to close the frame. 
1783 | * 1784 | * @return void 1785 | **/ 1786 | removeFrame: function () { 1787 | readability.iframeLoads+=1; 1788 | if (readability.iframeLoads > 3) 1789 | { 1790 | var emailContainer = document.getElementById('email-container'); 1791 | if (null !== emailContainer) { 1792 | emailContainer.parentNode.removeChild(emailContainer); 1793 | } 1794 | 1795 | readability.iframeLoads = 0; 1796 | } 1797 | }, 1798 | 1799 | htmlspecialchars: function (s) { 1800 | if (typeof(s) === "string") { 1801 | s = s.replace(/&/g, "&"); 1802 | s = s.replace(/"/g, """); 1803 | s = s.replace(/'/g, "'"); 1804 | s = s.replace(//g, ">"); 1806 | } 1807 | 1808 | return s; 1809 | }, 1810 | 1811 | flagIsActive: function(flag) { 1812 | return (readability.flags & flag) > 0; 1813 | }, 1814 | 1815 | addFlag: function(flag) { 1816 | readability.flags = readability.flags | flag; 1817 | }, 1818 | 1819 | removeFlag: function(flag) { 1820 | readability.flags = readability.flags & ~flag; 1821 | } 1822 | 1823 | }; 1824 | 1825 | readability.init(); 1826 | --------------------------------------------------------------------------------