├── Procfile ├── run.js ├── .gitignore ├── index.js ├── static ├── style.css ├── main.js └── index.html ├── .travis.yml ├── sanitize.js ├── package.json ├── scrape.js ├── server.js ├── phantom-scrape.js ├── README.md ├── test └── index.js └── vendor └── Readability.js /Procfile: -------------------------------------------------------------------------------- 1 | web: node run.js 2 | -------------------------------------------------------------------------------- /run.js: -------------------------------------------------------------------------------- 1 | require("./server").serve(); 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | scrape: require("./scrape"), 3 | server: require("./server") 4 | }; 5 | -------------------------------------------------------------------------------- /static/style.css: -------------------------------------------------------------------------------- 1 | iframe { 2 | border: none; 3 | width: 100%; 4 | height: 640px; 5 | background: #fff; 6 | } 7 | iframe body { 8 | font-size: 22px; 9 | } 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.12" 4 | - "4" 5 | - "5" 6 | - "6" 7 | before_install: 8 | - sudo apt-get install python-software-properties 9 | - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y 10 | - sudo apt-get update 11 | - sudo apt-get install gcc-5 g++-5 12 | - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 80 --slave /usr/bin/g++ g++ /usr/bin/g++-5 13 | - sudo 
update-alternatives --set gcc /usr/bin/gcc-5 14 | -------------------------------------------------------------------------------- /sanitize.js: -------------------------------------------------------------------------------- 1 | var html2md = require("html-md"); 2 | var markdown = require("markdown"); 3 | 4 | /** 5 | * Takes a result object and replace native html contents with a safer sanitized 6 | * version. 7 | * @param {Object} resultObject 8 | * @return {Object} 9 | */ 10 | exports.sanitizeResult = function(resultObject) { 11 | try { 12 | var sanitized = markdown.parse(html2md(resultObject.content)); 13 | resultObject.content = sanitized; 14 | resultObject.length = sanitized.length; 15 | return resultObject; 16 | } catch (err) { 17 | throw {error: "Failed HTML sanitization:" + (err || "Unknown reason.")}; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "readable-proxy", 3 | "version": "1.6.1", 4 | "description": "Node service attempting to fetch readable contents from any URL.", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node run.js", 8 | "test": "mocha" 9 | }, 10 | "keywords": [ 11 | "readable", 12 | "readability", 13 | "fetch", 14 | "proxy", 15 | "scrape" 16 | ], 17 | "author": "Nicolas Perriault ", 18 | "license": "MPL", 19 | "dependencies": { 20 | "bluebird": "^2.9.*", 21 | "bootstrap": "^3.3.*", 22 | "cheerio": "^0.22.0", 23 | "express": "^4.11.*", 24 | "html-md": "^3.0.*", 25 | "markdown": "^0.5.*", 26 | "object-assign": "^2.0.*", 27 | "phantomjs-prebuilt": "^2.1.*" 28 | }, 29 | "devDependencies": { 30 | "chai": "^2.1.*", 31 | "mocha": "^2.1.*", 32 | "sinon": "^1.12.*", 33 | "supertest": "^1.2.*" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scrape.js: 
-------------------------------------------------------------------------------- 1 | var childProcess = require("child_process"); 2 | var phantomjs = require("phantomjs-prebuilt"); 3 | var binPath = phantomjs.path; 4 | var path = require("path"); 5 | var Promise = require("bluebird"); 6 | var objectAssign = require("object-assign"); 7 | 8 | var readabilityPath = process.env.READABILITY_LIB_PATH || 9 | path.normalize(path.join(__dirname, "vendor", "Readability.js")); 10 | 11 | module.exports = function scrape(url, options) { 12 | options = options || {}; 13 | if (!url) throw new Error("Missing url."); 14 | return new Promise(function(fulfill, reject) { 15 | var childArgs = [path.join(__dirname, "phantom-scrape.js"), url, readabilityPath]; 16 | if (options.userAgent) { 17 | childArgs.push(options.userAgent); 18 | } 19 | childProcess.execFile(binPath, childArgs, function(err, stdout, stderr) { 20 | if (err) { 21 | return reject(err); 22 | } 23 | var response, error; 24 | try { 25 | response = JSON.parse(stdout); 26 | } catch (e) { 27 | error = { 28 | message: "Unable to parse JSON proxy response.", 29 | line: e.line, 30 | stack: e.stack 31 | }; 32 | } 33 | if (response && response.error) { 34 | error = response.error; 35 | } 36 | if (error) { 37 | reject(objectAssign(new Error(error.message), error)); 38 | } else if (!response) { 39 | reject(new Error("Empty scraped response.")); 40 | } else { 41 | fulfill(response); 42 | } 43 | }); 44 | }); 45 | }; 46 | -------------------------------------------------------------------------------- /static/main.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | "use strict"; 3 | 4 | var q = document.querySelector.bind(document); 5 | 6 | function injectReadableContents(params, target) { 7 | q("#error").classList.add("hide"); 8 | var req = new XMLHttpRequest(); 9 | var apiUrl = [ 10 | "/api/get?sanitize=" + (params.sanitize ? 
"yes" : "no"), 11 | "url=" + encodeURIComponent(params.url), 12 | "userAgent=" + encodeURIComponent(params.userAgent) 13 | ].join("&"); 14 | req.open("GET", apiUrl, false); 15 | req.send(null); 16 | var jsonResponse = JSON.parse(req.responseText); 17 | if (jsonResponse.error) { 18 | q("#error").textContent = jsonResponse.error.message; 19 | q("#error").classList.remove("hide"); 20 | q("#readerable").textContent = ""; 21 | q("#title").textContent = ""; 22 | q("#byline").textContent = ""; 23 | q("#length").textContent = ""; 24 | q("#dir").textContent = ""; 25 | q("#excerpt").textContent = ""; 26 | q("#logs").value = ""; 27 | target.contentDocument.body.innerHTML = ""; 28 | } else { 29 | q("#error").textContent = ""; 30 | q("#readerable").textContent = jsonResponse.isProbablyReaderable; 31 | q("#title").textContent = jsonResponse.title; 32 | q("#byline").textContent = jsonResponse.byline; 33 | q("#length").textContent = jsonResponse.length; 34 | q("#dir").textContent = jsonResponse.dir; 35 | q("#excerpt").textContent = jsonResponse.excerpt; 36 | q("#logs").value = (jsonResponse.consoleLogs || []).join("\n"); 37 | target.contentDocument.body.innerHTML = jsonResponse.content; 38 | } 39 | } 40 | 41 | function init() { 42 | q("form").addEventListener("submit", function(event) { 43 | event.preventDefault(); 44 | var url = q("#url").value; 45 | q("#source").src = url; 46 | injectReadableContents({ 47 | url: url, 48 | sanitize: q("#sanitize").checked, 49 | userAgent: q("#userAgent").value 50 | }, q("#target")); 51 | }); 52 | } 53 | 54 | window.addEventListener("DOMContentLoaded", init); 55 | })(); 56 | -------------------------------------------------------------------------------- /server.js: -------------------------------------------------------------------------------- 1 | var scrape = require("./scrape"); 2 | var sanitizeResult = require("./sanitize").sanitizeResult; 3 | var express = require("express"); 4 | var pkgInfo = require("./package.json"); 5 | var cheerio = 
require("cheerio"); 6 | 7 | var app = express(); 8 | exports.app = app; 9 | 10 | app.use(express.static("static")); 11 | app.use(express.static("node_modules/bootstrap/dist/css")); 12 | 13 | /** 14 | * Casts a query string arg into an actual boolean value. 15 | * @param {String} arg The query string arg. 16 | * @return {Boolean} 17 | */ 18 | function boolArg(queryParam) { 19 | if (!queryParam) return false; 20 | return ["1", "on", "true", "yes", "y"].indexOf(queryParam.toLowerCase()) !== -1; 21 | } 22 | 23 | app.use(function(req, res, next) { 24 | res.header("Content-Type", "application/json"); 25 | res.header("Access-Control-Allow-Origin", "*"); 26 | res.header("Access-Control-Allow-Headers", "Origin, Requested-With, Content-Type, Accept"); 27 | next(); 28 | }); 29 | 30 | app.get("/api", function(req, res) { 31 | res.json({ 32 | name: pkgInfo.name, 33 | documentation: "https://github.com/n1k0/readable-proxy/blob/master/README.md", 34 | description: pkgInfo.description, 35 | version: pkgInfo.version 36 | }); 37 | }); 38 | 39 | app.get("/api/get", function(req, res) { 40 | var url = req.query.url, 41 | sanitize = boolArg(req.query.sanitize), 42 | userAgent = req.query.userAgent; 43 | if (!url) { 44 | return res.status(400).json({error: "Missing url parameter"}); 45 | } 46 | function handleError(err) { 47 | console.error(err); 48 | res.status(500).json({error: {message: err.message}}); 49 | } 50 | scrape(url, {userAgent: userAgent}) 51 | .then(function(result) { 52 | if (!result) { 53 | throw new Error("No scraped result received."); 54 | } 55 | 56 | var sanitizedResult = sanitizeResult(result); 57 | var $ = cheerio.load(sanitizedResult.content); 58 | var rawText = $('*').contents().map(function() { 59 | return (this.type === 'text') ? $(this).text() + ' ' : ''; 60 | }).get().join(''); 61 | 62 | result.rawText = rawText.trim(); 63 | 64 | res.json(sanitize ? 
sanitizedResult : result); 65 | }) 66 | .catch(handleError); 67 | }); 68 | 69 | exports.serve = function() { 70 | var server = app.listen(process.env.PORT || 3000, function() { 71 | var host = server.address().address; 72 | var port = server.address().port; 73 | console.log("Server listening at http://%s:%s", host, port); 74 | }); 75 | }; 76 | -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Readability.js test page 6 | 7 | 8 | 15 | 16 | 17 |
18 | 21 |
22 |
23 |
24 |
25 |
26 | 27 |
28 | 29 |
30 |
31 |
32 | 33 |
34 | 35 |
36 |
37 |
38 |
39 |
40 | 41 |
42 |
43 |
44 |
45 |
46 | 47 |
48 |
49 |
50 |
51 |
52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
Readerable?
Title
Dir
Byline
Length
Excerpt
60 |
61 |
62 |
63 |
64 |
65 |
Original
66 | 67 |
68 |
69 |
70 |
71 |
Readable
72 | 73 |
74 |
75 |
76 |
77 |

Console logs

78 | 79 |
80 |
81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /phantom-scrape.js: -------------------------------------------------------------------------------- 1 | var system = require("system"); 2 | var page = require("webpage").create(); 3 | var url = system.args[1]; 4 | var readabilityPath = system.args[2]; 5 | var userAgent = system.args[3]; 6 | var consoleLogs = []; 7 | 8 | // Prevent page js errors to break JSON output 9 | // XXX: should we log these instead? 10 | phantom.onError = page.onError = function(){}; 11 | 12 | function exitWithError(message) { 13 | outputJSON({error: {message: message}}); 14 | phantom.exit(); 15 | } 16 | 17 | function outputJSON(object) { 18 | console.log(JSON.stringify(object, null, 2)); 19 | } 20 | 21 | /** 22 | * Note: This function runs within page environment. 23 | */ 24 | function runReadability(url, userAgent, pageContent) { 25 | var location = document.location; 26 | var uri = { 27 | spec: location.href, 28 | host: location.host, 29 | prePath: location.protocol + "//" + location.host, // TODO This is incomplete, needs username/password and port 30 | scheme: location.protocol.substr(0, location.protocol.indexOf(":")), 31 | pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1) 32 | }; 33 | try { 34 | var readabilityObj = new Readability(uri, document); 35 | var isProbablyReaderable = readabilityObj.isProbablyReaderable(); 36 | var result = readabilityObj.parse(); 37 | if (result) { 38 | result.userAgent = userAgent; 39 | result.isProbablyReaderable = isProbablyReaderable; 40 | } else { 41 | result = { 42 | error: { 43 | message: "Empty result from Readability.js.", 44 | sourceHTML: pageContent || "Empty page content." 
45 | } 46 | }; 47 | } 48 | return result; 49 | } catch (err) { 50 | return { 51 | error: { 52 | message: err.message, 53 | line: err.line, 54 | stack: err.stack, 55 | sourceHTML: pageContent || "Empty page content." 56 | } 57 | }; 58 | } 59 | }; 60 | 61 | if (!url) { 62 | exitWithError("Missing url arg."); 63 | } else if (!readabilityPath) { 64 | exitWithError("Missing readabilityPath arg."); 65 | } 66 | 67 | if (userAgent) { 68 | page.settings.userAgent = userAgent; 69 | } 70 | 71 | // disable loading images as we don't use them 72 | page.settings.loadImages = false; 73 | 74 | // ensure we don't waste time trying to load slow/missing resources 75 | page.settings.resourceTimeout = 5000; 76 | 77 | // if we do timeout a slow resource, say something useful 78 | page.onResourceTimeout = function(request) { 79 | console.log('Response (#' + request.id + '): ' + JSON.stringify(request)); 80 | }; 81 | 82 | page.onConsoleMessage = function(msg) { 83 | consoleLogs.push(msg); 84 | }; 85 | 86 | page.open(url, function(status) { 87 | if (status !== "success") { 88 | return exitWithError("Unable to access " + url); 89 | } 90 | if (!page.injectJs(readabilityPath)) { 91 | exitWithError("Couldn't inject " + readabilityPath); 92 | } 93 | var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content); 94 | if (result && result.error) { 95 | result.error.consoleLogs = consoleLogs; 96 | } else if (result && result.content) { 97 | result.consoleLogs = consoleLogs; 98 | } 99 | outputJSON(result); 100 | phantom.exit(); 101 | }); 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | readable-proxy 2 | ============== 3 | 4 | [![Build Status](https://travis-ci.org/n1k0/readable-proxy.svg?branch=master)](https://travis-ci.org/n1k0/readable-proxy) [![Dependency 
Status](https://www.versioneye.com/user/projects/54f03dfc4f3108d1fa00000c/badge.svg?style=flat)](https://www.versioneye.com/user/projects/54f03dfc4f3108d1fa00000c) 5 | 6 | Proxy server to retrieve a readable version of any provided url, powered by Node, 7 | [PhantomJS](http://phantomjs.org/) and [Readability.js](https://github.com/mozilla/readability). 8 | 9 | Installation 10 | ------------ 11 | 12 | $ git clone https://github.com/n1k0/readable-proxy 13 | $ cd readable-proxy 14 | $ npm install 15 | 16 | Run 17 | --- 18 | 19 | Starts server on `localhost:3000`: 20 | 21 | $ npm start 22 | 23 | Note about CORS: by design, the server will allow any origin to access it, so browsers can consume it from pages hosted on a different domain. 24 | 25 | Configuration 26 | ------------- 27 | 28 | By default, the proxy server will use the Readability.js version it ships with; to override this, you can set the `READABILITY_LIB_PATH` environment variable to the absolute path to the library file on your local system: 29 | 30 | $ READABILITY_LIB_PATH=/path/to/my/own/version/of/Readability.js npm start 31 | 32 | Usage 33 | ----- 34 | 35 | ### Web UI 36 | 37 | Just head to `http://localhost:3000/`, enter some URL and start enjoying both original and readable renderings side by side. 38 | 39 | ![](https://s3.amazonaws.com/f.cl.ly/items/0H2X0o1V2Y240u3L1b06/Screen%20Shot%202015-02-26%20at%2012.33.15.png) 40 | 41 | ### REST/JSON API 42 | 43 | The HTTP REST API is available under `/api`. 44 | 45 | **Disclaimer:** A truly *RESTful* implementation is probably far from being achieved. 46 | 47 | #### `GET /api/get` 48 | 49 | ##### Required parameters 50 | 51 | - `url`: The URL to retrieve readable contents from, eg. `https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/`. 
52 | 53 | ##### Optional parameters 54 | 55 | - `sanitize`: A *boolean string* to enable HTML sanitization (valid truthy boolean strings: "1", "on", "true", "yes", "y"; everything else will be considered falsy). 56 | - `userAgent`: A custom [User Agent](http://en.wikipedia.org/wiki/User_agent) string. By default, it will use the PhantomJS one. 57 | 58 | **Note:** Enabling contents sanitization loses Readability.js specific HTML semantics, though it is probably safer for users if you plan to publish retrieved contents on a public website. 59 | 60 | ##### Example 61 | 62 | Content sanitization enabled: 63 | 64 | $ curl http://0.0.0.0:3000/api/get\?sanitize=y&url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/ 65 | { 66 | "byline":"Nicolas Perriault —", 67 | "content":"

So finally you're testing", 68 | "length":2867, 69 | "title":"Get your Frontend JavaScript Code Covered | Code", 70 | "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", 71 | "isProbablyReaderable": true 72 | } 73 | 74 | Content sanitization disabled (default): 75 | 76 | $ curl http://0.0.0.0:3000/api/get\?url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/ 77 | { 78 | "byline":"Nicolas Perriault —", 79 | "content":"

\n

So finally you're…", 80 | "length":3851, 81 | "title":"Get your Frontend JavaScript Code Covered | Code", 82 | "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", 83 | "isProbablyReaderable": true 84 | } 85 | 86 | Note: the `isProbablyReaderable` property tells whether Readability has determined that the page contents were parseable. 87 | 88 | ### Usage from node 89 | 90 | #### scrape() function 91 | 92 | The `scrape` function scrapes a URL and returns a Promise with the JSON result object described above: 93 | 94 | ```js 95 | var scrape = require("readable-proxy").scrape; 96 | var url = "https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/"; 97 | 98 | scrape(url, {sanitize: true, userAgent: "My custom User-Agent string"}) 99 | .then(console.log.bind(console)) 100 | .catch(console.error.bind(console)); 101 | ``` 102 | 103 | Tests 104 | ----- 105 | 106 | $ npm test 107 | 108 | License 109 | ------- 110 | 111 | MPL 2.0. 112 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | var expect = require("chai").expect; 2 | var scrape = require("../scrape"); 3 | var Promise = require("bluebird"); 4 | var sinon = require("sinon"); 5 | var childProcess = require("child_process"); 6 | var app = require("../server").app; 7 | var request = require("supertest"); 8 | 9 | describe("Tests", function() { 10 | var sandbox; 11 | 12 | beforeEach(function() { 13 | sandbox = sinon.sandbox.create(); 14 | }); 15 | 16 | afterEach(function() { 17 | sandbox.restore(); 18 | }); 19 | 20 | describe("scrape", function() { 21 | it("should throw on url arg missing", function() { 22 | expect(scrape).to.Throw(/Missing url./); 23 | }); 24 | 25 | it("should return a promise", function() { 26 | sandbox.stub(childProcess, "execFile"); 27 | 28 | expect(scrape("http://invalid.test/")).to.be.an.instanceOf(Promise); 
29 | }); 30 | 31 | it("should call phantomjs exec with expected args", function() { 32 | sandbox.stub(childProcess, "execFile"); 33 | 34 | scrape("http://invalid.test/"); 35 | 36 | sinon.assert.calledOnce(childProcess.execFile); 37 | expect(childProcess.execFile.getCall(0).args[0]).to.match(/phantomjs/); 38 | expect(childProcess.execFile.getCall(0).args[1]).to.include("http://invalid.test/"); 39 | expect(childProcess.execFile.getCall(0).args[1][2]).to.match(/Readability\.js/); 40 | }); 41 | 42 | it("should handle rejection on process call error", function(done) { 43 | var fakeErr = new Error("Boom"); 44 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 45 | cb(fakeErr); 46 | }); 47 | 48 | scrape("http://invalid.test/").catch(function(err) { 49 | expect(err).eql(fakeErr); 50 | done(); 51 | }); 52 | }); 53 | 54 | it("should reject on stdout json parsing failure", function(done) { 55 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 56 | cb(null, "invalid.json.string"); 57 | }); 58 | 59 | scrape("http://invalid.test/").catch(function(err) { 60 | expect(err.message).to.match(/Unable to parse JSON proxy response/); 61 | done(); 62 | }); 63 | }); 64 | 65 | it("should reject on data extraction error", function(done) { 66 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 67 | cb(null, JSON.stringify({error: {message: "Foo"}})); 68 | }); 69 | 70 | scrape("http://invalid.test/").catch(function(err) { 71 | expect(err).to.be.an.instanceOf(Error); 72 | expect(err.message).eql("Foo"); 73 | done(); 74 | }); 75 | }); 76 | 77 | it("should fulfill with a valid json result", function(done) { 78 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 79 | cb(null, JSON.stringify({title: "plop", content: "plip"})); 80 | }); 81 | 82 | scrape("http://invalid.test/").then(function(result) { 83 | expect(result.title).eql("plop"); 84 | expect(result.content).eql("plip"); 85 | done(); 86 | }); 87 | }); 88 | }); 89 | 90 | 
describe("server.app", function() { 91 | describe("Web UI", function() { 92 | it("should serve Web UI on root endpoint", function(done) { 93 | request(app) 94 | .get("/") 95 | .expect("Content-Type", /text\/html/) 96 | .expect(200, done); 97 | }); 98 | }); 99 | 100 | describe("API", function() { 101 | describe("GET /api", function() { 102 | it("should serve JSON on /api endpoint", function(done) { 103 | request(app) 104 | .get("/api") 105 | .set("Accept", "application/json") 106 | .expect("Content-Type", /application\/json/) 107 | .expect(200, done); 108 | }); 109 | 110 | it("should serve app info on /api endpoint", function(done) { 111 | request(app) 112 | .get("/api") 113 | .set("Accept", "application/json") 114 | .expect("Content-Type", /application\/json/) 115 | .expect(function(res) { 116 | expect(res.body.name).eql("readable-proxy"); 117 | }) 118 | .end(done); 119 | }); 120 | }); 121 | 122 | describe("GET /api/get", function() { 123 | it("should return error if missing url param", function(done) { 124 | request(app) 125 | .get("/api/get") 126 | .expect(400) 127 | .expect(function(res) { 128 | expect(res.body.error).eql("Missing url parameter"); 129 | }) 130 | .end(done); 131 | }); 132 | 133 | it("should return scraped response", function(done) { 134 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 135 | cb(null, JSON.stringify({title: "plop"})); 136 | }); 137 | 138 | request(app) 139 | .get("/api/get?url=http://invalid.test/") 140 | .expect(200) 141 | .expect(function(res) { 142 | expect(res.body.title).eql("plop"); 143 | }) 144 | .end(done); 145 | }); 146 | 147 | it("should return a server error on call error", function(done) { 148 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 149 | cb(null, JSON.stringify({error: {message: "fail"}})); 150 | }); 151 | 152 | request(app) 153 | .get("/api/get?url=http://invalid.test/") 154 | .expect(500) 155 | .expect(function(res) { 156 | expect(res.body.error.message).eql("fail"); 157 
| }) 158 | .end(done); 159 | }); 160 | 161 | it("should apply custom user agent when provided", function(done) { 162 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 163 | cb(null, "{}"); 164 | }); 165 | 166 | request(app) 167 | .get("/api/get?url=http://invalid.test/&userAgent=custom+ua") 168 | .expect(200) 169 | .expect(function() { 170 | expect(childProcess.execFile.getCall(0).args[1]).to.contain("custom ua"); 171 | }) 172 | .end(done); 173 | }); 174 | 175 | it("should return sanitized response when sanitize arg is passed", function(done) { 176 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 177 | cb(null, JSON.stringify({content: "

plop

"})); 178 | }); 179 | 180 | request(app) 181 | .get("/api/get?sanitize=1&url=http://invalid.test/") 182 | .expect(200) 183 | .expect(function(res) { 184 | expect(res.body.content).eql("

plop

"); 185 | expect(res.body.rawText).eql("plop"); 186 | }) 187 | .end(done); 188 | }); 189 | }); 190 | }); 191 | }); 192 | }); 193 | -------------------------------------------------------------------------------- /vendor/Readability.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Arc90 Inc 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * This code is heavily based on Arc90's readability.js (1.7.1) script 19 | * available at: http://code.google.com/p/arc90labs-readability 20 | */ 21 | var root = this; 22 | 23 | /** 24 | * Public constructor. 25 | * @param {Object} uri The URI descriptor object. 26 | * @param {HTMLDocument} doc The document to parse. 27 | * @param {Object} options The options object. 
28 | */ 29 | var Readability = function(uri, doc, options) { 30 | options = options || {}; 31 | 32 | this._uri = uri; 33 | this._doc = doc; 34 | this._biggestFrame = false; 35 | this._articleByline = null; 36 | this._articleDir = null; 37 | 38 | // Configureable options 39 | this._debug = !!options.debug; 40 | this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; 41 | this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; 42 | this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES; 43 | 44 | // Start with all flags set 45 | this._flags = this.FLAG_STRIP_UNLIKELYS | 46 | this.FLAG_WEIGHT_CLASSES | 47 | this.FLAG_CLEAN_CONDITIONALLY; 48 | 49 | // The list of pages we've parsed in this call of readability, 50 | // for autopaging. As a key store for easier searching. 51 | this._parsedPages = {}; 52 | 53 | // A list of the ETag headers of pages we've parsed, in case they happen to match, 54 | // we'll know it's a duplicate. 55 | this._pageETags = {}; 56 | 57 | // Make an AJAX request for each page and append it to the document. 58 | this._curPageNum = 1; 59 | 60 | // Control whether log messages are sent to the console 61 | if (this._debug) { 62 | function logEl(e) { 63 | var rv = e.nodeName + " "; 64 | if (e.nodeType == e.TEXT_NODE) { 65 | return rv + '("' + e.textContent + '")'; 66 | } 67 | var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); 68 | var elDesc = e.id ? "(#" + e.id + classDesc + ")" : 69 | (classDesc ? "(" + classDesc + ")" : ""); 70 | return rv + elDesc; 71 | } 72 | this.log = function () { 73 | if ("dump" in root) { 74 | var msg = Array.prototype.map.call(arguments, function(x) { 75 | return (x && x.nodeName) ? 
logEl(x) : x; 76 | }).join(" "); 77 | dump("Reader: (Readability) " + msg + "\n"); 78 | } else if ("console" in root) { 79 | var args = ["Reader: (Readability) "].concat(arguments); 80 | console.log.apply(console, args); 81 | } 82 | }; 83 | } else { 84 | this.log = function () {}; 85 | } 86 | } 87 | 88 | Readability.prototype = { 89 | FLAG_STRIP_UNLIKELYS: 0x1, 90 | FLAG_WEIGHT_CLASSES: 0x2, 91 | FLAG_CLEAN_CONDITIONALLY: 0x4, 92 | 93 | // Max number of nodes supported by this parser. Default: 0 (no limit) 94 | DEFAULT_MAX_ELEMS_TO_PARSE: 0, 95 | 96 | // The number of top candidates to consider when analysing how 97 | // tight the competition is among candidates. 98 | DEFAULT_N_TOP_CANDIDATES: 5, 99 | 100 | // The maximum number of pages to loop through before we call 101 | // it quits and just show a link. 102 | DEFAULT_MAX_PAGES: 5, 103 | 104 | // Element tags to score by default. 105 | DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), 106 | 107 | // All of the regular expressions in use within readability. 108 | // Defined up here so we don't instantiate them repeatedly in loops. 
109 | REGEXPS: { 110 | unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i, 111 | okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 112 | positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 113 | negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, 114 | extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 115 | byline: /byline|author|dateline|writtenby/i, 116 | replaceFonts: /<(\/?)font[^>]*>/gi, 117 | normalize: /\s{2,}/g, 118 | videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, 119 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 120 | prevLink: /(prev|earl|old|new|<|«)/i, 121 | whitespace: /^\s*$/, 122 | hasContent: /\S$/, 123 | }, 124 | 125 | DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], 126 | 127 | ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], 128 | 129 | /** 130 | * Run any post-process modifications to article content as necessary. 131 | * 132 | * @param Element 133 | * @return void 134 | **/ 135 | _postProcessContent: function(articleContent) { 136 | // Readability cannot open relative uris so we convert them to absolute uris. 137 | this._fixRelativeUris(articleContent); 138 | }, 139 | 140 | /** 141 | * Iterate over a NodeList, which doesn't natively fully implement the Array 142 | * interface. 143 | * 144 | * For convenience, the current object context is applied to the provided 145 | * iterate function. 146 | * 147 | * @param NodeList nodeList The NodeList. 148 | * @param Function fn The iterate function. 
149 | * @return void 150 | */ 151 | _forEachNode: function(nodeList, fn) { 152 | return Array.prototype.forEach.call(nodeList, fn, this); 153 | }, 154 | 155 | /** 156 | * Iterate over a NodeList, return true if any of the provided iterate 157 | * function calls returns true, false otherwise. 158 | * 159 | * For convenience, the current object context is applied to the 160 | * provided iterate function. 161 | * 162 | * @param NodeList nodeList The NodeList. 163 | * @param Function fn The iterate function. 164 | * @return Boolean 165 | */ 166 | _someNode: function(nodeList, fn) { 167 | return Array.prototype.some.call(nodeList, fn, this); 168 | }, 169 | 170 | /** 171 | * Concat all nodelists passed as arguments. 172 | * 173 | * @return ...NodeList 174 | * @return Array 175 | */ 176 | _concatNodeLists: function() { 177 | var slice = Array.prototype.slice; 178 | var args = slice.call(arguments); 179 | var nodeLists = args.map(function(list) { 180 | return slice.call(list); 181 | }); 182 | return Array.prototype.concat.apply([], nodeLists); 183 | }, 184 | 185 | _getAllNodesWithTag: function(node, tagNames) { 186 | if (node.querySelectorAll) { 187 | return node.querySelectorAll(tagNames.join(',')); 188 | } 189 | return [].concat.apply([], tagNames.map(function(tag) { 190 | return node.getElementsByTagName(tag); 191 | })); 192 | }, 193 | 194 | /** 195 | * Converts each
and uri in the given element to an absolute URI. 196 | * 197 | * @param Element 198 | * @return void 199 | */ 200 | _fixRelativeUris: function(articleContent) { 201 | var scheme = this._uri.scheme; 202 | var prePath = this._uri.prePath; 203 | var pathBase = this._uri.pathBase; 204 | 205 | function toAbsoluteURI(uri) { 206 | // If this is already an absolute URI, return it. 207 | if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) 208 | return uri; 209 | 210 | // Scheme-rooted relative URI. 211 | if (uri.substr(0, 2) == "//") 212 | return scheme + "://" + uri.substr(2); 213 | 214 | // Prepath-rooted relative URI. 215 | if (uri[0] == "/") 216 | return prePath + uri; 217 | 218 | // Dotslash relative URI. 219 | if (uri.indexOf("./") === 0) 220 | return pathBase + uri.slice(2); 221 | 222 | // Standard relative URI; add entire path. pathBase already includes a 223 | // trailing "/". 224 | return pathBase + uri; 225 | } 226 | 227 | var links = articleContent.getElementsByTagName("a"); 228 | this._forEachNode(links, function(link) { 229 | var href = link.getAttribute("href"); 230 | if (href) { 231 | // Replace links with javascript: URIs with text content, since 232 | // they won't work after scripts have been removed from the page. 233 | if (href.indexOf("javascript:") === 0) { 234 | var text = this._doc.createTextNode(link.textContent); 235 | link.parentNode.replaceChild(text, link); 236 | } else { 237 | link.setAttribute("href", toAbsoluteURI(href)); 238 | } 239 | } 240 | }); 241 | 242 | var imgs = articleContent.getElementsByTagName("img"); 243 | this._forEachNode(imgs, function(img) { 244 | var src = img.getAttribute("src"); 245 | if (src) { 246 | img.setAttribute("src", toAbsoluteURI(src)); 247 | } 248 | }); 249 | }, 250 | 251 | /** 252 | * Get the article title as an H1. 
253 | * 254 | * @return void 255 | **/ 256 | _getArticleTitle: function() { 257 | var doc = this._doc; 258 | var curTitle = ""; 259 | var origTitle = ""; 260 | 261 | try { 262 | curTitle = origTitle = doc.title; 263 | 264 | // If they had an element with id "title" in their HTML 265 | if (typeof curTitle !== "string") 266 | curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); 267 | } catch(e) {} 268 | 269 | if (curTitle.match(/ [\|\-] /)) { 270 | curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 271 | 272 | if (curTitle.split(' ').length < 3) 273 | curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 274 | } else if (curTitle.indexOf(': ') !== -1) { 275 | // Check if we have an heading containing this exact string, so we 276 | // could assume it's the full title. 277 | var headings = this._concatNodeLists( 278 | doc.getElementsByTagName('h1'), 279 | doc.getElementsByTagName('h2') 280 | ); 281 | var match = this._someNode(headings, function(heading) { 282 | return heading.textContent === curTitle; 283 | }); 284 | 285 | // If we don't, let's extract the title out of the original title string. 286 | if (!match) { 287 | curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); 288 | 289 | // If the title is now too short, try the first colon instead: 290 | if (curTitle.split(' ').length < 3) 291 | curTitle = origTitle.substring(origTitle.indexOf(':') + 1); 292 | } 293 | } else if (curTitle.length > 150 || curTitle.length < 15) { 294 | var hOnes = doc.getElementsByTagName('h1'); 295 | 296 | if (hOnes.length === 1) 297 | curTitle = this._getInnerText(hOnes[0]); 298 | } 299 | 300 | curTitle = curTitle.trim(); 301 | 302 | if (curTitle.split(' ').length <= 4) 303 | curTitle = origTitle; 304 | 305 | return curTitle; 306 | }, 307 | 308 | /** 309 | * Prepare the HTML document for readability to scrape it. 310 | * This includes things like stripping javascript, CSS, and handling terrible markup. 
311 | * 312 | * @return void 313 | **/ 314 | _prepDocument: function() { 315 | var doc = this._doc; 316 | 317 | // Remove all style tags in head 318 | this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) { 319 | styleNode.parentNode.removeChild(styleNode); 320 | }); 321 | 322 | if (doc.body) { 323 | this._replaceBrs(doc.body); 324 | } 325 | 326 | this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) { 327 | this._setNodeTag(fontNode, "SPAN"); 328 | }); 329 | }, 330 | 331 | /** 332 | * Finds the next element, starting from the given node, and ignoring 333 | * whitespace in between. If the given node is an element, the same node is 334 | * returned. 335 | */ 336 | _nextElement: function (node) { 337 | var next = node; 338 | while (next 339 | && (next.nodeType != Node.ELEMENT_NODE) 340 | && this.REGEXPS.whitespace.test(next.textContent)) { 341 | next = next.nextSibling; 342 | } 343 | return next; 344 | }, 345 | 346 | /** 347 | * Replaces 2 or more successive
elements with a single

. 348 | * Whitespace between
elements are ignored. For example: 349 | *

foo
bar


abc
350 | * will become: 351 | *
foo
bar

abc

352 | */ 353 | _replaceBrs: function (elem) { 354 | this._forEachNode(elem.getElementsByTagName("br"), function(br) { 355 | var next = br.nextSibling; 356 | 357 | // Whether 2 or more
elements have been found and replaced with a 358 | //

block. 359 | var replaced = false; 360 | 361 | // If we find a
chain, remove the
s until we hit another element 362 | // or non-whitespace. This leaves behind the first
in the chain 363 | // (which will be replaced with a

later). 364 | while ((next = this._nextElement(next)) && (next.tagName == "BR")) { 365 | replaced = true; 366 | var sibling = next.nextSibling; 367 | next.parentNode.removeChild(next); 368 | next = sibling; 369 | } 370 | 371 | // If we removed a
chain, replace the remaining
with a

. Add 372 | // all sibling nodes as children of the

until we hit another
373 | // chain. 374 | if (replaced) { 375 | var p = this._doc.createElement("p"); 376 | br.parentNode.replaceChild(p, br); 377 | 378 | next = p.nextSibling; 379 | while (next) { 380 | // If we've hit another

, we're done adding children to this

. 381 | if (next.tagName == "BR") { 382 | var nextElem = this._nextElement(next); 383 | if (nextElem && nextElem.tagName == "BR") 384 | break; 385 | } 386 | 387 | // Otherwise, make this node a child of the new

. 388 | var sibling = next.nextSibling; 389 | p.appendChild(next); 390 | next = sibling; 391 | } 392 | } 393 | }); 394 | }, 395 | 396 | _setNodeTag: function (node, tag) { 397 | this.log("_setNodeTag", node, tag); 398 | if (node.__JSDOMParser__) { 399 | node.localName = tag.toLowerCase(); 400 | node.tagName = tag.toUpperCase(); 401 | return node; 402 | } 403 | 404 | var replacement = node.ownerDocument.createElement(tag); 405 | while (node.firstChild) { 406 | replacement.appendChild(node.firstChild); 407 | } 408 | node.parentNode.replaceChild(replacement, node); 409 | if (node.readability) 410 | replacement.readability = node.readability; 411 | 412 | for (var i = 0; i < node.attributes.length; i++) { 413 | replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); 414 | } 415 | return replacement; 416 | }, 417 | 418 | /** 419 | * Prepare the article node for display. Clean out any inline styles, 420 | * iframes, forms, strip extraneous

tags, etc. 421 | * 422 | * @param Element 423 | * @return void 424 | **/ 425 | _prepArticle: function(articleContent) { 426 | this._cleanStyles(articleContent); 427 | 428 | // Clean out junk from the article content 429 | this._cleanConditionally(articleContent, "form"); 430 | this._clean(articleContent, "object"); 431 | this._clean(articleContent, "embed"); 432 | this._clean(articleContent, "h1"); 433 | this._clean(articleContent, "footer"); 434 | 435 | // If there is only one h2, they are probably using it as a header 436 | // and not a subheader, so remove it since we already have a header. 437 | if (articleContent.getElementsByTagName('h2').length === 1) 438 | this._clean(articleContent, "h2"); 439 | 440 | this._clean(articleContent, "iframe"); 441 | this._cleanHeaders(articleContent); 442 | 443 | // Do these last as the previous stuff may have removed junk 444 | // that will affect these 445 | this._cleanConditionally(articleContent, "table"); 446 | this._cleanConditionally(articleContent, "ul"); 447 | this._cleanConditionally(articleContent, "div"); 448 | 449 | // Remove extra paragraphs 450 | this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) { 451 | var imgCount = paragraph.getElementsByTagName('img').length; 452 | var embedCount = paragraph.getElementsByTagName('embed').length; 453 | var objectCount = paragraph.getElementsByTagName('object').length; 454 | // At this point, nasty iframes have been removed, only remain embedded video ones. 
455 | var iframeCount = paragraph.getElementsByTagName('iframe').length; 456 | var totalCount = imgCount + embedCount + objectCount + iframeCount; 457 | 458 | if (totalCount === 0 && !this._getInnerText(paragraph, false)) 459 | paragraph.parentNode.removeChild(paragraph); 460 | }); 461 | 462 | this._forEachNode(articleContent.getElementsByTagName("br"), function(br) { 463 | var next = this._nextElement(br.nextSibling); 464 | if (next && next.tagName == "P") 465 | br.parentNode.removeChild(br); 466 | }); 467 | }, 468 | 469 | /** 470 | * Initialize a node with the readability object. Also checks the 471 | * className/id for special names to add to its score. 472 | * 473 | * @param Element 474 | * @return void 475 | **/ 476 | _initializeNode: function(node) { 477 | node.readability = {"contentScore": 0}; 478 | 479 | switch(node.tagName) { 480 | case 'DIV': 481 | node.readability.contentScore += 5; 482 | break; 483 | 484 | case 'PRE': 485 | case 'TD': 486 | case 'BLOCKQUOTE': 487 | node.readability.contentScore += 3; 488 | break; 489 | 490 | case 'ADDRESS': 491 | case 'OL': 492 | case 'UL': 493 | case 'DL': 494 | case 'DD': 495 | case 'DT': 496 | case 'LI': 497 | case 'FORM': 498 | node.readability.contentScore -= 3; 499 | break; 500 | 501 | case 'H1': 502 | case 'H2': 503 | case 'H3': 504 | case 'H4': 505 | case 'H5': 506 | case 'H6': 507 | case 'TH': 508 | node.readability.contentScore -= 5; 509 | break; 510 | } 511 | 512 | node.readability.contentScore += this._getClassWeight(node); 513 | }, 514 | 515 | _removeAndGetNext: function(node) { 516 | var nextNode = this._getNextNode(node, true); 517 | node.parentNode.removeChild(node); 518 | return nextNode; 519 | }, 520 | 521 | /** 522 | * Traverse the DOM from node to node, starting at the node passed in. 523 | * Pass true for the second parameter to indicate this node itself 524 | * (and its kids) are going away, and we want the next node over. 525 | * 526 | * Calling this in a loop will traverse the DOM depth-first. 
527 | */ 528 | _getNextNode: function(node, ignoreSelfAndKids) { 529 | // First check for kids if those aren't being ignored 530 | if (!ignoreSelfAndKids && node.firstElementChild) { 531 | return node.firstElementChild; 532 | } 533 | // Then for siblings... 534 | if (node.nextElementSibling) { 535 | return node.nextElementSibling; 536 | } 537 | // And finally, move up the parent chain *and* find a sibling 538 | // (because this is depth-first traversal, we will have already 539 | // seen the parent nodes themselves). 540 | do { 541 | node = node.parentNode; 542 | } while (node && !node.nextElementSibling); 543 | return node && node.nextElementSibling; 544 | }, 545 | 546 | /** 547 | * Like _getNextNode, but for DOM implementations with no 548 | * firstElementChild/nextElementSibling functionality... 549 | */ 550 | _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { 551 | function nextSiblingEl(n) { 552 | do { 553 | n = n.nextSibling; 554 | } while (n && n.nodeType !== n.ELEMENT_NODE); 555 | return n; 556 | } 557 | // First check for kids if those aren't being ignored 558 | if (!ignoreSelfAndKids && node.children[0]) { 559 | return node.children[0]; 560 | } 561 | // Then for siblings... 562 | var next = nextSiblingEl(node); 563 | if (next) { 564 | return next; 565 | } 566 | // And finally, move up the parent chain *and* find a sibling 567 | // (because this is depth-first traversal, we will have already 568 | // seen the parent nodes themselves). 
569 | do { 570 | node = node.parentNode; 571 | if (node) 572 | next = nextSiblingEl(node); 573 | } while (node && !next); 574 | return node && next; 575 | }, 576 | 577 | _checkByline: function(node, matchString) { 578 | if (this._articleByline) { 579 | return false; 580 | } 581 | 582 | if (node.getAttribute !== undefined) { 583 | var rel = node.getAttribute("rel"); 584 | } 585 | 586 | if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { 587 | this._articleByline = node.textContent.trim(); 588 | return true; 589 | } 590 | 591 | return false; 592 | }, 593 | 594 | _getNodeAncestors: function(node, maxDepth) { 595 | maxDepth = maxDepth || 0; 596 | var i = 0, ancestors = []; 597 | while (node.parentNode) { 598 | ancestors.push(node.parentNode) 599 | if (maxDepth && ++i === maxDepth) 600 | break; 601 | node = node.parentNode; 602 | } 603 | return ancestors; 604 | }, 605 | 606 | /*** 607 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 608 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 609 | * 610 | * @param page a document to run upon. Needs to be a full document, complete with body. 611 | * @return Element 612 | **/ 613 | _grabArticle: function (page) { 614 | this.log("**** grabArticle ****"); 615 | var doc = this._doc; 616 | var isPaging = (page !== null ? true: false); 617 | page = page ? page : this._doc.body; 618 | 619 | // We can't grab an article if we don't have a page! 620 | if (!page) { 621 | this.log("No body found in document. Abort."); 622 | return null; 623 | } 624 | 625 | var pageCacheHtml = page.innerHTML; 626 | 627 | // Check if any "dir" is set on the toplevel document element 628 | this._articleDir = doc.documentElement.getAttribute("dir"); 629 | 630 | while (true) { 631 | var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); 632 | 633 | // First, node prepping. 
Trash nodes that look cruddy (like ones with the 634 | // class name "comment", etc), and turn divs into P tags where they have been 635 | // used inappropriately (as in, where they contain no other block level elements.) 636 | var elementsToScore = []; 637 | var node = this._doc.documentElement; 638 | 639 | while (node) { 640 | var matchString = node.className + " " + node.id; 641 | 642 | // Check to see if this node is a byline, and remove it if it is. 643 | if (this._checkByline(node, matchString)) { 644 | node = this._removeAndGetNext(node); 645 | continue; 646 | } 647 | 648 | // Remove unlikely candidates 649 | if (stripUnlikelyCandidates) { 650 | if (this.REGEXPS.unlikelyCandidates.test(matchString) && 651 | !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 652 | node.tagName !== "BODY" && 653 | node.tagName !== "A") { 654 | this.log("Removing unlikely candidate - " + matchString); 655 | node = this._removeAndGetNext(node); 656 | continue; 657 | } 658 | } 659 | 660 | if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { 661 | elementsToScore.push(node); 662 | } 663 | 664 | // Turn all divs that don't have children block level elements into p's 665 | if (node.tagName === "DIV") { 666 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV 667 | // element. DIVs with only a P element inside and no text content can be 668 | // safely converted into plain P elements to avoid confusing the scoring 669 | // algorithm with DIVs with are, in practice, paragraphs. 
670 | if (this._hasSinglePInsideElement(node)) { 671 | var newNode = node.children[0]; 672 | node.parentNode.replaceChild(newNode, node); 673 | node = newNode; 674 | } else if (!this._hasChildBlockElement(node)) { 675 | node = this._setNodeTag(node, "P"); 676 | elementsToScore.push(node); 677 | } else { 678 | // EXPERIMENTAL 679 | this._forEachNode(node.childNodes, function(childNode) { 680 | if (childNode.nodeType === Node.TEXT_NODE) { 681 | var p = doc.createElement('p'); 682 | p.textContent = childNode.textContent; 683 | p.style.display = 'inline'; 684 | p.className = 'readability-styled'; 685 | node.replaceChild(p, childNode); 686 | } 687 | }); 688 | } 689 | } 690 | node = this._getNextNode(node); 691 | } 692 | 693 | /** 694 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. 695 | * Then add their score to their parent node. 696 | * 697 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 698 | **/ 699 | var candidates = []; 700 | this._forEachNode(elementsToScore, function(elementToScore) { 701 | if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined') 702 | return; 703 | 704 | // If this paragraph is less than 25 characters, don't even count it. 705 | var innerText = this._getInnerText(elementToScore); 706 | if (innerText.length < 25) 707 | return; 708 | 709 | // Exclude nodes with no ancestor. 710 | var ancestors = this._getNodeAncestors(elementToScore, 3); 711 | if (ancestors.length === 0) 712 | return; 713 | 714 | var contentScore = 0; 715 | 716 | // Add a point for the paragraph itself as a base. 717 | contentScore += 1; 718 | 719 | // Add points for any commas within this paragraph. 720 | contentScore += innerText.split(',').length; 721 | 722 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 
723 | contentScore += Math.min(Math.floor(innerText.length / 100), 3); 724 | 725 | // Initialize and score ancestors. 726 | this._forEachNode(ancestors, function(ancestor, level) { 727 | if (!ancestor.tagName) 728 | return; 729 | 730 | if (typeof(ancestor.readability) === 'undefined') { 731 | this._initializeNode(ancestor); 732 | candidates.push(ancestor); 733 | } 734 | 735 | // Node score divider: 736 | // - parent: 1 (no division) 737 | // - grandparent: 2 738 | // - great grandparent+: ancestor level * 3 739 | var scoreDivider = level === 0 ? 1 : level === 1 ? 2 : level * 3; 740 | ancestor.readability.contentScore += contentScore / scoreDivider; 741 | }); 742 | }); 743 | 744 | // After we've calculated scores, loop through all of the possible 745 | // candidate nodes we found and find the one with the highest score. 746 | var topCandidates = []; 747 | for (var c = 0, cl = candidates.length; c < cl; c += 1) { 748 | var candidate = candidates[c]; 749 | 750 | // Scale the final candidates score based on link density. Good content 751 | // should have a relatively small link density (5% or less) and be mostly 752 | // unaffected by this operation. 753 | var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); 754 | candidate.readability.contentScore = candidateScore; 755 | 756 | this.log('Candidate:', candidate, "with score " + candidateScore); 757 | 758 | for (var t = 0; t < this._nbTopCandidates; t++) { 759 | var aTopCandidate = topCandidates[t]; 760 | 761 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { 762 | topCandidates.splice(t, 0, candidate); 763 | if (topCandidates.length > this._nbTopCandidates) 764 | topCandidates.pop(); 765 | break; 766 | } 767 | } 768 | } 769 | 770 | var topCandidate = topCandidates[0] || null; 771 | var neededToCreateTopCandidate = false; 772 | 773 | // If we still have no top candidate, just use the body as a last resort. 
774 | // We also have to copy the body node so it is something we can modify. 775 | if (topCandidate === null || topCandidate.tagName === "BODY") { 776 | // Move all of the page's children into topCandidate 777 | topCandidate = doc.createElement("DIV"); 778 | neededToCreateTopCandidate = true; 779 | // Move everything (not just elements, also text nodes etc.) into the container 780 | // so we even include text directly in the body: 781 | var kids = page.childNodes; 782 | while (kids.length) { 783 | this.log("Moving child out:", kids[0]); 784 | topCandidate.appendChild(kids[0]); 785 | } 786 | 787 | page.appendChild(topCandidate); 788 | 789 | this._initializeNode(topCandidate); 790 | } else if (topCandidate) { 791 | // Because of our bonus system, parents of candidates might have scores 792 | // themselves. They get half of the node. There won't be nodes with higher 793 | // scores than our topCandidate, but if we see the score going *up* in the first 794 | // few steps up the tree, that's a decent sign that there might be more content 795 | // lurking in other places that we want to unify in. The sibling stuff 796 | // below does some of that - but only if we've looked high enough up the DOM 797 | // tree. 798 | var parentOfTopCandidate = topCandidate.parentNode; 799 | var lastScore = topCandidate.readability.contentScore; 800 | // The scores shouldn't get too low. 801 | var scoreThreshold = lastScore / 3; 802 | while (parentOfTopCandidate && parentOfTopCandidate.readability) { 803 | var parentScore = parentOfTopCandidate.readability.contentScore; 804 | if (parentScore < scoreThreshold) 805 | break; 806 | if (parentScore > lastScore) { 807 | // Alright! We found a better parent to use. 
808 | topCandidate = parentOfTopCandidate; 809 | break; 810 | } 811 | lastScore = parentOfTopCandidate.readability.contentScore; 812 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 813 | } 814 | } 815 | 816 | // Now that we have the top candidate, look through its siblings for content 817 | // that might also be related. Things like preambles, content split by ads 818 | // that we removed, etc. 819 | var articleContent = doc.createElement("DIV"); 820 | if (isPaging) 821 | articleContent.id = "readability-content"; 822 | 823 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 824 | var siblings = topCandidate.parentNode.children; 825 | 826 | for (var s = 0, sl = siblings.length; s < sl; s++) { 827 | var sibling = siblings[s]; 828 | var append = false; 829 | 830 | this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); 831 | this.log("Sibling has score", sibling.readability ? 
sibling.readability.contentScore : 'Unknown'); 832 | 833 | if (sibling === topCandidate) { 834 | append = true; 835 | } else { 836 | var contentBonus = 0; 837 | 838 | // Give a bonus if sibling nodes and top candidates have the example same classname 839 | if (sibling.className === topCandidate.className && topCandidate.className !== "") 840 | contentBonus += topCandidate.readability.contentScore * 0.2; 841 | 842 | if (sibling.readability && 843 | ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { 844 | append = true; 845 | } else if (sibling.nodeName === "P") { 846 | var linkDensity = this._getLinkDensity(sibling); 847 | var nodeContent = this._getInnerText(sibling); 848 | var nodeLength = nodeContent.length; 849 | 850 | if (nodeLength > 80 && linkDensity < 0.25) { 851 | append = true; 852 | } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { 853 | append = true; 854 | } 855 | } 856 | } 857 | 858 | if (append) { 859 | this.log("Appending node:", sibling); 860 | 861 | if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { 862 | // We have a node that isn't a common block level element, like a form or td tag. 863 | // Turn it into a div so it doesn't get filtered out later by accident. 864 | this.log("Altering sibling:", sibling, 'to div.'); 865 | 866 | sibling = this._setNodeTag(sibling, "DIV"); 867 | } 868 | 869 | articleContent.appendChild(sibling); 870 | // siblings is a reference to the children array, and 871 | // sibling is removed from the array when we call appendChild(). 872 | // As a result, we must revisit this index since the nodes 873 | // have been shifted. 874 | s -= 1; 875 | sl -= 1; 876 | } 877 | } 878 | 879 | if (this._debug) 880 | this.log("Article content pre-prep: " + articleContent.innerHTML); 881 | // So we have all of the content that we need. Now we clean it up for presentation. 
882 | this._prepArticle(articleContent); 883 | if (this._debug) 884 | this.log("Article content post-prep: " + articleContent.innerHTML); 885 | 886 | if (this._curPageNum === 1) { 887 | if (neededToCreateTopCandidate) { 888 | // We already created a fake div thing, and there wouldn't have been any siblings left 889 | // for the previous loop, so there's no point trying to create a new div, and then 890 | // move all the children over. Just assign IDs and class names here. No need to append 891 | // because that already happened anyway. 892 | topCandidate.id = "readability-page-1"; 893 | topCandidate.className = "page"; 894 | } else { 895 | var div = doc.createElement("DIV"); 896 | div.id = "readability-page-1"; 897 | div.className = "page"; 898 | var children = articleContent.childNodes; 899 | while (children.length) { 900 | div.appendChild(children[0]); 901 | } 902 | articleContent.appendChild(div); 903 | } 904 | } 905 | 906 | if (this._debug) 907 | this.log("Article content after paging: " + articleContent.innerHTML); 908 | 909 | // Now that we've gone through the full algorithm, check to see if 910 | // we got any meaningful content. If we didn't, we may need to re-run 911 | // grabArticle with different flags set. This gives us a higher likelihood of 912 | // finding the content, and the sieve approach gives us a higher likelihood of 913 | // finding the -right- content. 
914 | if (this._getInnerText(articleContent, true).length < 500) { 915 | page.innerHTML = pageCacheHtml; 916 | 917 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 918 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 919 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 920 | this._removeFlag(this.FLAG_WEIGHT_CLASSES); 921 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 922 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 923 | } else { 924 | return null; 925 | } 926 | } else { 927 | return articleContent; 928 | } 929 | } 930 | }, 931 | 932 | /** 933 | * Check whether the input string could be a byline. 934 | * This verifies that the input is a string, and that the length 935 | * is less than 100 chars. 936 | * 937 | * @param possibleByline {string} - a string to check whether its a byline. 938 | * @return Boolean - whether the input string is a byline. 939 | */ 940 | _isValidByline: function(byline) { 941 | if (typeof byline == 'string' || byline instanceof String) { 942 | byline = byline.trim(); 943 | return (byline.length > 0) && (byline.length < 100); 944 | } 945 | return false; 946 | }, 947 | 948 | /** 949 | * Attempts to get excerpt and byline metadata for the article. 950 | * 951 | * @return Object with optional "excerpt" and "byline" properties 952 | */ 953 | _getArticleMetadata: function() { 954 | var metadata = {}; 955 | var values = {}; 956 | var metaElements = this._doc.getElementsByTagName("meta"); 957 | 958 | // Match "description", or Twitter's "twitter:description" (Cards) 959 | // in name attribute. 960 | var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; 961 | 962 | // Match Facebook's Open Graph title & description properties. 963 | var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; 964 | 965 | // Find description tags. 
966 | this._forEachNode(metaElements, function(element) { 967 | var elementName = element.getAttribute("name"); 968 | var elementProperty = element.getAttribute("property"); 969 | 970 | if ([elementName, elementProperty].indexOf("author") !== -1) { 971 | metadata.byline = element.getAttribute("content"); 972 | return; 973 | } 974 | 975 | var name = null; 976 | if (namePattern.test(elementName)) { 977 | name = elementName; 978 | } else if (propertyPattern.test(elementProperty)) { 979 | name = elementProperty; 980 | } 981 | 982 | if (name) { 983 | var content = element.getAttribute("content"); 984 | if (content) { 985 | // Convert to lowercase and remove any whitespace 986 | // so we can match below. 987 | name = name.toLowerCase().replace(/\s/g, ''); 988 | values[name] = content.trim(); 989 | } 990 | } 991 | }); 992 | 993 | if ("description" in values) { 994 | metadata.excerpt = values["description"]; 995 | } else if ("og:description" in values) { 996 | // Use facebook open graph description. 997 | metadata.excerpt = values["og:description"]; 998 | } else if ("twitter:description" in values) { 999 | // Use twitter cards description. 1000 | metadata.excerpt = values["twitter:description"]; 1001 | } 1002 | 1003 | if ("og:title" in values) { 1004 | // Use facebook open graph title. 1005 | metadata.title = values["og:title"]; 1006 | } else if ("twitter:title" in values) { 1007 | // Use twitter cards title. 1008 | metadata.title = values["twitter:title"]; 1009 | } 1010 | 1011 | return metadata; 1012 | }, 1013 | 1014 | /** 1015 | * Removes script tags from the document. 
1016 | * 1017 | * @param Element 1018 | **/ 1019 | _removeScripts: function(doc) { 1020 | this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) { 1021 | scriptNode.nodeValue = ""; 1022 | scriptNode.removeAttribute('src'); 1023 | 1024 | if (scriptNode.parentNode) 1025 | scriptNode.parentNode.removeChild(scriptNode); 1026 | }); 1027 | this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) { 1028 | if (noscriptNode.parentNode) 1029 | noscriptNode.parentNode.removeChild(noscriptNode); 1030 | }); 1031 | }, 1032 | 1033 | /** 1034 | * Check if this node has only whitespace and a single P element 1035 | * Returns false if the DIV node contains non-empty text nodes 1036 | * or if it contains no P or more than 1 element. 1037 | * 1038 | * @param Element 1039 | **/ 1040 | _hasSinglePInsideElement: function(element) { 1041 | // There should be exactly 1 element child which is a P: 1042 | if (element.children.length != 1 || element.children[0].tagName !== "P") { 1043 | return false; 1044 | } 1045 | 1046 | // And there should be no text nodes with real content 1047 | return !this._someNode(element.childNodes, function(node) { 1048 | return node.nodeType === Node.TEXT_NODE && 1049 | this.REGEXPS.hasContent.test(node.textContent); 1050 | }); 1051 | }, 1052 | 1053 | /** 1054 | * Determine whether element has any children block level elements. 1055 | * 1056 | * @param Element 1057 | */ 1058 | _hasChildBlockElement: function (element) { 1059 | return this._someNode(element.childNodes, function(node) { 1060 | return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || 1061 | this._hasChildBlockElement(node); 1062 | }); 1063 | }, 1064 | 1065 | /** 1066 | * Get the inner text of a node - cross browser compatibly. 1067 | * This also strips out any excess whitespace to be found. 
1068 | * 1069 | * @param Element 1070 | * @param Boolean normalizeSpaces (default: true) 1071 | * @return string 1072 | **/ 1073 | _getInnerText: function(e, normalizeSpaces) { 1074 | normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; 1075 | var textContent = e.textContent.trim(); 1076 | 1077 | if (normalizeSpaces) { 1078 | return textContent.replace(this.REGEXPS.normalize, " "); 1079 | } else { 1080 | return textContent; 1081 | } 1082 | }, 1083 | 1084 | /** 1085 | * Get the number of times a string s appears in the node e. 1086 | * 1087 | * @param Element 1088 | * @param string - what to split on. Default is "," 1089 | * @return number (integer) 1090 | **/ 1091 | _getCharCount: function(e,s) { 1092 | s = s || ","; 1093 | return this._getInnerText(e).split(s).length - 1; 1094 | }, 1095 | 1096 | /** 1097 | * Remove the style attribute on every e and under. 1098 | * TODO: Test if getElementsByTagName(*) is faster. 1099 | * 1100 | * @param Element 1101 | * @return void 1102 | **/ 1103 | _cleanStyles: function(e) { 1104 | e = e || this._doc; 1105 | if (!e) 1106 | return; 1107 | var cur = e.firstChild; 1108 | 1109 | // Remove any root styles, if we're able. 1110 | if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') 1111 | e.removeAttribute('style'); 1112 | 1113 | // Go until there are no more child nodes 1114 | while (cur !== null) { 1115 | if (cur.nodeType === cur.ELEMENT_NODE) { 1116 | // Remove style attribute(s) : 1117 | if (cur.className !== "readability-styled") 1118 | cur.removeAttribute("style"); 1119 | 1120 | this._cleanStyles(cur); 1121 | } 1122 | 1123 | cur = cur.nextSibling; 1124 | } 1125 | }, 1126 | 1127 | /** 1128 | * Get the density of links as a percentage of the content 1129 | * This is the amount of text that is inside a link divided by the total text in the node. 
1130 | * 1131 | * @param Element 1132 | * @return number (float) 1133 | **/ 1134 | _getLinkDensity: function(element) { 1135 | var textLength = this._getInnerText(element).length; 1136 | if (textLength === 0) 1137 | return; 1138 | 1139 | var linkLength = 0; 1140 | 1141 | // XXX implement _reduceNodeList? 1142 | this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { 1143 | linkLength += this._getInnerText(linkNode).length; 1144 | }); 1145 | 1146 | return linkLength / textLength; 1147 | }, 1148 | 1149 | /** 1150 | * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 1151 | * 1152 | * @author Dan Lacy 1153 | * @return string the base url 1154 | **/ 1155 | _findBaseUrl: function() { 1156 | var uri = this._uri; 1157 | var noUrlParams = uri.path.split("?")[0]; 1158 | var urlSlashes = noUrlParams.split("/").reverse(); 1159 | var cleanedSegments = []; 1160 | var possibleType = ""; 1161 | 1162 | for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { 1163 | var segment = urlSlashes[i]; 1164 | 1165 | // Split off and save anything that looks like a file type. 1166 | if (segment.indexOf(".") !== -1) { 1167 | possibleType = segment.split(".")[1]; 1168 | 1169 | // If the type isn't alpha-only, it's probably not actually a file extension. 1170 | if (!possibleType.match(/[^a-zA-Z]/)) 1171 | segment = segment.split(".")[0]; 1172 | } 1173 | 1174 | // EW-CMS specific segment replacement. Ugly. 1175 | // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html 1176 | if (segment.indexOf(',00') !== -1) 1177 | segment = segment.replace(',00', ''); 1178 | 1179 | // If our first or second segment has anything looking like a page number, remove it. 
1180 | if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) 1181 | segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); 1182 | 1183 | var del = false; 1184 | 1185 | // If this is purely a number, and it's the first or second segment, 1186 | // it's probably a page number. Remove it. 1187 | if (i < 2 && segment.match(/^\d{1,2}$/)) 1188 | del = true; 1189 | 1190 | // If this is the first segment and it's just "index", remove it. 1191 | if (i === 0 && segment.toLowerCase() === "index") 1192 | del = true; 1193 | 1194 | // If our first or second segment is smaller than 3 characters, 1195 | // and the first segment was purely alphas, remove it. 1196 | if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) 1197 | del = true; 1198 | 1199 | // If it's not marked for deletion, push it to cleanedSegments. 1200 | if (!del) 1201 | cleanedSegments.push(segment); 1202 | } 1203 | 1204 | // This is our final, cleaned, base article URL. 1205 | return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); 1206 | }, 1207 | 1208 | /** 1209 | * Look for any paging links that may occur within the document. 1210 | * 1211 | * @param body 1212 | * @return object (array) 1213 | **/ 1214 | _findNextPageLink: function(elem) { 1215 | var uri = this._uri; 1216 | var possiblePages = {}; 1217 | var allLinks = elem.getElementsByTagName('a'); 1218 | var articleBaseUrl = this._findBaseUrl(); 1219 | 1220 | // Loop through all links, looking for hints that they may be next-page links. 1221 | // Things like having "page" in their textContent, className or id, or being a child 1222 | // of a node with a page-y className or id. 1223 | // 1224 | // Also possible: levenshtein distance? longest common subsequence? 
1225 | // 1226 | // After we do that, assign each page a score, and 1227 | for (var i = 0, il = allLinks.length; i < il; i += 1) { 1228 | var link = allLinks[i]; 1229 | var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); 1230 | 1231 | // If we've already seen this page, ignore it. 1232 | if (linkHref === "" || 1233 | linkHref === articleBaseUrl || 1234 | linkHref === uri.spec || 1235 | linkHref in this._parsedPages) { 1236 | continue; 1237 | } 1238 | 1239 | // If it's on a different domain, skip it. 1240 | if (uri.host !== linkHref.split(/\/+/g)[1]) 1241 | continue; 1242 | 1243 | var linkText = this._getInnerText(link); 1244 | 1245 | // If the linkText looks like it's not the next page, skip it. 1246 | if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) 1247 | continue; 1248 | 1249 | // If the leftovers of the URL after removing the base URL don't contain 1250 | // any digits, it's certainly not a next page link. 1251 | var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1252 | if (!linkHrefLeftover.match(/\d/)) 1253 | continue; 1254 | 1255 | if (!(linkHref in possiblePages)) { 1256 | possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; 1257 | } else { 1258 | possiblePages[linkHref].linkText += ' | ' + linkText; 1259 | } 1260 | 1261 | var linkObj = possiblePages[linkHref]; 1262 | 1263 | // If the articleBaseUrl isn't part of this URL, penalize this link. It could 1264 | // still be the link, but the odds are lower. 
1265 | // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1266 | if (linkHref.indexOf(articleBaseUrl) !== 0) 1267 | linkObj.score -= 25; 1268 | 1269 | var linkData = linkText + ' ' + link.className + ' ' + link.id; 1270 | if (linkData.match(this.REGEXPS.nextLink)) 1271 | linkObj.score += 50; 1272 | 1273 | if (linkData.match(/pag(e|ing|inat)/i)) 1274 | linkObj.score += 25; 1275 | 1276 | if (linkData.match(/(first|last)/i)) { 1277 | // -65 is enough to negate any bonuses gotten from a > or » in the text, 1278 | // If we already matched on "next", last is probably fine. 1279 | // If we didn't, then it's bad. Penalize. 1280 | if (!linkObj.linkText.match(this.REGEXPS.nextLink)) 1281 | linkObj.score -= 65; 1282 | } 1283 | 1284 | if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) 1285 | linkObj.score -= 50; 1286 | 1287 | if (linkData.match(this.REGEXPS.prevLink)) 1288 | linkObj.score -= 200; 1289 | 1290 | // If a parentNode contains page or paging or paginat 1291 | var parentNode = link.parentNode; 1292 | var positiveNodeMatch = false; 1293 | var negativeNodeMatch = false; 1294 | 1295 | while (parentNode) { 1296 | var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; 1297 | 1298 | if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { 1299 | positiveNodeMatch = true; 1300 | linkObj.score += 25; 1301 | } 1302 | 1303 | if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { 1304 | // If this is just something like "footer", give it a negative. 1305 | // If it's something like "body-and-footer", leave it be. 
1306 | if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { 1307 | linkObj.score -= 25; 1308 | negativeNodeMatch = true; 1309 | } 1310 | } 1311 | 1312 | parentNode = parentNode.parentNode; 1313 | } 1314 | 1315 | // If the URL looks like it has paging in it, add to the score. 1316 | // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1317 | if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) 1318 | linkObj.score += 25; 1319 | 1320 | // If the URL contains negative values, give a slight decrease. 1321 | if (linkHref.match(this.REGEXPS.extraneous)) 1322 | linkObj.score -= 15; 1323 | 1324 | /** 1325 | * Minor punishment to anything that doesn't match our current URL. 1326 | * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. 1327 | * Dan, can you show me a counterexample where this is necessary? 1328 | * if (linkHref.indexOf(window.location.href) !== 0) { 1329 | * linkObj.score -= 1; 1330 | * } 1331 | **/ 1332 | 1333 | // If the link text can be parsed as a number, give it a minor bonus, with a slight 1334 | // bias towards lower numbered pages. This is so that pages that might not have 'next' 1335 | // in their text can still get scored, and sorted properly by score. 1336 | var linkTextAsNumber = parseInt(linkText, 10); 1337 | if (linkTextAsNumber) { 1338 | // Punish 1 since we're either already there, or it's probably 1339 | // before what we want anyways. 1340 | if (linkTextAsNumber === 1) { 1341 | linkObj.score -= 10; 1342 | } else { 1343 | linkObj.score += Math.max(0, 10 - linkTextAsNumber); 1344 | } 1345 | } 1346 | } 1347 | 1348 | // Loop thrugh all of our possible pages from above and find our top 1349 | // candidate for the next page URL. Require at least a score of 50, which 1350 | // is a relatively high confidence that this page is the next link. 
1351 | var topPage = null; 1352 | for (var page in possiblePages) { 1353 | if (possiblePages.hasOwnProperty(page)) { 1354 | if (possiblePages[page].score >= 50 && 1355 | (!topPage || topPage.score < possiblePages[page].score)) 1356 | topPage = possiblePages[page]; 1357 | } 1358 | } 1359 | 1360 | if (topPage) { 1361 | var nextHref = topPage.href.replace(/\/$/,''); 1362 | 1363 | this.log('NEXT PAGE IS ' + nextHref); 1364 | this._parsedPages[nextHref] = true; 1365 | return nextHref; 1366 | } else { 1367 | return null; 1368 | } 1369 | }, 1370 | 1371 | _successfulRequest: function(request) { 1372 | return (request.status >= 200 && request.status < 300) || 1373 | request.status === 304 || 1374 | (request.status === 0 && request.responseText); 1375 | }, 1376 | 1377 | _ajax: function(url, options) { 1378 | var request = new XMLHttpRequest(); 1379 | 1380 | function respondToReadyState(readyState) { 1381 | if (request.readyState === 4) { 1382 | if (this._successfulRequest(request)) { 1383 | if (options.success) 1384 | options.success(request); 1385 | } else { 1386 | if (options.error) 1387 | options.error(request); 1388 | } 1389 | } 1390 | } 1391 | 1392 | if (typeof options === 'undefined') 1393 | options = {}; 1394 | 1395 | request.onreadystatechange = respondToReadyState; 1396 | 1397 | request.open('get', url, true); 1398 | request.setRequestHeader('Accept', 'text/html'); 1399 | 1400 | try { 1401 | request.send(options.postBody); 1402 | } catch (e) { 1403 | if (options.error) 1404 | options.error(); 1405 | } 1406 | 1407 | return request; 1408 | }, 1409 | 1410 | _appendNextPage: function(nextPageLink) { 1411 | var doc = this._doc; 1412 | this._curPageNum += 1; 1413 | 1414 | var articlePage = doc.createElement("DIV"); 1415 | articlePage.id = 'readability-page-' + this._curPageNum; 1416 | articlePage.className = 'page'; 1417 | articlePage.innerHTML = '

§

'; 1418 | 1419 | doc.getElementById("readability-content").appendChild(articlePage); 1420 | 1421 | if (this._curPageNum > this._maxPages) { 1422 | var nextPageMarkup = "
"; 1423 | articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; 1424 | return; 1425 | } 1426 | 1427 | // Now that we've built the article page DOM element, get the page content 1428 | // asynchronously and load the cleaned content into the div we created for it. 1429 | (function(pageUrl, thisPage) { 1430 | this._ajax(pageUrl, { 1431 | success: function(r) { 1432 | 1433 | // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. 1434 | var eTag = r.getResponseHeader('ETag'); 1435 | if (eTag) { 1436 | if (eTag in this._pageETags) { 1437 | this.log("Exact duplicate page found via ETag. Aborting."); 1438 | articlePage.style.display = 'none'; 1439 | return; 1440 | } else { 1441 | this._pageETags[eTag] = 1; 1442 | } 1443 | } 1444 | 1445 | // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 1446 | var page = doc.createElement("DIV"); 1447 | 1448 | // Do some preprocessing to our HTML to make it ready for appending. 1449 | // - Remove any script tags. Swap and reswap newlines with a unicode 1450 | // character because multiline regex doesn't work in javascript. 1451 | // - Turn any noscript tags into divs so that we can parse them. This 1452 | // allows us to find any next page links hidden via javascript. 1453 | // - Turn all double br's into p's - was handled by prepDocument in the original view. 1454 | // Maybe in the future abstract out prepDocument to work for both the original document 1455 | // and AJAX-added pages. 
1456 | var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1457 | responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1458 | responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); 1459 | responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); 1460 | 1461 | page.innerHTML = responseHtml; 1462 | this._replaceBrs(page); 1463 | 1464 | // Reset all flags for the next page, as they will search through it and 1465 | // disable as necessary at the end of grabArticle. 1466 | this._flags = 0x1 | 0x2 | 0x4; 1467 | 1468 | var nextPageLink = this._findNextPageLink(page); 1469 | 1470 | // NOTE: if we end up supporting _appendNextPage(), we'll need to 1471 | // change this call to be async 1472 | var content = this._grabArticle(page); 1473 | 1474 | if (!content) { 1475 | this.log("No content found in page to append. Aborting."); 1476 | return; 1477 | } 1478 | 1479 | // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1480 | // Compare it against all of the the previous document's we've gotten. If the previous 1481 | // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1482 | var firstP = content.getElementsByTagName("P").length ? 
content.getElementsByTagName("P")[0] : null; 1483 | if (firstP && firstP.innerHTML.length > 100) { 1484 | for (var i = 1; i <= this._curPageNum; i += 1) { 1485 | var rPage = doc.getElementById('readability-page-' + i); 1486 | if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1487 | this.log('Duplicate of page ' + i + ' - skipping.'); 1488 | articlePage.style.display = 'none'; 1489 | this._parsedPages[pageUrl] = true; 1490 | return; 1491 | } 1492 | } 1493 | } 1494 | 1495 | this._removeScripts(content); 1496 | 1497 | thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; 1498 | 1499 | // After the page has rendered, post process the content. This delay is necessary because, 1500 | // in webkit at least, offsetWidth is not set in time to determine image width. We have to 1501 | // wait a little bit for reflow to finish before we can fix floating images. 1502 | setTimeout((function() { 1503 | this._postProcessContent(thisPage); 1504 | }).bind(this), 500); 1505 | 1506 | 1507 | if (nextPageLink) 1508 | this._appendNextPage(nextPageLink); 1509 | } 1510 | }); 1511 | }).bind(this)(nextPageLink, articlePage); 1512 | }, 1513 | 1514 | /** 1515 | * Get an elements class/id weight. Uses regular expressions to tell if this 1516 | * element looks good or bad. 
1517 | * 1518 | * @param Element 1519 | * @return number (Integer) 1520 | **/ 1521 | _getClassWeight: function(e) { 1522 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) 1523 | return 0; 1524 | 1525 | var weight = 0; 1526 | 1527 | // Look for a special classname 1528 | if (typeof(e.className) === 'string' && e.className !== '') { 1529 | if (this.REGEXPS.negative.test(e.className)) 1530 | weight -= 25; 1531 | 1532 | if (this.REGEXPS.positive.test(e.className)) 1533 | weight += 25; 1534 | } 1535 | 1536 | // Look for a special ID 1537 | if (typeof(e.id) === 'string' && e.id !== '') { 1538 | if (this.REGEXPS.negative.test(e.id)) 1539 | weight -= 25; 1540 | 1541 | if (this.REGEXPS.positive.test(e.id)) 1542 | weight += 25; 1543 | } 1544 | 1545 | return weight; 1546 | }, 1547 | 1548 | /** 1549 | * Clean a node of all elements of type "tag". 1550 | * (Unless it's a youtube/vimeo video. People love movies.) 1551 | * 1552 | * @param Element 1553 | * @param string tag to clean 1554 | * @return void 1555 | **/ 1556 | _clean: function(e, tag) { 1557 | var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; 1558 | 1559 | this._forEachNode(e.getElementsByTagName(tag), function(element) { 1560 | // Allow youtube and vimeo videos through as people usually want to see those. 1561 | if (isEmbed) { 1562 | var attributeValues = [].map.call(element.attributes, function(attr) { 1563 | return attr.value; 1564 | }).join("|"); 1565 | 1566 | // First, check the elements attributes to see if any of them contain youtube or vimeo 1567 | if (this.REGEXPS.videos.test(attributeValues)) 1568 | return; 1569 | 1570 | // Then check the elements inside this element for the same. 1571 | if (this.REGEXPS.videos.test(element.innerHTML)) 1572 | return; 1573 | } 1574 | 1575 | element.parentNode.removeChild(element); 1576 | }); 1577 | }, 1578 | 1579 | /** 1580 | * Check if a given node has one of its ancestor tag name matching the 1581 | * provided one. 
   * @param HTMLElement node
   * @param String tagName
   * @param Number maxDepth (default: 3) how many ancestor levels to inspect
   * @return Boolean
   */
  _hasAncestorTag: function(node, tagName, maxDepth) {
    maxDepth = maxDepth || 3;
    // tagName comparison is against DOM tagName, which is uppercase for HTML.
    tagName = tagName.toUpperCase();
    var depth = 0;
    while (node.parentNode) {
      if (depth > maxDepth)
        return false;
      if (node.parentNode.tagName === tagName)
        return true;
      node = node.parentNode;
      depth++;
    }
    return false;
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   *
   * @param Element e root to clean under
   * @param string tag tag name to conditionally remove
   * @return void
   **/
  _cleanConditionally: function(e, tag) {
    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
      return;

    var tagsList = e.getElementsByTagName(tag);
    var curTagsLength = tagsList.length;
    var isList = tag === "ul" || tag === "ol";

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
    // without effecting the traversal.
    //
    // TODO: Consider taking into account original contentScore here.
    for (var i = curTagsLength-1; i >= 0; i -= 1) {
      var weight = this._getClassWeight(tagsList[i]);
      // contentScore is vestigial (always 0 here), so only the class/id
      // weight decides the first removal branch below.
      var contentScore = 0;

      this.log("Cleaning Conditionally", tagsList[i]);

      if (weight + contentScore < 0) {
        // Negative class/id weight alone is enough to remove the node.
        tagsList[i].parentNode.removeChild(tagsList[i]);
      } else if (this._getCharCount(tagsList[i],',') < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
        var p = tagsList[i].getElementsByTagName("p").length;
        var img = tagsList[i].getElementsByTagName("img").length;
        // The -100 offset means the li>p check below only fires when a
        // non-list node holds >100 more <li> than <p> — presumably to
        // tolerate ordinary embedded lists; TODO confirm against upstream.
        var li = tagsList[i].getElementsByTagName("li").length-100;
        var input = tagsList[i].getElementsByTagName("input").length;

        // Count embeds that are NOT recognized videos; video embeds are kept.
        var embedCount = 0;
        var embeds = tagsList[i].getElementsByTagName("embed");
        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
          if (!this.REGEXPS.videos.test(embeds[ei].src))
            embedCount += 1;
        }

        var linkDensity = this._getLinkDensity(tagsList[i]);
        var contentLength = this._getInnerText(tagsList[i]).length;
        var toRemove = false;
        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
          // More images than paragraphs, outside a figure.
          toRemove = true;
        } else if (!isList && li > p) {
          toRemove = true;
        } else if (input > Math.floor(p/3)) {
          // Too many form inputs relative to text: likely a form, not content.
          toRemove = true;
        } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
          toRemove = true;
        } else if (!isList && weight < 25 && linkDensity > 0.2) {
          toRemove = true;
        } else if (weight >= 25 && linkDensity > 0.5) {
          toRemove = true;
        } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
          toRemove = true;
        }

        if (toRemove) {
          tagsList[i].parentNode.removeChild(tagsList[i]);
        }
      }
    }
  },

  /**
   * Clean out spurious headers from an Element. Checks things like classnames and link density.
   *
   * @param Element
   * @return void
   **/
  _cleanHeaders: function(e) {
    // headerIndex runs 1..2, so only <h1> and <h2> elements are examined.
    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
      var headers = e.getElementsByTagName('h' + headerIndex);
      // Iterate backwards so removals don't disturb the live NodeList.
      for (var i = headers.length - 1; i >= 0; i -= 1) {
        // Drop headers whose class/id weight is negative (ad/footer-style naming).
        if (this._getClassWeight(headers[i]) < 0)
          headers[i].parentNode.removeChild(headers[i]);
      }
    }
  },

  // True when `flag` is set in the current option bitmask.
  _flagIsActive: function(flag) {
    return (this._flags & flag) > 0;
  },

  // Set `flag` in the option bitmask.
  _addFlag: function(flag) {
    this._flags = this._flags | flag;
  },

  // Clear `flag` from the option bitmask.
  _removeFlag: function(flag) {
    this._flags = this._flags & ~flag;
  },

  /**
   * Decides whether or not the document is reader-able without parsing the whole thing.
   *
   * @param Function helperIsVisible optional visibility predicate for candidate nodes
   * @return boolean Whether or not we suspect parse() will succeed at returning an article object.
   */
  isProbablyReaderable: function(helperIsVisible) {
    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);

    // FIXME we should have a fallback for helperIsVisible, but this is
    // problematic because of jsdom's elem.style handling - see
    // https://github.com/mozilla/readability/pull/186 for context.

    var score = 0;
    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
    // this callback:
    return this._someNode(nodes, function(node) {
      if (helperIsVisible && !helperIsVisible(node))
        return false;
      var matchString = node.className + " " + node.id;

      // Skip nodes whose class/id looks unlikely (unless rescued by the
      // okMaybeItsACandidate pattern).
      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
        return false;
      }

      // Paragraphs inside list items are usually navigation, not article text.
      if (node.matches && node.matches("li p")) {
        return false;
      }

      // Require a minimum of 140 characters before a node contributes.
      var textContentLength = node.textContent.trim().length;
      if (textContentLength < 140) {
        return false;
      }

      // Diminishing-returns accumulation; crossing 20 total ends the scan early.
      score += Math.sqrt(textContentLength - 140);

      if (score > 20) {
        return true;
      }
      return false;
    });
  },

  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document by removing script tags, css, etc.
   *  2. Build readability's DOM tree.
   *  3. Grab the article content from the current dom tree.
   *  4. Replace the current DOM tree with the new one.
   *  5. Read peacefully.
   *
   * @return Object|null {uri, title, byline, dir, content, length, excerpt},
   *                     or null when no article content could be extracted
   **/
  parse: function () {
    // Avoid parsing too large documents, as per configuration option
    if (this._maxElemsToParse > 0) {
      var numTags = this._doc.getElementsByTagName("*").length;
      if (numTags > this._maxElemsToParse) {
        throw new Error("Aborting parsing document; " + numTags + " elements found");
      }
    }

    // DOM implementations without firstElementChild get the slower
    // node-walking traversal variant.
    if (typeof this._doc.documentElement.firstElementChild === "undefined") {
      this._getNextNode = this._getNextNodeNoElementProperties;
    }
    // Remove script tags from the document.
    this._removeScripts(this._doc);

    // FIXME: Disabled multi-page article support for now as it
    // needs more work on infrastructure.

    // Make sure this document is added to the list of parsed pages first,
    // so we don't double up on the first page.
    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;

    // Pull out any possible next page link first.
    // var nextPageLink = this._findNextPageLink(doc.body);

    this._prepDocument();

    // Prefer the title from document metadata; fall back to DOM heuristics.
    var metadata = this._getArticleMetadata();
    var articleTitle = metadata.title || this._getArticleTitle();

    var articleContent = this._grabArticle();
    if (!articleContent)
      return null;

    this.log("Grabbed: " + articleContent.innerHTML);

    this._postProcessContent(articleContent);

    // if (nextPageLink) {
    //   // Append any additional pages after a small timeout so that people
    //   // can start reading without having to wait for this to finish processing.
    //   setTimeout((function() {
    //     this._appendNextPage(nextPageLink);
    //   }).bind(this), 500);
    // }

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This is used for displaying a preview of
    // the article's content.
    if (!metadata.excerpt) {
      var paragraphs = articleContent.getElementsByTagName("p");
      if (paragraphs.length > 0) {
        metadata.excerpt = paragraphs[0].textContent.trim();
      }
    }

    return { uri: this._uri,
             title: articleTitle,
             byline: metadata.byline || this._articleByline,
             dir: this._articleDir,
             content: articleContent.innerHTML,
             length: articleContent.textContent.length,
             excerpt: metadata.excerpt };
  }
};