├── Procfile ├── run.js ├── .gitignore ├── index.js ├── static ├── style.css ├── main.js └── index.html ├── .travis.yml ├── sanitize.js ├── package.json ├── scrape.js ├── server.js ├── phantom-scrape.js ├── README.md ├── test └── index.js └── vendor └── Readability.js /Procfile: -------------------------------------------------------------------------------- 1 | web: node run.js 2 | -------------------------------------------------------------------------------- /run.js: -------------------------------------------------------------------------------- 1 | require("./server").serve(); 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | scrape: require("./scrape"), 3 | server: require("./server") 4 | }; 5 | -------------------------------------------------------------------------------- /static/style.css: -------------------------------------------------------------------------------- 1 | iframe { 2 | border: none; 3 | width: 100%; 4 | height: 640px; 5 | background: #fff; 6 | } 7 | iframe body { 8 | font-size: 22px; 9 | } 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.12" 4 | - "4" 5 | - "5" 6 | - "6" 7 | before_install: 8 | - sudo apt-get install python-software-properties 9 | - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y 10 | - sudo apt-get update 11 | - sudo apt-get install gcc-5 g++-5 12 | - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 80 --slave /usr/bin/g++ g++ /usr/bin/g++-5 13 | - sudo 
update-alternatives --set gcc /usr/bin/gcc-5 14 | -------------------------------------------------------------------------------- /sanitize.js: -------------------------------------------------------------------------------- 1 | var html2md = require("html-md"); 2 | var markdown = require("markdown"); 3 | 4 | /** 5 | * Takes a result object and replace native html contents with a safer sanitized 6 | * version. 7 | * @param {Object} resultObject 8 | * @return {Object} 9 | */ 10 | exports.sanitizeResult = function(resultObject) { 11 | try { 12 | var sanitized = markdown.parse(html2md(resultObject.content)); 13 | resultObject.content = sanitized; 14 | resultObject.length = sanitized.length; 15 | return resultObject; 16 | } catch (err) { 17 | throw {error: "Failed HTML sanitization:" + (err || "Unknown reason.")}; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "readable-proxy", 3 | "version": "1.6.1", 4 | "description": "Node service attempting to fetch readable contents from any URL.", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node run.js", 8 | "test": "mocha" 9 | }, 10 | "keywords": [ 11 | "readable", 12 | "readability", 13 | "fetch", 14 | "proxy", 15 | "scrape" 16 | ], 17 | "author": "Nicolas Perriault ", 18 | "license": "MPL", 19 | "dependencies": { 20 | "bluebird": "^2.9.*", 21 | "bootstrap": "^3.3.*", 22 | "cheerio": "^0.22.0", 23 | "express": "^4.11.*", 24 | "html-md": "^3.0.*", 25 | "markdown": "^0.5.*", 26 | "object-assign": "^2.0.*", 27 | "phantomjs-prebuilt": "^2.1.*" 28 | }, 29 | "devDependencies": { 30 | "chai": "^2.1.*", 31 | "mocha": "^2.1.*", 32 | "sinon": "^1.12.*", 33 | "supertest": "^1.2.*" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scrape.js: 
-------------------------------------------------------------------------------- 1 | var childProcess = require("child_process"); 2 | var phantomjs = require("phantomjs-prebuilt"); 3 | var binPath = phantomjs.path; 4 | var path = require("path"); 5 | var Promise = require("bluebird"); 6 | var objectAssign = require("object-assign"); 7 | 8 | var readabilityPath = process.env.READABILITY_LIB_PATH || 9 | path.normalize(path.join(__dirname, "vendor", "Readability.js")); 10 | 11 | module.exports = function scrape(url, options) { 12 | options = options || {}; 13 | if (!url) throw new Error("Missing url."); 14 | return new Promise(function(fulfill, reject) { 15 | var childArgs = [path.join(__dirname, "phantom-scrape.js"), url, readabilityPath]; 16 | if (options.userAgent) { 17 | childArgs.push(options.userAgent); 18 | } 19 | childProcess.execFile(binPath, childArgs, function(err, stdout, stderr) { 20 | if (err) { 21 | return reject(err); 22 | } 23 | var response, error; 24 | try { 25 | response = JSON.parse(stdout); 26 | } catch (e) { 27 | error = { 28 | message: "Unable to parse JSON proxy response.", 29 | line: e.line, 30 | stack: e.stack 31 | }; 32 | } 33 | if (response && response.error) { 34 | error = response.error; 35 | } 36 | if (error) { 37 | reject(objectAssign(new Error(error.message), error)); 38 | } else if (!response) { 39 | reject(new Error("Empty scraped response.")); 40 | } else { 41 | fulfill(response); 42 | } 43 | }); 44 | }); 45 | }; 46 | -------------------------------------------------------------------------------- /static/main.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | "use strict"; 3 | 4 | var q = document.querySelector.bind(document); 5 | 6 | function injectReadableContents(params, target) { 7 | q("#error").classList.add("hide"); 8 | var req = new XMLHttpRequest(); 9 | var apiUrl = [ 10 | "/api/get?sanitize=" + (params.sanitize ? 
"yes" : "no"), 11 | "url=" + encodeURIComponent(params.url), 12 | "userAgent=" + encodeURIComponent(params.userAgent) 13 | ].join("&"); 14 | req.open("GET", apiUrl, false); 15 | req.send(null); 16 | var jsonResponse = JSON.parse(req.responseText); 17 | if (jsonResponse.error) { 18 | q("#error").textContent = jsonResponse.error.message; 19 | q("#error").classList.remove("hide"); 20 | q("#readerable").textContent = ""; 21 | q("#title").textContent = ""; 22 | q("#byline").textContent = ""; 23 | q("#length").textContent = ""; 24 | q("#dir").textContent = ""; 25 | q("#excerpt").textContent = ""; 26 | q("#logs").value = ""; 27 | target.contentDocument.body.innerHTML = ""; 28 | } else { 29 | q("#error").textContent = ""; 30 | q("#readerable").textContent = jsonResponse.isProbablyReaderable; 31 | q("#title").textContent = jsonResponse.title; 32 | q("#byline").textContent = jsonResponse.byline; 33 | q("#length").textContent = jsonResponse.length; 34 | q("#dir").textContent = jsonResponse.dir; 35 | q("#excerpt").textContent = jsonResponse.excerpt; 36 | q("#logs").value = (jsonResponse.consoleLogs || []).join("\n"); 37 | target.contentDocument.body.innerHTML = jsonResponse.content; 38 | } 39 | } 40 | 41 | function init() { 42 | q("form").addEventListener("submit", function(event) { 43 | event.preventDefault(); 44 | var url = q("#url").value; 45 | q("#source").src = url; 46 | injectReadableContents({ 47 | url: url, 48 | sanitize: q("#sanitize").checked, 49 | userAgent: q("#userAgent").value 50 | }, q("#target")); 51 | }); 52 | } 53 | 54 | window.addEventListener("DOMContentLoaded", init); 55 | })(); 56 | -------------------------------------------------------------------------------- /server.js: -------------------------------------------------------------------------------- 1 | var scrape = require("./scrape"); 2 | var sanitizeResult = require("./sanitize").sanitizeResult; 3 | var express = require("express"); 4 | var pkgInfo = require("./package.json"); 5 | var cheerio = 
require("cheerio"); 6 | 7 | var app = express(); 8 | exports.app = app; 9 | 10 | app.use(express.static("static")); 11 | app.use(express.static("node_modules/bootstrap/dist/css")); 12 | 13 | /** 14 | * Casts a query string arg into an actual boolean value. 15 | * @param {String} arg The query string arg. 16 | * @return {Boolean} 17 | */ 18 | function boolArg(queryParam) { 19 | if (!queryParam) return false; 20 | return ["1", "on", "true", "yes", "y"].indexOf(queryParam.toLowerCase()) !== -1; 21 | } 22 | 23 | app.use(function(req, res, next) { 24 | res.header("Content-Type", "application/json"); 25 | res.header("Access-Control-Allow-Origin", "*"); 26 | res.header("Access-Control-Allow-Headers", "Origin, Requested-With, Content-Type, Accept"); 27 | next(); 28 | }); 29 | 30 | app.get("/api", function(req, res) { 31 | res.json({ 32 | name: pkgInfo.name, 33 | documentation: "https://github.com/n1k0/readable-proxy/blob/master/README.md", 34 | description: pkgInfo.description, 35 | version: pkgInfo.version 36 | }); 37 | }); 38 | 39 | app.get("/api/get", function(req, res) { 40 | var url = req.query.url, 41 | sanitize = boolArg(req.query.sanitize), 42 | userAgent = req.query.userAgent; 43 | if (!url) { 44 | return res.status(400).json({error: "Missing url parameter"}); 45 | } 46 | function handleError(err) { 47 | console.error(err); 48 | res.status(500).json({error: {message: err.message}}); 49 | } 50 | scrape(url, {userAgent: userAgent}) 51 | .then(function(result) { 52 | if (!result) { 53 | throw new Error("No scraped result received."); 54 | } 55 | 56 | var sanitizedResult = sanitizeResult(result); 57 | var $ = cheerio.load(sanitizedResult.content); 58 | var rawText = $('*').contents().map(function() { 59 | return (this.type === 'text') ? $(this).text() + ' ' : ''; 60 | }).get().join(''); 61 | 62 | result.rawText = rawText.trim(); 63 | 64 | res.json(sanitize ? 
sanitizedResult : result); 65 | }) 66 | .catch(handleError); 67 | }); 68 | 69 | exports.serve = function() { 70 | var server = app.listen(process.env.PORT || 3000, function() { 71 | var host = server.address().address; 72 | var port = server.address().port; 73 | console.log("Server listening at http://%s:%s", host, port); 74 | }); 75 | }; 76 | -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Readability.js test page 6 | 7 | 8 | 15 | 16 | 17 |
18 | 21 |
22 |
23 |
24 |
25 |
26 | 27 |
28 | 29 |
30 |
31 |
32 | 33 |
34 | 35 |
36 |
37 |
38 |
39 |
40 | 41 |
42 |
43 |
44 |
45 |
46 | 47 |
48 |
49 |
50 |
51 |
52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
Readerable?
Title
Dir
Byline
Length
Excerpt
60 |
61 |
62 |
63 |
64 |
65 |
Original
66 | 67 |
68 |
69 |
70 |
71 |
Readable
72 | 73 |
74 |
75 |
76 |
77 |

Console logs

78 | 79 |
80 |
81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /phantom-scrape.js: -------------------------------------------------------------------------------- 1 | var system = require("system"); 2 | var page = require("webpage").create(); 3 | var url = system.args[1]; 4 | var readabilityPath = system.args[2]; 5 | var userAgent = system.args[3]; 6 | var consoleLogs = []; 7 | 8 | // Prevent page js errors to break JSON output 9 | // XXX: should we log these instead? 10 | phantom.onError = page.onError = function(){}; 11 | 12 | function exitWithError(message) { 13 | outputJSON({error: {message: message}}); 14 | phantom.exit(); 15 | } 16 | 17 | function outputJSON(object) { 18 | console.log(JSON.stringify(object, null, 2)); 19 | } 20 | 21 | /** 22 | * Note: This function runs within page environment. 23 | */ 24 | function runReadability(url, userAgent, pageContent) { 25 | var location = document.location; 26 | var uri = { 27 | spec: location.href, 28 | host: location.host, 29 | prePath: location.protocol + "//" + location.host, // TODO This is incomplete, needs username/password and port 30 | scheme: location.protocol.substr(0, location.protocol.indexOf(":")), 31 | pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1) 32 | }; 33 | try { 34 | var readabilityObj = new Readability(uri, document); 35 | var isProbablyReaderable = readabilityObj.isProbablyReaderable(); 36 | var result = readabilityObj.parse(); 37 | if (result) { 38 | result.userAgent = userAgent; 39 | result.isProbablyReaderable = isProbablyReaderable; 40 | } else { 41 | result = { 42 | error: { 43 | message: "Empty result from Readability.js.", 44 | sourceHTML: pageContent || "Empty page content." 
45 | } 46 | }; 47 | } 48 | return result; 49 | } catch (err) { 50 | return { 51 | error: { 52 | message: err.message, 53 | line: err.line, 54 | stack: err.stack, 55 | sourceHTML: pageContent || "Empty page content." 56 | } 57 | }; 58 | } 59 | }; 60 | 61 | if (!url) { 62 | exitWithError("Missing url arg."); 63 | } else if (!readabilityPath) { 64 | exitWithError("Missing readabilityPath arg."); 65 | } 66 | 67 | if (userAgent) { 68 | page.settings.userAgent = userAgent; 69 | } 70 | 71 | // disable loading images as we don't use them 72 | page.settings.loadImages = false; 73 | 74 | // ensure we don't waste time trying to load slow/missing resources 75 | page.settings.resourceTimeout = 5000; 76 | 77 | // if we do timeout a slow resource, say something useful 78 | page.onResourceTimeout = function(request) { 79 | console.log('Response (#' + request.id + '): ' + JSON.stringify(request)); 80 | }; 81 | 82 | page.onConsoleMessage = function(msg) { 83 | consoleLogs.push(msg); 84 | }; 85 | 86 | page.open(url, function(status) { 87 | if (status !== "success") { 88 | return exitWithError("Unable to access " + url); 89 | } 90 | if (!page.injectJs(readabilityPath)) { 91 | exitWithError("Couldn't inject " + readabilityPath); 92 | } 93 | var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content); 94 | if (result && result.error) { 95 | result.error.consoleLogs = consoleLogs; 96 | } else if (result && result.content) { 97 | result.consoleLogs = consoleLogs; 98 | } 99 | outputJSON(result); 100 | phantom.exit(); 101 | }); 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | readable-proxy 2 | ============== 3 | 4 | [![Build Status](https://travis-ci.org/n1k0/readable-proxy.svg?branch=master)](https://travis-ci.org/n1k0/readable-proxy) [![Dependency 
Status](https://www.versioneye.com/user/projects/54f03dfc4f3108d1fa00000c/badge.svg?style=flat)](https://www.versioneye.com/user/projects/54f03dfc4f3108d1fa00000c) 5 | 6 | Proxy server to retrieve a readable version of any provided url, powered by Node, 7 | [PhantomJS](http://phantomjs.org/) and [Readability.js](https://github.com/mozilla/readability). 8 | 9 | Installation 10 | ------------ 11 | 12 | $ git clone https://github.com/n1k0/readable-proxy 13 | $ cd readable-proxy 14 | $ npm install 15 | 16 | Run 17 | --- 18 | 19 | Starts server on `localhost:3000`: 20 | 21 | $ npm start 22 | 23 | Note about CORS: by design, the server will allow any origin to access it, so browsers can consume it from pages hosted on a different domain. 24 | 25 | Configuration 26 | ------------- 27 | 28 | By default, the proxy server will use the Readability.js version it ships with; to override this, you can set the `READABILITY_LIB_PATH` environment variable to the absolute path to the library file on your local system: 29 | 30 | $ READABILITY_LIB_PATH=/path/to/my/own/version/of/Readability.js npm start 31 | 32 | Usage 33 | ----- 34 | 35 | ### Web UI 36 | 37 | Just head to `http://localhost:3000/`, enter some URL and start enjoying both original and readable renderings side by side. 38 | 39 | ![](https://s3.amazonaws.com/f.cl.ly/items/0H2X0o1V2Y240u3L1b06/Screen%20Shot%202015-02-26%20at%2012.33.15.png) 40 | 41 | ### REST/JSON API 42 | 43 | The HTTP REST API is available under `/api`. 44 | 45 | **Disclaimer:** A truly *RESTful* implementation is probably far from being achieved. 46 | 47 | #### `GET /api/get` 48 | 49 | ##### Required parameters 50 | 51 | - `url`: The URL to retrieve readable contents from, eg. `https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/`. 
52 | 53 | ##### Optional parameters 54 | 55 | - `sanitize`: A *boolean string* to enable HTML sanitization (valid truthy boolean strings: "1", "on", "true", "yes", "y"; everything else will be considered falsy). 56 | - `userAgent`: A custom [User Agent](http://en.wikipedia.org/wiki/User_agent) string. By default, it will use the PhantomJS one. 57 | 58 | **Note:** Enabling contents sanitization loses Readability.js specific HTML semantics, though it is probably safer for users if you plan to publish retrieved contents on a public website. 59 | 60 | ##### Example 61 | 62 | Content sanitization enabled: 63 | 64 | $ curl http://0.0.0.0:3000/api/get\?sanitize=y&url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/ 65 | { 66 | "byline":"Nicolas Perriault —", 67 | "content":"

So finally you're testing", 68 | "length":2867, 69 | "title":"Get your Frontend JavaScript Code Covered | Code", 70 | "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", 71 | "isProbablyReaderable": true 72 | } 73 | 74 | Content sanitization disabled (default): 75 | 76 | $ curl http://0.0.0.0:3000/api/get\?url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/ 77 | { 78 | "byline":"Nicolas Perriault —", 79 | "content":"

\n

So finally you're…", 80 | "length":3851, 81 | "title":"Get your Frontend JavaScript Code Covered | Code", 82 | "uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/", 83 | "isProbablyReaderable": true 84 | } 85 | 86 | Note: the `isProbablyReaderable` property tells whether Readability has determined that the page contents were parseable. 87 | 88 | ### Usage from node 89 | 90 | #### scrape() function 91 | 92 | The `scrape` function scrapes a URL and returns a Promise with the JSON result object described above: 93 | 94 | ```js 95 | var scrape = require("readable-proxy").scrape; 96 | var url = "https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/"; 97 | 98 | scrape(url, {sanitize: true, userAgent: "My custom User-Agent string"}) 99 | .then(console.log.bind(console)) 100 | .catch(console.error.bind(console)); 101 | ``` 102 | 103 | Tests 104 | ----- 105 | 106 | $ npm test 107 | 108 | License 109 | ------- 110 | 111 | MPL 2.0. 112 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | var expect = require("chai").expect; 2 | var scrape = require("../scrape"); 3 | var Promise = require("bluebird"); 4 | var sinon = require("sinon"); 5 | var childProcess = require("child_process"); 6 | var app = require("../server").app; 7 | var request = require("supertest"); 8 | 9 | describe("Tests", function() { 10 | var sandbox; 11 | 12 | beforeEach(function() { 13 | sandbox = sinon.sandbox.create(); 14 | }); 15 | 16 | afterEach(function() { 17 | sandbox.restore(); 18 | }); 19 | 20 | describe("scrape", function() { 21 | it("should throw on url arg missing", function() { 22 | expect(scrape).to.Throw(/Missing url./); 23 | }); 24 | 25 | it("should return a promise", function() { 26 | sandbox.stub(childProcess, "execFile"); 27 | 28 | expect(scrape("http://invalid.test/")).to.be.an.instanceOf(Promise); 
29 | }); 30 | 31 | it("should call phantomjs exec with expected args", function() { 32 | sandbox.stub(childProcess, "execFile"); 33 | 34 | scrape("http://invalid.test/"); 35 | 36 | sinon.assert.calledOnce(childProcess.execFile); 37 | expect(childProcess.execFile.getCall(0).args[0]).to.match(/phantomjs/); 38 | expect(childProcess.execFile.getCall(0).args[1]).to.include("http://invalid.test/"); 39 | expect(childProcess.execFile.getCall(0).args[1][2]).to.match(/Readability\.js/); 40 | }); 41 | 42 | it("should handle rejection on process call error", function(done) { 43 | var fakeErr = new Error("Boom"); 44 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 45 | cb(fakeErr); 46 | }); 47 | 48 | scrape("http://invalid.test/").catch(function(err) { 49 | expect(err).eql(fakeErr); 50 | done(); 51 | }); 52 | }); 53 | 54 | it("should reject on stdout json parsing failure", function(done) { 55 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 56 | cb(null, "invalid.json.string"); 57 | }); 58 | 59 | scrape("http://invalid.test/").catch(function(err) { 60 | expect(err.message).to.match(/Unable to parse JSON proxy response/); 61 | done(); 62 | }); 63 | }); 64 | 65 | it("should reject on data extraction error", function(done) { 66 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 67 | cb(null, JSON.stringify({error: {message: "Foo"}})); 68 | }); 69 | 70 | scrape("http://invalid.test/").catch(function(err) { 71 | expect(err).to.be.an.instanceOf(Error); 72 | expect(err.message).eql("Foo"); 73 | done(); 74 | }); 75 | }); 76 | 77 | it("should fulfill with a valid json result", function(done) { 78 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 79 | cb(null, JSON.stringify({title: "plop", content: "plip"})); 80 | }); 81 | 82 | scrape("http://invalid.test/").then(function(result) { 83 | expect(result.title).eql("plop"); 84 | expect(result.content).eql("plip"); 85 | done(); 86 | }); 87 | }); 88 | }); 89 | 90 | 
describe("server.app", function() { 91 | describe("Web UI", function() { 92 | it("should serve Web UI on root endpoint", function(done) { 93 | request(app) 94 | .get("/") 95 | .expect("Content-Type", /text\/html/) 96 | .expect(200, done); 97 | }); 98 | }); 99 | 100 | describe("API", function() { 101 | describe("GET /api", function() { 102 | it("should serve JSON on /api endpoint", function(done) { 103 | request(app) 104 | .get("/api") 105 | .set("Accept", "application/json") 106 | .expect("Content-Type", /application\/json/) 107 | .expect(200, done); 108 | }); 109 | 110 | it("should serve app info on /api endpoint", function(done) { 111 | request(app) 112 | .get("/api") 113 | .set("Accept", "application/json") 114 | .expect("Content-Type", /application\/json/) 115 | .expect(function(res) { 116 | expect(res.body.name).eql("readable-proxy"); 117 | }) 118 | .end(done); 119 | }); 120 | }); 121 | 122 | describe("GET /api/get", function() { 123 | it("should return error if missing url param", function(done) { 124 | request(app) 125 | .get("/api/get") 126 | .expect(400) 127 | .expect(function(res) { 128 | expect(res.body.error).eql("Missing url parameter"); 129 | }) 130 | .end(done); 131 | }); 132 | 133 | it("should return scraped response", function(done) { 134 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 135 | cb(null, JSON.stringify({title: "plop"})); 136 | }); 137 | 138 | request(app) 139 | .get("/api/get?url=http://invalid.test/") 140 | .expect(200) 141 | .expect(function(res) { 142 | expect(res.body.title).eql("plop"); 143 | }) 144 | .end(done); 145 | }); 146 | 147 | it("should return a server error on call error", function(done) { 148 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 149 | cb(null, JSON.stringify({error: {message: "fail"}})); 150 | }); 151 | 152 | request(app) 153 | .get("/api/get?url=http://invalid.test/") 154 | .expect(500) 155 | .expect(function(res) { 156 | expect(res.body.error.message).eql("fail"); 157 
| }) 158 | .end(done); 159 | }); 160 | 161 | it("should apply custom user agent when provided", function(done) { 162 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 163 | cb(null, "{}"); 164 | }); 165 | 166 | request(app) 167 | .get("/api/get?url=http://invalid.test/&userAgent=custom+ua") 168 | .expect(200) 169 | .expect(function() { 170 | expect(childProcess.execFile.getCall(0).args[1]).to.contain("custom ua"); 171 | }) 172 | .end(done); 173 | }); 174 | 175 | it("should return sanitized response when sanitize arg is passed", function(done) { 176 | sandbox.stub(childProcess, "execFile", function(exec, args, cb) { 177 | cb(null, JSON.stringify({content: "

plop

"})); 178 | }); 179 | 180 | request(app) 181 | .get("/api/get?sanitize=1&url=http://invalid.test/") 182 | .expect(200) 183 | .expect(function(res) { 184 | expect(res.body.content).eql("

plop

"); 185 | expect(res.body.rawText).eql("plop"); 186 | }) 187 | .end(done); 188 | }); 189 | }); 190 | }); 191 | }); 192 | }); 193 | -------------------------------------------------------------------------------- /vendor/Readability.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Arc90 Inc 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * This code is heavily based on Arc90's readability.js (1.7.1) script 19 | * available at: http://code.google.com/p/arc90labs-readability 20 | */ 21 | var root = this; 22 | 23 | /** 24 | * Public constructor. 25 | * @param {Object} uri The URI descriptor object. 26 | * @param {HTMLDocument} doc The document to parse. 27 | * @param {Object} options The options object. 
28 | */ 29 | var Readability = function(uri, doc, options) { 30 | options = options || {}; 31 | 32 | this._uri = uri; 33 | this._doc = doc; 34 | this._biggestFrame = false; 35 | this._articleByline = null; 36 | this._articleDir = null; 37 | 38 | // Configureable options 39 | this._debug = !!options.debug; 40 | this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; 41 | this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; 42 | this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES; 43 | 44 | // Start with all flags set 45 | this._flags = this.FLAG_STRIP_UNLIKELYS | 46 | this.FLAG_WEIGHT_CLASSES | 47 | this.FLAG_CLEAN_CONDITIONALLY; 48 | 49 | // The list of pages we've parsed in this call of readability, 50 | // for autopaging. As a key store for easier searching. 51 | this._parsedPages = {}; 52 | 53 | // A list of the ETag headers of pages we've parsed, in case they happen to match, 54 | // we'll know it's a duplicate. 55 | this._pageETags = {}; 56 | 57 | // Make an AJAX request for each page and append it to the document. 58 | this._curPageNum = 1; 59 | 60 | // Control whether log messages are sent to the console 61 | if (this._debug) { 62 | function logEl(e) { 63 | var rv = e.nodeName + " "; 64 | if (e.nodeType == e.TEXT_NODE) { 65 | return rv + '("' + e.textContent + '")'; 66 | } 67 | var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); 68 | var elDesc = e.id ? "(#" + e.id + classDesc + ")" : 69 | (classDesc ? "(" + classDesc + ")" : ""); 70 | return rv + elDesc; 71 | } 72 | this.log = function () { 73 | if ("dump" in root) { 74 | var msg = Array.prototype.map.call(arguments, function(x) { 75 | return (x && x.nodeName) ? 
logEl(x) : x; 76 | }).join(" "); 77 | dump("Reader: (Readability) " + msg + "\n"); 78 | } else if ("console" in root) { 79 | var args = ["Reader: (Readability) "].concat(arguments); 80 | console.log.apply(console, args); 81 | } 82 | }; 83 | } else { 84 | this.log = function () {}; 85 | } 86 | } 87 | 88 | Readability.prototype = { 89 | FLAG_STRIP_UNLIKELYS: 0x1, 90 | FLAG_WEIGHT_CLASSES: 0x2, 91 | FLAG_CLEAN_CONDITIONALLY: 0x4, 92 | 93 | // Max number of nodes supported by this parser. Default: 0 (no limit) 94 | DEFAULT_MAX_ELEMS_TO_PARSE: 0, 95 | 96 | // The number of top candidates to consider when analysing how 97 | // tight the competition is among candidates. 98 | DEFAULT_N_TOP_CANDIDATES: 5, 99 | 100 | // The maximum number of pages to loop through before we call 101 | // it quits and just show a link. 102 | DEFAULT_MAX_PAGES: 5, 103 | 104 | // Element tags to score by default. 105 | DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), 106 | 107 | // All of the regular expressions in use within readability. 108 | // Defined up here so we don't instantiate them repeatedly in loops. 
109 | REGEXPS: { 110 | unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i, 111 | okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 112 | positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 113 | negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, 114 | extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 115 | byline: /byline|author|dateline|writtenby/i, 116 | replaceFonts: /<(\/?)font[^>]*>/gi, 117 | normalize: /\s{2,}/g, 118 | videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, 119 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 120 | prevLink: /(prev|earl|old|new|<|«)/i, 121 | whitespace: /^\s*$/, 122 | hasContent: /\S$/, 123 | }, 124 | 125 | DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], 126 | 127 | ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], 128 | 129 | /** 130 | * Run any post-process modifications to article content as necessary. 131 | * 132 | * @param Element 133 | * @return void 134 | **/ 135 | _postProcessContent: function(articleContent) { 136 | // Readability cannot open relative uris so we convert them to absolute uris. 137 | this._fixRelativeUris(articleContent); 138 | }, 139 | 140 | /** 141 | * Iterate over a NodeList, which doesn't natively fully implement the Array 142 | * interface. 143 | * 144 | * For convenience, the current object context is applied to the provided 145 | * iterate function. 146 | * 147 | * @param NodeList nodeList The NodeList. 148 | * @param Function fn The iterate function. 
149 | * @return void 150 | */ 151 | _forEachNode: function(nodeList, fn) { 152 | return Array.prototype.forEach.call(nodeList, fn, this); 153 | }, 154 | 155 | /** 156 | * Iterate over a NodeList, return true if any of the provided iterate 157 | * function calls returns true, false otherwise. 158 | * 159 | * For convenience, the current object context is applied to the 160 | * provided iterate function. 161 | * 162 | * @param NodeList nodeList The NodeList. 163 | * @param Function fn The iterate function. 164 | * @return Boolean 165 | */ 166 | _someNode: function(nodeList, fn) { 167 | return Array.prototype.some.call(nodeList, fn, this); 168 | }, 169 | 170 | /** 171 | * Concat all nodelists passed as arguments. 172 | * 173 | * @return ...NodeList 174 | * @return Array 175 | */ 176 | _concatNodeLists: function() { 177 | var slice = Array.prototype.slice; 178 | var args = slice.call(arguments); 179 | var nodeLists = args.map(function(list) { 180 | return slice.call(list); 181 | }); 182 | return Array.prototype.concat.apply([], nodeLists); 183 | }, 184 | 185 | _getAllNodesWithTag: function(node, tagNames) { 186 | if (node.querySelectorAll) { 187 | return node.querySelectorAll(tagNames.join(',')); 188 | } 189 | return [].concat.apply([], tagNames.map(function(tag) { 190 | return node.getElementsByTagName(tag); 191 | })); 192 | }, 193 | 194 | /** 195 | * Converts each
and uri in the given element to an absolute URI. 196 | * 197 | * @param Element 198 | * @return void 199 | */ 200 | _fixRelativeUris: function(articleContent) { 201 | var scheme = this._uri.scheme; 202 | var prePath = this._uri.prePath; 203 | var pathBase = this._uri.pathBase; 204 | 205 | function toAbsoluteURI(uri) { 206 | // If this is already an absolute URI, return it. 207 | if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) 208 | return uri; 209 | 210 | // Scheme-rooted relative URI. 211 | if (uri.substr(0, 2) == "//") 212 | return scheme + "://" + uri.substr(2); 213 | 214 | // Prepath-rooted relative URI. 215 | if (uri[0] == "/") 216 | return prePath + uri; 217 | 218 | // Dotslash relative URI. 219 | if (uri.indexOf("./") === 0) 220 | return pathBase + uri.slice(2); 221 | 222 | // Standard relative URI; add entire path. pathBase already includes a 223 | // trailing "/". 224 | return pathBase + uri; 225 | } 226 | 227 | var links = articleContent.getElementsByTagName("a"); 228 | this._forEachNode(links, function(link) { 229 | var href = link.getAttribute("href"); 230 | if (href) { 231 | // Replace links with javascript: URIs with text content, since 232 | // they won't work after scripts have been removed from the page. 233 | if (href.indexOf("javascript:") === 0) { 234 | var text = this._doc.createTextNode(link.textContent); 235 | link.parentNode.replaceChild(text, link); 236 | } else { 237 | link.setAttribute("href", toAbsoluteURI(href)); 238 | } 239 | } 240 | }); 241 | 242 | var imgs = articleContent.getElementsByTagName("img"); 243 | this._forEachNode(imgs, function(img) { 244 | var src = img.getAttribute("src"); 245 | if (src) { 246 | img.setAttribute("src", toAbsoluteURI(src)); 247 | } 248 | }); 249 | }, 250 | 251 | /** 252 | * Get the article title as an H1. 
253 | * 254 | * @return void 255 | **/ 256 | _getArticleTitle: function() { 257 | var doc = this._doc; 258 | var curTitle = ""; 259 | var origTitle = ""; 260 | 261 | try { 262 | curTitle = origTitle = doc.title; 263 | 264 | // If they had an element with id "title" in their HTML 265 | if (typeof curTitle !== "string") 266 | curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); 267 | } catch(e) {} 268 | 269 | if (curTitle.match(/ [\|\-] /)) { 270 | curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 271 | 272 | if (curTitle.split(' ').length < 3) 273 | curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 274 | } else if (curTitle.indexOf(': ') !== -1) { 275 | // Check if we have an heading containing this exact string, so we 276 | // could assume it's the full title. 277 | var headings = this._concatNodeLists( 278 | doc.getElementsByTagName('h1'), 279 | doc.getElementsByTagName('h2') 280 | ); 281 | var match = this._someNode(headings, function(heading) { 282 | return heading.textContent === curTitle; 283 | }); 284 | 285 | // If we don't, let's extract the title out of the original title string. 286 | if (!match) { 287 | curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); 288 | 289 | // If the title is now too short, try the first colon instead: 290 | if (curTitle.split(' ').length < 3) 291 | curTitle = origTitle.substring(origTitle.indexOf(':') + 1); 292 | } 293 | } else if (curTitle.length > 150 || curTitle.length < 15) { 294 | var hOnes = doc.getElementsByTagName('h1'); 295 | 296 | if (hOnes.length === 1) 297 | curTitle = this._getInnerText(hOnes[0]); 298 | } 299 | 300 | curTitle = curTitle.trim(); 301 | 302 | if (curTitle.split(' ').length <= 4) 303 | curTitle = origTitle; 304 | 305 | return curTitle; 306 | }, 307 | 308 | /** 309 | * Prepare the HTML document for readability to scrape it. 310 | * This includes things like stripping javascript, CSS, and handling terrible markup. 
311 | * 312 | * @return void 313 | **/ 314 | _prepDocument: function() { 315 | var doc = this._doc; 316 | 317 | // Remove all style tags in head 318 | this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) { 319 | styleNode.parentNode.removeChild(styleNode); 320 | }); 321 | 322 | if (doc.body) { 323 | this._replaceBrs(doc.body); 324 | } 325 | 326 | this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) { 327 | this._setNodeTag(fontNode, "SPAN"); 328 | }); 329 | }, 330 | 331 | /** 332 | * Finds the next element, starting from the given node, and ignoring 333 | * whitespace in between. If the given node is an element, the same node is 334 | * returned. 335 | */ 336 | _nextElement: function (node) { 337 | var next = node; 338 | while (next 339 | && (next.nodeType != Node.ELEMENT_NODE) 340 | && this.REGEXPS.whitespace.test(next.textContent)) { 341 | next = next.nextSibling; 342 | } 343 | return next; 344 | }, 345 | 346 | /** 347 | * Replaces 2 or more successive
elements with a single

. 348 | * Whitespace between
elements are ignored. For example: 349 | *

foo
bar


abc
350 | * will become: 351 | *
foo
bar

abc

352 | */ 353 | _replaceBrs: function (elem) { 354 | this._forEachNode(elem.getElementsByTagName("br"), function(br) { 355 | var next = br.nextSibling; 356 | 357 | // Whether 2 or more
elements have been found and replaced with a 358 | //

block. 359 | var replaced = false; 360 | 361 | // If we find a
chain, remove the
s until we hit another element 362 | // or non-whitespace. This leaves behind the first
in the chain 363 | // (which will be replaced with a

later). 364 | while ((next = this._nextElement(next)) && (next.tagName == "BR")) { 365 | replaced = true; 366 | var sibling = next.nextSibling; 367 | next.parentNode.removeChild(next); 368 | next = sibling; 369 | } 370 | 371 | // If we removed a
chain, replace the remaining
with a

. Add 372 | // all sibling nodes as children of the

until we hit another
373 | // chain. 374 | if (replaced) { 375 | var p = this._doc.createElement("p"); 376 | br.parentNode.replaceChild(p, br); 377 | 378 | next = p.nextSibling; 379 | while (next) { 380 | // If we've hit another

, we're done adding children to this

. 381 | if (next.tagName == "BR") { 382 | var nextElem = this._nextElement(next); 383 | if (nextElem && nextElem.tagName == "BR") 384 | break; 385 | } 386 | 387 | // Otherwise, make this node a child of the new

. 388 | var sibling = next.nextSibling; 389 | p.appendChild(next); 390 | next = sibling; 391 | } 392 | } 393 | }); 394 | }, 395 | 396 | _setNodeTag: function (node, tag) { 397 | this.log("_setNodeTag", node, tag); 398 | if (node.__JSDOMParser__) { 399 | node.localName = tag.toLowerCase(); 400 | node.tagName = tag.toUpperCase(); 401 | return node; 402 | } 403 | 404 | var replacement = node.ownerDocument.createElement(tag); 405 | while (node.firstChild) { 406 | replacement.appendChild(node.firstChild); 407 | } 408 | node.parentNode.replaceChild(replacement, node); 409 | if (node.readability) 410 | replacement.readability = node.readability; 411 | 412 | for (var i = 0; i < node.attributes.length; i++) { 413 | replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); 414 | } 415 | return replacement; 416 | }, 417 | 418 | /** 419 | * Prepare the article node for display. Clean out any inline styles, 420 | * iframes, forms, strip extraneous

tags, etc. 421 | * 422 | * @param Element 423 | * @return void 424 | **/ 425 | _prepArticle: function(articleContent) { 426 | this._cleanStyles(articleContent); 427 | 428 | // Clean out junk from the article content 429 | this._cleanConditionally(articleContent, "form"); 430 | this._clean(articleContent, "object"); 431 | this._clean(articleContent, "embed"); 432 | this._clean(articleContent, "h1"); 433 | this._clean(articleContent, "footer"); 434 | 435 | // If there is only one h2, they are probably using it as a header 436 | // and not a subheader, so remove it since we already have a header. 437 | if (articleContent.getElementsByTagName('h2').length === 1) 438 | this._clean(articleContent, "h2"); 439 | 440 | this._clean(articleContent, "iframe"); 441 | this._cleanHeaders(articleContent); 442 | 443 | // Do these last as the previous stuff may have removed junk 444 | // that will affect these 445 | this._cleanConditionally(articleContent, "table"); 446 | this._cleanConditionally(articleContent, "ul"); 447 | this._cleanConditionally(articleContent, "div"); 448 | 449 | // Remove extra paragraphs 450 | this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) { 451 | var imgCount = paragraph.getElementsByTagName('img').length; 452 | var embedCount = paragraph.getElementsByTagName('embed').length; 453 | var objectCount = paragraph.getElementsByTagName('object').length; 454 | // At this point, nasty iframes have been removed, only remain embedded video ones. 
455 | var iframeCount = paragraph.getElementsByTagName('iframe').length; 456 | var totalCount = imgCount + embedCount + objectCount + iframeCount; 457 | 458 | if (totalCount === 0 && !this._getInnerText(paragraph, false)) 459 | paragraph.parentNode.removeChild(paragraph); 460 | }); 461 | 462 | this._forEachNode(articleContent.getElementsByTagName("br"), function(br) { 463 | var next = this._nextElement(br.nextSibling); 464 | if (next && next.tagName == "P") 465 | br.parentNode.removeChild(br); 466 | }); 467 | }, 468 | 469 | /** 470 | * Initialize a node with the readability object. Also checks the 471 | * className/id for special names to add to its score. 472 | * 473 | * @param Element 474 | * @return void 475 | **/ 476 | _initializeNode: function(node) { 477 | node.readability = {"contentScore": 0}; 478 | 479 | switch(node.tagName) { 480 | case 'DIV': 481 | node.readability.contentScore += 5; 482 | break; 483 | 484 | case 'PRE': 485 | case 'TD': 486 | case 'BLOCKQUOTE': 487 | node.readability.contentScore += 3; 488 | break; 489 | 490 | case 'ADDRESS': 491 | case 'OL': 492 | case 'UL': 493 | case 'DL': 494 | case 'DD': 495 | case 'DT': 496 | case 'LI': 497 | case 'FORM': 498 | node.readability.contentScore -= 3; 499 | break; 500 | 501 | case 'H1': 502 | case 'H2': 503 | case 'H3': 504 | case 'H4': 505 | case 'H5': 506 | case 'H6': 507 | case 'TH': 508 | node.readability.contentScore -= 5; 509 | break; 510 | } 511 | 512 | node.readability.contentScore += this._getClassWeight(node); 513 | }, 514 | 515 | _removeAndGetNext: function(node) { 516 | var nextNode = this._getNextNode(node, true); 517 | node.parentNode.removeChild(node); 518 | return nextNode; 519 | }, 520 | 521 | /** 522 | * Traverse the DOM from node to node, starting at the node passed in. 523 | * Pass true for the second parameter to indicate this node itself 524 | * (and its kids) are going away, and we want the next node over. 525 | * 526 | * Calling this in a loop will traverse the DOM depth-first. 
527 | */ 528 | _getNextNode: function(node, ignoreSelfAndKids) { 529 | // First check for kids if those aren't being ignored 530 | if (!ignoreSelfAndKids && node.firstElementChild) { 531 | return node.firstElementChild; 532 | } 533 | // Then for siblings... 534 | if (node.nextElementSibling) { 535 | return node.nextElementSibling; 536 | } 537 | // And finally, move up the parent chain *and* find a sibling 538 | // (because this is depth-first traversal, we will have already 539 | // seen the parent nodes themselves). 540 | do { 541 | node = node.parentNode; 542 | } while (node && !node.nextElementSibling); 543 | return node && node.nextElementSibling; 544 | }, 545 | 546 | /** 547 | * Like _getNextNode, but for DOM implementations with no 548 | * firstElementChild/nextElementSibling functionality... 549 | */ 550 | _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { 551 | function nextSiblingEl(n) { 552 | do { 553 | n = n.nextSibling; 554 | } while (n && n.nodeType !== n.ELEMENT_NODE); 555 | return n; 556 | } 557 | // First check for kids if those aren't being ignored 558 | if (!ignoreSelfAndKids && node.children[0]) { 559 | return node.children[0]; 560 | } 561 | // Then for siblings... 562 | var next = nextSiblingEl(node); 563 | if (next) { 564 | return next; 565 | } 566 | // And finally, move up the parent chain *and* find a sibling 567 | // (because this is depth-first traversal, we will have already 568 | // seen the parent nodes themselves). 
569 | do { 570 | node = node.parentNode; 571 | if (node) 572 | next = nextSiblingEl(node); 573 | } while (node && !next); 574 | return node && next; 575 | }, 576 | 577 | _checkByline: function(node, matchString) { 578 | if (this._articleByline) { 579 | return false; 580 | } 581 | 582 | if (node.getAttribute !== undefined) { 583 | var rel = node.getAttribute("rel"); 584 | } 585 | 586 | if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { 587 | this._articleByline = node.textContent.trim(); 588 | return true; 589 | } 590 | 591 | return false; 592 | }, 593 | 594 | _getNodeAncestors: function(node, maxDepth) { 595 | maxDepth = maxDepth || 0; 596 | var i = 0, ancestors = []; 597 | while (node.parentNode) { 598 | ancestors.push(node.parentNode) 599 | if (maxDepth && ++i === maxDepth) 600 | break; 601 | node = node.parentNode; 602 | } 603 | return ancestors; 604 | }, 605 | 606 | /*** 607 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 608 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 609 | * 610 | * @param page a document to run upon. Needs to be a full document, complete with body. 611 | * @return Element 612 | **/ 613 | _grabArticle: function (page) { 614 | this.log("**** grabArticle ****"); 615 | var doc = this._doc; 616 | var isPaging = (page !== null ? true: false); 617 | page = page ? page : this._doc.body; 618 | 619 | // We can't grab an article if we don't have a page! 620 | if (!page) { 621 | this.log("No body found in document. Abort."); 622 | return null; 623 | } 624 | 625 | var pageCacheHtml = page.innerHTML; 626 | 627 | // Check if any "dir" is set on the toplevel document element 628 | this._articleDir = doc.documentElement.getAttribute("dir"); 629 | 630 | while (true) { 631 | var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); 632 | 633 | // First, node prepping. 
Trash nodes that look cruddy (like ones with the 634 | // class name "comment", etc), and turn divs into P tags where they have been 635 | // used inappropriately (as in, where they contain no other block level elements.) 636 | var elementsToScore = []; 637 | var node = this._doc.documentElement; 638 | 639 | while (node) { 640 | var matchString = node.className + " " + node.id; 641 | 642 | // Check to see if this node is a byline, and remove it if it is. 643 | if (this._checkByline(node, matchString)) { 644 | node = this._removeAndGetNext(node); 645 | continue; 646 | } 647 | 648 | // Remove unlikely candidates 649 | if (stripUnlikelyCandidates) { 650 | if (this.REGEXPS.unlikelyCandidates.test(matchString) && 651 | !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 652 | node.tagName !== "BODY" && 653 | node.tagName !== "A") { 654 | this.log("Removing unlikely candidate - " + matchString); 655 | node = this._removeAndGetNext(node); 656 | continue; 657 | } 658 | } 659 | 660 | if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { 661 | elementsToScore.push(node); 662 | } 663 | 664 | // Turn all divs that don't have children block level elements into p's 665 | if (node.tagName === "DIV") { 666 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV 667 | // element. DIVs with only a P element inside and no text content can be 668 | // safely converted into plain P elements to avoid confusing the scoring 669 | // algorithm with DIVs with are, in practice, paragraphs. 
670 | if (this._hasSinglePInsideElement(node)) { 671 | var newNode = node.children[0]; 672 | node.parentNode.replaceChild(newNode, node); 673 | node = newNode; 674 | } else if (!this._hasChildBlockElement(node)) { 675 | node = this._setNodeTag(node, "P"); 676 | elementsToScore.push(node); 677 | } else { 678 | // EXPERIMENTAL 679 | this._forEachNode(node.childNodes, function(childNode) { 680 | if (childNode.nodeType === Node.TEXT_NODE) { 681 | var p = doc.createElement('p'); 682 | p.textContent = childNode.textContent; 683 | p.style.display = 'inline'; 684 | p.className = 'readability-styled'; 685 | node.replaceChild(p, childNode); 686 | } 687 | }); 688 | } 689 | } 690 | node = this._getNextNode(node); 691 | } 692 | 693 | /** 694 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. 695 | * Then add their score to their parent node. 696 | * 697 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 698 | **/ 699 | var candidates = []; 700 | this._forEachNode(elementsToScore, function(elementToScore) { 701 | if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined') 702 | return; 703 | 704 | // If this paragraph is less than 25 characters, don't even count it. 705 | var innerText = this._getInnerText(elementToScore); 706 | if (innerText.length < 25) 707 | return; 708 | 709 | // Exclude nodes with no ancestor. 710 | var ancestors = this._getNodeAncestors(elementToScore, 3); 711 | if (ancestors.length === 0) 712 | return; 713 | 714 | var contentScore = 0; 715 | 716 | // Add a point for the paragraph itself as a base. 717 | contentScore += 1; 718 | 719 | // Add points for any commas within this paragraph. 720 | contentScore += innerText.split(',').length; 721 | 722 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 
723 | contentScore += Math.min(Math.floor(innerText.length / 100), 3); 724 | 725 | // Initialize and score ancestors. 726 | this._forEachNode(ancestors, function(ancestor, level) { 727 | if (!ancestor.tagName) 728 | return; 729 | 730 | if (typeof(ancestor.readability) === 'undefined') { 731 | this._initializeNode(ancestor); 732 | candidates.push(ancestor); 733 | } 734 | 735 | // Node score divider: 736 | // - parent: 1 (no division) 737 | // - grandparent: 2 738 | // - great grandparent+: ancestor level * 3 739 | var scoreDivider = level === 0 ? 1 : level === 1 ? 2 : level * 3; 740 | ancestor.readability.contentScore += contentScore / scoreDivider; 741 | }); 742 | }); 743 | 744 | // After we've calculated scores, loop through all of the possible 745 | // candidate nodes we found and find the one with the highest score. 746 | var topCandidates = []; 747 | for (var c = 0, cl = candidates.length; c < cl; c += 1) { 748 | var candidate = candidates[c]; 749 | 750 | // Scale the final candidates score based on link density. Good content 751 | // should have a relatively small link density (5% or less) and be mostly 752 | // unaffected by this operation. 753 | var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); 754 | candidate.readability.contentScore = candidateScore; 755 | 756 | this.log('Candidate:', candidate, "with score " + candidateScore); 757 | 758 | for (var t = 0; t < this._nbTopCandidates; t++) { 759 | var aTopCandidate = topCandidates[t]; 760 | 761 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { 762 | topCandidates.splice(t, 0, candidate); 763 | if (topCandidates.length > this._nbTopCandidates) 764 | topCandidates.pop(); 765 | break; 766 | } 767 | } 768 | } 769 | 770 | var topCandidate = topCandidates[0] || null; 771 | var neededToCreateTopCandidate = false; 772 | 773 | // If we still have no top candidate, just use the body as a last resort. 
774 | // We also have to copy the body node so it is something we can modify. 775 | if (topCandidate === null || topCandidate.tagName === "BODY") { 776 | // Move all of the page's children into topCandidate 777 | topCandidate = doc.createElement("DIV"); 778 | neededToCreateTopCandidate = true; 779 | // Move everything (not just elements, also text nodes etc.) into the container 780 | // so we even include text directly in the body: 781 | var kids = page.childNodes; 782 | while (kids.length) { 783 | this.log("Moving child out:", kids[0]); 784 | topCandidate.appendChild(kids[0]); 785 | } 786 | 787 | page.appendChild(topCandidate); 788 | 789 | this._initializeNode(topCandidate); 790 | } else if (topCandidate) { 791 | // Because of our bonus system, parents of candidates might have scores 792 | // themselves. They get half of the node. There won't be nodes with higher 793 | // scores than our topCandidate, but if we see the score going *up* in the first 794 | // few steps up the tree, that's a decent sign that there might be more content 795 | // lurking in other places that we want to unify in. The sibling stuff 796 | // below does some of that - but only if we've looked high enough up the DOM 797 | // tree. 798 | var parentOfTopCandidate = topCandidate.parentNode; 799 | var lastScore = topCandidate.readability.contentScore; 800 | // The scores shouldn't get too low. 801 | var scoreThreshold = lastScore / 3; 802 | while (parentOfTopCandidate && parentOfTopCandidate.readability) { 803 | var parentScore = parentOfTopCandidate.readability.contentScore; 804 | if (parentScore < scoreThreshold) 805 | break; 806 | if (parentScore > lastScore) { 807 | // Alright! We found a better parent to use. 
808 | topCandidate = parentOfTopCandidate; 809 | break; 810 | } 811 | lastScore = parentOfTopCandidate.readability.contentScore; 812 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 813 | } 814 | } 815 | 816 | // Now that we have the top candidate, look through its siblings for content 817 | // that might also be related. Things like preambles, content split by ads 818 | // that we removed, etc. 819 | var articleContent = doc.createElement("DIV"); 820 | if (isPaging) 821 | articleContent.id = "readability-content"; 822 | 823 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 824 | var siblings = topCandidate.parentNode.children; 825 | 826 | for (var s = 0, sl = siblings.length; s < sl; s++) { 827 | var sibling = siblings[s]; 828 | var append = false; 829 | 830 | this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); 831 | this.log("Sibling has score", sibling.readability ? 
sibling.readability.contentScore : 'Unknown'); 832 | 833 | if (sibling === topCandidate) { 834 | append = true; 835 | } else { 836 | var contentBonus = 0; 837 | 838 | // Give a bonus if sibling nodes and top candidates have the example same classname 839 | if (sibling.className === topCandidate.className && topCandidate.className !== "") 840 | contentBonus += topCandidate.readability.contentScore * 0.2; 841 | 842 | if (sibling.readability && 843 | ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { 844 | append = true; 845 | } else if (sibling.nodeName === "P") { 846 | var linkDensity = this._getLinkDensity(sibling); 847 | var nodeContent = this._getInnerText(sibling); 848 | var nodeLength = nodeContent.length; 849 | 850 | if (nodeLength > 80 && linkDensity < 0.25) { 851 | append = true; 852 | } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { 853 | append = true; 854 | } 855 | } 856 | } 857 | 858 | if (append) { 859 | this.log("Appending node:", sibling); 860 | 861 | if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { 862 | // We have a node that isn't a common block level element, like a form or td tag. 863 | // Turn it into a div so it doesn't get filtered out later by accident. 864 | this.log("Altering sibling:", sibling, 'to div.'); 865 | 866 | sibling = this._setNodeTag(sibling, "DIV"); 867 | } 868 | 869 | articleContent.appendChild(sibling); 870 | // siblings is a reference to the children array, and 871 | // sibling is removed from the array when we call appendChild(). 872 | // As a result, we must revisit this index since the nodes 873 | // have been shifted. 874 | s -= 1; 875 | sl -= 1; 876 | } 877 | } 878 | 879 | if (this._debug) 880 | this.log("Article content pre-prep: " + articleContent.innerHTML); 881 | // So we have all of the content that we need. Now we clean it up for presentation. 
882 | this._prepArticle(articleContent); 883 | if (this._debug) 884 | this.log("Article content post-prep: " + articleContent.innerHTML); 885 | 886 | if (this._curPageNum === 1) { 887 | if (neededToCreateTopCandidate) { 888 | // We already created a fake div thing, and there wouldn't have been any siblings left 889 | // for the previous loop, so there's no point trying to create a new div, and then 890 | // move all the children over. Just assign IDs and class names here. No need to append 891 | // because that already happened anyway. 892 | topCandidate.id = "readability-page-1"; 893 | topCandidate.className = "page"; 894 | } else { 895 | var div = doc.createElement("DIV"); 896 | div.id = "readability-page-1"; 897 | div.className = "page"; 898 | var children = articleContent.childNodes; 899 | while (children.length) { 900 | div.appendChild(children[0]); 901 | } 902 | articleContent.appendChild(div); 903 | } 904 | } 905 | 906 | if (this._debug) 907 | this.log("Article content after paging: " + articleContent.innerHTML); 908 | 909 | // Now that we've gone through the full algorithm, check to see if 910 | // we got any meaningful content. If we didn't, we may need to re-run 911 | // grabArticle with different flags set. This gives us a higher likelihood of 912 | // finding the content, and the sieve approach gives us a higher likelihood of 913 | // finding the -right- content. 
914 | if (this._getInnerText(articleContent, true).length < 500) { 915 | page.innerHTML = pageCacheHtml; 916 | 917 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 918 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 919 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 920 | this._removeFlag(this.FLAG_WEIGHT_CLASSES); 921 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 922 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 923 | } else { 924 | return null; 925 | } 926 | } else { 927 | return articleContent; 928 | } 929 | } 930 | }, 931 | 932 | /** 933 | * Check whether the input string could be a byline. 934 | * This verifies that the input is a string, and that the length 935 | * is less than 100 chars. 936 | * 937 | * @param possibleByline {string} - a string to check whether its a byline. 938 | * @return Boolean - whether the input string is a byline. 939 | */ 940 | _isValidByline: function(byline) { 941 | if (typeof byline == 'string' || byline instanceof String) { 942 | byline = byline.trim(); 943 | return (byline.length > 0) && (byline.length < 100); 944 | } 945 | return false; 946 | }, 947 | 948 | /** 949 | * Attempts to get excerpt and byline metadata for the article. 950 | * 951 | * @return Object with optional "excerpt" and "byline" properties 952 | */ 953 | _getArticleMetadata: function() { 954 | var metadata = {}; 955 | var values = {}; 956 | var metaElements = this._doc.getElementsByTagName("meta"); 957 | 958 | // Match "description", or Twitter's "twitter:description" (Cards) 959 | // in name attribute. 960 | var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; 961 | 962 | // Match Facebook's Open Graph title & description properties. 963 | var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; 964 | 965 | // Find description tags. 
966 | this._forEachNode(metaElements, function(element) { 967 | var elementName = element.getAttribute("name"); 968 | var elementProperty = element.getAttribute("property"); 969 | 970 | if ([elementName, elementProperty].indexOf("author") !== -1) { 971 | metadata.byline = element.getAttribute("content"); 972 | return; 973 | } 974 | 975 | var name = null; 976 | if (namePattern.test(elementName)) { 977 | name = elementName; 978 | } else if (propertyPattern.test(elementProperty)) { 979 | name = elementProperty; 980 | } 981 | 982 | if (name) { 983 | var content = element.getAttribute("content"); 984 | if (content) { 985 | // Convert to lowercase and remove any whitespace 986 | // so we can match below. 987 | name = name.toLowerCase().replace(/\s/g, ''); 988 | values[name] = content.trim(); 989 | } 990 | } 991 | }); 992 | 993 | if ("description" in values) { 994 | metadata.excerpt = values["description"]; 995 | } else if ("og:description" in values) { 996 | // Use facebook open graph description. 997 | metadata.excerpt = values["og:description"]; 998 | } else if ("twitter:description" in values) { 999 | // Use twitter cards description. 1000 | metadata.excerpt = values["twitter:description"]; 1001 | } 1002 | 1003 | if ("og:title" in values) { 1004 | // Use facebook open graph title. 1005 | metadata.title = values["og:title"]; 1006 | } else if ("twitter:title" in values) { 1007 | // Use twitter cards title. 1008 | metadata.title = values["twitter:title"]; 1009 | } 1010 | 1011 | return metadata; 1012 | }, 1013 | 1014 | /** 1015 | * Removes script tags from the document. 
1016 | * 1017 | * @param Element 1018 | **/ 1019 | _removeScripts: function(doc) { 1020 | this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) { 1021 | scriptNode.nodeValue = ""; 1022 | scriptNode.removeAttribute('src'); 1023 | 1024 | if (scriptNode.parentNode) 1025 | scriptNode.parentNode.removeChild(scriptNode); 1026 | }); 1027 | this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) { 1028 | if (noscriptNode.parentNode) 1029 | noscriptNode.parentNode.removeChild(noscriptNode); 1030 | }); 1031 | }, 1032 | 1033 | /** 1034 | * Check if this node has only whitespace and a single P element 1035 | * Returns false if the DIV node contains non-empty text nodes 1036 | * or if it contains no P or more than 1 element. 1037 | * 1038 | * @param Element 1039 | **/ 1040 | _hasSinglePInsideElement: function(element) { 1041 | // There should be exactly 1 element child which is a P: 1042 | if (element.children.length != 1 || element.children[0].tagName !== "P") { 1043 | return false; 1044 | } 1045 | 1046 | // And there should be no text nodes with real content 1047 | return !this._someNode(element.childNodes, function(node) { 1048 | return node.nodeType === Node.TEXT_NODE && 1049 | this.REGEXPS.hasContent.test(node.textContent); 1050 | }); 1051 | }, 1052 | 1053 | /** 1054 | * Determine whether element has any children block level elements. 1055 | * 1056 | * @param Element 1057 | */ 1058 | _hasChildBlockElement: function (element) { 1059 | return this._someNode(element.childNodes, function(node) { 1060 | return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || 1061 | this._hasChildBlockElement(node); 1062 | }); 1063 | }, 1064 | 1065 | /** 1066 | * Get the inner text of a node - cross browser compatibly. 1067 | * This also strips out any excess whitespace to be found. 
1068 | * 1069 | * @param Element 1070 | * @param Boolean normalizeSpaces (default: true) 1071 | * @return string 1072 | **/ 1073 | _getInnerText: function(e, normalizeSpaces) { 1074 | normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; 1075 | var textContent = e.textContent.trim(); 1076 | 1077 | if (normalizeSpaces) { 1078 | return textContent.replace(this.REGEXPS.normalize, " "); 1079 | } else { 1080 | return textContent; 1081 | } 1082 | }, 1083 | 1084 | /** 1085 | * Get the number of times a string s appears in the node e. 1086 | * 1087 | * @param Element 1088 | * @param string - what to split on. Default is "," 1089 | * @return number (integer) 1090 | **/ 1091 | _getCharCount: function(e,s) { 1092 | s = s || ","; 1093 | return this._getInnerText(e).split(s).length - 1; 1094 | }, 1095 | 1096 | /** 1097 | * Remove the style attribute on every e and under. 1098 | * TODO: Test if getElementsByTagName(*) is faster. 1099 | * 1100 | * @param Element 1101 | * @return void 1102 | **/ 1103 | _cleanStyles: function(e) { 1104 | e = e || this._doc; 1105 | if (!e) 1106 | return; 1107 | var cur = e.firstChild; 1108 | 1109 | // Remove any root styles, if we're able. 1110 | if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') 1111 | e.removeAttribute('style'); 1112 | 1113 | // Go until there are no more child nodes 1114 | while (cur !== null) { 1115 | if (cur.nodeType === cur.ELEMENT_NODE) { 1116 | // Remove style attribute(s) : 1117 | if (cur.className !== "readability-styled") 1118 | cur.removeAttribute("style"); 1119 | 1120 | this._cleanStyles(cur); 1121 | } 1122 | 1123 | cur = cur.nextSibling; 1124 | } 1125 | }, 1126 | 1127 | /** 1128 | * Get the density of links as a percentage of the content 1129 | * This is the amount of text that is inside a link divided by the total text in the node. 
1130 | * 1131 | * @param Element 1132 | * @return number (float) 1133 | **/ 1134 | _getLinkDensity: function(element) { 1135 | var textLength = this._getInnerText(element).length; 1136 | if (textLength === 0) 1137 | return; 1138 | 1139 | var linkLength = 0; 1140 | 1141 | // XXX implement _reduceNodeList? 1142 | this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { 1143 | linkLength += this._getInnerText(linkNode).length; 1144 | }); 1145 | 1146 | return linkLength / textLength; 1147 | }, 1148 | 1149 | /** 1150 | * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 1151 | * 1152 | * @author Dan Lacy 1153 | * @return string the base url 1154 | **/ 1155 | _findBaseUrl: function() { 1156 | var uri = this._uri; 1157 | var noUrlParams = uri.path.split("?")[0]; 1158 | var urlSlashes = noUrlParams.split("/").reverse(); 1159 | var cleanedSegments = []; 1160 | var possibleType = ""; 1161 | 1162 | for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { 1163 | var segment = urlSlashes[i]; 1164 | 1165 | // Split off and save anything that looks like a file type. 1166 | if (segment.indexOf(".") !== -1) { 1167 | possibleType = segment.split(".")[1]; 1168 | 1169 | // If the type isn't alpha-only, it's probably not actually a file extension. 1170 | if (!possibleType.match(/[^a-zA-Z]/)) 1171 | segment = segment.split(".")[0]; 1172 | } 1173 | 1174 | // EW-CMS specific segment replacement. Ugly. 1175 | // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html 1176 | if (segment.indexOf(',00') !== -1) 1177 | segment = segment.replace(',00', ''); 1178 | 1179 | // If our first or second segment has anything looking like a page number, remove it. 
1180 | if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) 1181 | segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); 1182 | 1183 | var del = false; 1184 | 1185 | // If this is purely a number, and it's the first or second segment, 1186 | // it's probably a page number. Remove it. 1187 | if (i < 2 && segment.match(/^\d{1,2}$/)) 1188 | del = true; 1189 | 1190 | // If this is the first segment and it's just "index", remove it. 1191 | if (i === 0 && segment.toLowerCase() === "index") 1192 | del = true; 1193 | 1194 | // If our first or second segment is smaller than 3 characters, 1195 | // and the first segment was purely alphas, remove it. 1196 | if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) 1197 | del = true; 1198 | 1199 | // If it's not marked for deletion, push it to cleanedSegments. 1200 | if (!del) 1201 | cleanedSegments.push(segment); 1202 | } 1203 | 1204 | // This is our final, cleaned, base article URL. 1205 | return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); 1206 | }, 1207 | 1208 | /** 1209 | * Look for any paging links that may occur within the document. 1210 | * 1211 | * @param body 1212 | * @return object (array) 1213 | **/ 1214 | _findNextPageLink: function(elem) { 1215 | var uri = this._uri; 1216 | var possiblePages = {}; 1217 | var allLinks = elem.getElementsByTagName('a'); 1218 | var articleBaseUrl = this._findBaseUrl(); 1219 | 1220 | // Loop through all links, looking for hints that they may be next-page links. 1221 | // Things like having "page" in their textContent, className or id, or being a child 1222 | // of a node with a page-y className or id. 1223 | // 1224 | // Also possible: levenshtein distance? longest common subsequence? 
1225 | // 1226 | // After we do that, assign each page a score, and 1227 | for (var i = 0, il = allLinks.length; i < il; i += 1) { 1228 | var link = allLinks[i]; 1229 | var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); 1230 | 1231 | // If we've already seen this page, ignore it. 1232 | if (linkHref === "" || 1233 | linkHref === articleBaseUrl || 1234 | linkHref === uri.spec || 1235 | linkHref in this._parsedPages) { 1236 | continue; 1237 | } 1238 | 1239 | // If it's on a different domain, skip it. 1240 | if (uri.host !== linkHref.split(/\/+/g)[1]) 1241 | continue; 1242 | 1243 | var linkText = this._getInnerText(link); 1244 | 1245 | // If the linkText looks like it's not the next page, skip it. 1246 | if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) 1247 | continue; 1248 | 1249 | // If the leftovers of the URL after removing the base URL don't contain 1250 | // any digits, it's certainly not a next page link. 1251 | var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1252 | if (!linkHrefLeftover.match(/\d/)) 1253 | continue; 1254 | 1255 | if (!(linkHref in possiblePages)) { 1256 | possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; 1257 | } else { 1258 | possiblePages[linkHref].linkText += ' | ' + linkText; 1259 | } 1260 | 1261 | var linkObj = possiblePages[linkHref]; 1262 | 1263 | // If the articleBaseUrl isn't part of this URL, penalize this link. It could 1264 | // still be the link, but the odds are lower. 
1265 | // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1266 | if (linkHref.indexOf(articleBaseUrl) !== 0) 1267 | linkObj.score -= 25; 1268 | 1269 | var linkData = linkText + ' ' + link.className + ' ' + link.id; 1270 | if (linkData.match(this.REGEXPS.nextLink)) 1271 | linkObj.score += 50; 1272 | 1273 | if (linkData.match(/pag(e|ing|inat)/i)) 1274 | linkObj.score += 25; 1275 | 1276 | if (linkData.match(/(first|last)/i)) { 1277 | // -65 is enough to negate any bonuses gotten from a > or » in the text, 1278 | // If we already matched on "next", last is probably fine. 1279 | // If we didn't, then it's bad. Penalize. 1280 | if (!linkObj.linkText.match(this.REGEXPS.nextLink)) 1281 | linkObj.score -= 65; 1282 | } 1283 | 1284 | if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) 1285 | linkObj.score -= 50; 1286 | 1287 | if (linkData.match(this.REGEXPS.prevLink)) 1288 | linkObj.score -= 200; 1289 | 1290 | // If a parentNode contains page or paging or paginat 1291 | var parentNode = link.parentNode; 1292 | var positiveNodeMatch = false; 1293 | var negativeNodeMatch = false; 1294 | 1295 | while (parentNode) { 1296 | var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; 1297 | 1298 | if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { 1299 | positiveNodeMatch = true; 1300 | linkObj.score += 25; 1301 | } 1302 | 1303 | if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { 1304 | // If this is just something like "footer", give it a negative. 1305 | // If it's something like "body-and-footer", leave it be. 
1306 | if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { 1307 | linkObj.score -= 25; 1308 | negativeNodeMatch = true; 1309 | } 1310 | } 1311 | 1312 | parentNode = parentNode.parentNode; 1313 | } 1314 | 1315 | // If the URL looks like it has paging in it, add to the score. 1316 | // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1317 | if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) 1318 | linkObj.score += 25; 1319 | 1320 | // If the URL contains negative values, give a slight decrease. 1321 | if (linkHref.match(this.REGEXPS.extraneous)) 1322 | linkObj.score -= 15; 1323 | 1324 | /** 1325 | * Minor punishment to anything that doesn't match our current URL. 1326 | * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. 1327 | * Dan, can you show me a counterexample where this is necessary? 1328 | * if (linkHref.indexOf(window.location.href) !== 0) { 1329 | * linkObj.score -= 1; 1330 | * } 1331 | **/ 1332 | 1333 | // If the link text can be parsed as a number, give it a minor bonus, with a slight 1334 | // bias towards lower numbered pages. This is so that pages that might not have 'next' 1335 | // in their text can still get scored, and sorted properly by score. 1336 | var linkTextAsNumber = parseInt(linkText, 10); 1337 | if (linkTextAsNumber) { 1338 | // Punish 1 since we're either already there, or it's probably 1339 | // before what we want anyways. 1340 | if (linkTextAsNumber === 1) { 1341 | linkObj.score -= 10; 1342 | } else { 1343 | linkObj.score += Math.max(0, 10 - linkTextAsNumber); 1344 | } 1345 | } 1346 | } 1347 | 1348 | // Loop thrugh all of our possible pages from above and find our top 1349 | // candidate for the next page URL. Require at least a score of 50, which 1350 | // is a relatively high confidence that this page is the next link. 
1351 | var topPage = null; 1352 | for (var page in possiblePages) { 1353 | if (possiblePages.hasOwnProperty(page)) { 1354 | if (possiblePages[page].score >= 50 && 1355 | (!topPage || topPage.score < possiblePages[page].score)) 1356 | topPage = possiblePages[page]; 1357 | } 1358 | } 1359 | 1360 | if (topPage) { 1361 | var nextHref = topPage.href.replace(/\/$/,''); 1362 | 1363 | this.log('NEXT PAGE IS ' + nextHref); 1364 | this._parsedPages[nextHref] = true; 1365 | return nextHref; 1366 | } else { 1367 | return null; 1368 | } 1369 | }, 1370 | 1371 | _successfulRequest: function(request) { 1372 | return (request.status >= 200 && request.status < 300) || 1373 | request.status === 304 || 1374 | (request.status === 0 && request.responseText); 1375 | }, 1376 | 1377 | _ajax: function(url, options) { 1378 | var request = new XMLHttpRequest(); 1379 | 1380 | function respondToReadyState(readyState) { 1381 | if (request.readyState === 4) { 1382 | if (this._successfulRequest(request)) { 1383 | if (options.success) 1384 | options.success(request); 1385 | } else { 1386 | if (options.error) 1387 | options.error(request); 1388 | } 1389 | } 1390 | } 1391 | 1392 | if (typeof options === 'undefined') 1393 | options = {}; 1394 | 1395 | request.onreadystatechange = respondToReadyState; 1396 | 1397 | request.open('get', url, true); 1398 | request.setRequestHeader('Accept', 'text/html'); 1399 | 1400 | try { 1401 | request.send(options.postBody); 1402 | } catch (e) { 1403 | if (options.error) 1404 | options.error(); 1405 | } 1406 | 1407 | return request; 1408 | }, 1409 | 1410 | _appendNextPage: function(nextPageLink) { 1411 | var doc = this._doc; 1412 | this._curPageNum += 1; 1413 | 1414 | var articlePage = doc.createElement("DIV"); 1415 | articlePage.id = 'readability-page-' + this._curPageNum; 1416 | articlePage.className = 'page'; 1417 | articlePage.innerHTML = '

§

'; 1418 | 1419 | doc.getElementById("readability-content").appendChild(articlePage); 1420 | 1421 | if (this._curPageNum > this._maxPages) { 1422 | var nextPageMarkup = "
"; 1423 | articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; 1424 | return; 1425 | } 1426 | 1427 | // Now that we've built the article page DOM element, get the page content 1428 | // asynchronously and load the cleaned content into the div we created for it. 1429 | (function(pageUrl, thisPage) { 1430 | this._ajax(pageUrl, { 1431 | success: function(r) { 1432 | 1433 | // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. 1434 | var eTag = r.getResponseHeader('ETag'); 1435 | if (eTag) { 1436 | if (eTag in this._pageETags) { 1437 | this.log("Exact duplicate page found via ETag. Aborting."); 1438 | articlePage.style.display = 'none'; 1439 | return; 1440 | } else { 1441 | this._pageETags[eTag] = 1; 1442 | } 1443 | } 1444 | 1445 | // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 1446 | var page = doc.createElement("DIV"); 1447 | 1448 | // Do some preprocessing to our HTML to make it ready for appending. 1449 | // - Remove any script tags. Swap and reswap newlines with a unicode 1450 | // character because multiline regex doesn't work in javascript. 1451 | // - Turn any noscript tags into divs so that we can parse them. This 1452 | // allows us to find any next page links hidden via javascript. 1453 | // - Turn all double br's into p's - was handled by prepDocument in the original view. 1454 | // Maybe in the future abstract out prepDocument to work for both the original document 1455 | // and AJAX-added pages. 
1456 | var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1457 | responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); 1458 | responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); 1459 | responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); 1460 | 1461 | page.innerHTML = responseHtml; 1462 | this._replaceBrs(page); 1463 | 1464 | // Reset all flags for the next page, as they will search through it and 1465 | // disable as necessary at the end of grabArticle. 1466 | this._flags = 0x1 | 0x2 | 0x4; 1467 | 1468 | var nextPageLink = this._findNextPageLink(page); 1469 | 1470 | // NOTE: if we end up supporting _appendNextPage(), we'll need to 1471 | // change this call to be async 1472 | var content = this._grabArticle(page); 1473 | 1474 | if (!content) { 1475 | this.log("No content found in page to append. Aborting."); 1476 | return; 1477 | } 1478 | 1479 | // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1480 | // Compare it against all of the the previous document's we've gotten. If the previous 1481 | // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1482 | var firstP = content.getElementsByTagName("P").length ? 
content.getElementsByTagName("P")[0] : null; 1483 | if (firstP && firstP.innerHTML.length > 100) { 1484 | for (var i = 1; i <= this._curPageNum; i += 1) { 1485 | var rPage = doc.getElementById('readability-page-' + i); 1486 | if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1487 | this.log('Duplicate of page ' + i + ' - skipping.'); 1488 | articlePage.style.display = 'none'; 1489 | this._parsedPages[pageUrl] = true; 1490 | return; 1491 | } 1492 | } 1493 | } 1494 | 1495 | this._removeScripts(content); 1496 | 1497 | thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; 1498 | 1499 | // After the page has rendered, post process the content. This delay is necessary because, 1500 | // in webkit at least, offsetWidth is not set in time to determine image width. We have to 1501 | // wait a little bit for reflow to finish before we can fix floating images. 1502 | setTimeout((function() { 1503 | this._postProcessContent(thisPage); 1504 | }).bind(this), 500); 1505 | 1506 | 1507 | if (nextPageLink) 1508 | this._appendNextPage(nextPageLink); 1509 | } 1510 | }); 1511 | }).bind(this)(nextPageLink, articlePage); 1512 | }, 1513 | 1514 | /** 1515 | * Get an elements class/id weight. Uses regular expressions to tell if this 1516 | * element looks good or bad. 
1517 | * 1518 | * @param Element 1519 | * @return number (Integer) 1520 | **/ 1521 | _getClassWeight: function(e) { 1522 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) 1523 | return 0; 1524 | 1525 | var weight = 0; 1526 | 1527 | // Look for a special classname 1528 | if (typeof(e.className) === 'string' && e.className !== '') { 1529 | if (this.REGEXPS.negative.test(e.className)) 1530 | weight -= 25; 1531 | 1532 | if (this.REGEXPS.positive.test(e.className)) 1533 | weight += 25; 1534 | } 1535 | 1536 | // Look for a special ID 1537 | if (typeof(e.id) === 'string' && e.id !== '') { 1538 | if (this.REGEXPS.negative.test(e.id)) 1539 | weight -= 25; 1540 | 1541 | if (this.REGEXPS.positive.test(e.id)) 1542 | weight += 25; 1543 | } 1544 | 1545 | return weight; 1546 | }, 1547 | 1548 | /** 1549 | * Clean a node of all elements of type "tag". 1550 | * (Unless it's a youtube/vimeo video. People love movies.) 1551 | * 1552 | * @param Element 1553 | * @param string tag to clean 1554 | * @return void 1555 | **/ 1556 | _clean: function(e, tag) { 1557 | var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; 1558 | 1559 | this._forEachNode(e.getElementsByTagName(tag), function(element) { 1560 | // Allow youtube and vimeo videos through as people usually want to see those. 1561 | if (isEmbed) { 1562 | var attributeValues = [].map.call(element.attributes, function(attr) { 1563 | return attr.value; 1564 | }).join("|"); 1565 | 1566 | // First, check the elements attributes to see if any of them contain youtube or vimeo 1567 | if (this.REGEXPS.videos.test(attributeValues)) 1568 | return; 1569 | 1570 | // Then check the elements inside this element for the same. 1571 | if (this.REGEXPS.videos.test(element.innerHTML)) 1572 | return; 1573 | } 1574 | 1575 | element.parentNode.removeChild(element); 1576 | }); 1577 | }, 1578 | 1579 | /** 1580 | * Check if a given node has one of its ancestor tag name matching the 1581 | * provided one. 
   * @param HTMLElement node
   * @param String tagName
   * @param Number maxDepth (default: 3) how many ancestor levels to inspect
   * @return Boolean
   */
  _hasAncestorTag: function(node, tagName, maxDepth) {
    maxDepth = maxDepth || 3;
    // tagName comparison is against DOM tagName, which is uppercase for HTML.
    tagName = tagName.toUpperCase();
    var depth = 0;
    while (node.parentNode) {
      if (depth > maxDepth)
        return false;
      if (node.parentNode.tagName === tagName)
        return true;
      node = node.parentNode;
      depth++;
    }
    return false;
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   *
   * @param Element e root to clean under
   * @param string tag tag name to conditionally remove
   * @return void
   **/
  _cleanConditionally: function(e, tag) {
    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
      return;

    var tagsList = e.getElementsByTagName(tag);
    var curTagsLength = tagsList.length;
    var isList = tag === "ul" || tag === "ol";

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
    // without effecting the traversal.
    //
    // TODO: Consider taking into account original contentScore here.
    for (var i = curTagsLength-1; i >= 0; i -= 1) {
      var weight = this._getClassWeight(tagsList[i]);
      // contentScore is vestigial (always 0 here), so only the class/id
      // weight decides the first removal branch below.
      var contentScore = 0;

      this.log("Cleaning Conditionally", tagsList[i]);

      if (weight + contentScore < 0) {
        // Negative class/id weight alone is enough to remove the node.
        tagsList[i].parentNode.removeChild(tagsList[i]);
      } else if (this._getCharCount(tagsList[i],',') < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
        var p = tagsList[i].getElementsByTagName("p").length;
        var img = tagsList[i].getElementsByTagName("img").length;
        // The -100 offset means the li>p check below only fires when a
        // non-list node holds >100 more <li> than <p> — presumably to
        // tolerate ordinary embedded lists; TODO confirm against upstream.
        var li = tagsList[i].getElementsByTagName("li").length-100;
        var input = tagsList[i].getElementsByTagName("input").length;

        // Count embeds that are NOT recognized videos; video embeds are kept.
        var embedCount = 0;
        var embeds = tagsList[i].getElementsByTagName("embed");
        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
          if (!this.REGEXPS.videos.test(embeds[ei].src))
            embedCount += 1;
        }

        var linkDensity = this._getLinkDensity(tagsList[i]);
        var contentLength = this._getInnerText(tagsList[i]).length;
        var toRemove = false;
        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
          // More images than paragraphs, outside a figure.
          toRemove = true;
        } else if (!isList && li > p) {
          toRemove = true;
        } else if (input > Math.floor(p/3)) {
          // Too many form inputs relative to text: likely a form, not content.
          toRemove = true;
        } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
          toRemove = true;
        } else if (!isList && weight < 25 && linkDensity > 0.2) {
          toRemove = true;
        } else if (weight >= 25 && linkDensity > 0.5) {
          toRemove = true;
        } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
          toRemove = true;
        }

        if (toRemove) {
          tagsList[i].parentNode.removeChild(tagsList[i]);
        }
      }
    }
  },

  /**
   * Clean out spurious headers from an Element. Checks things like classnames and link density.
   *
   * @param Element
   * @return void
   **/
  _cleanHeaders: function(e) {
    // headerIndex runs 1..2, so only <h1> and <h2> elements are examined.
    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
      var headers = e.getElementsByTagName('h' + headerIndex);
      // Iterate backwards so removals don't disturb the live NodeList.
      for (var i = headers.length - 1; i >= 0; i -= 1) {
        // Drop headers whose class/id weight is negative (ad/footer-style naming).
        if (this._getClassWeight(headers[i]) < 0)
          headers[i].parentNode.removeChild(headers[i]);
      }
    }
  },

  // True when `flag` is set in the current option bitmask.
  _flagIsActive: function(flag) {
    return (this._flags & flag) > 0;
  },

  // Set `flag` in the option bitmask.
  _addFlag: function(flag) {
    this._flags = this._flags | flag;
  },

  // Clear `flag` from the option bitmask.
  _removeFlag: function(flag) {
    this._flags = this._flags & ~flag;
  },

  /**
   * Decides whether or not the document is reader-able without parsing the whole thing.
   *
   * @param Function helperIsVisible optional visibility predicate for candidate nodes
   * @return boolean Whether or not we suspect parse() will succeed at returning an article object.
   */
  isProbablyReaderable: function(helperIsVisible) {
    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);

    // FIXME we should have a fallback for helperIsVisible, but this is
    // problematic because of jsdom's elem.style handling - see
    // https://github.com/mozilla/readability/pull/186 for context.

    var score = 0;
    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
    // this callback:
    return this._someNode(nodes, function(node) {
      if (helperIsVisible && !helperIsVisible(node))
        return false;
      var matchString = node.className + " " + node.id;

      // Skip nodes whose class/id looks unlikely (unless rescued by the
      // okMaybeItsACandidate pattern).
      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
        return false;
      }

      // Paragraphs inside list items are usually navigation, not article text.
      if (node.matches && node.matches("li p")) {
        return false;
      }

      // Require a minimum of 140 characters before a node contributes.
      var textContentLength = node.textContent.trim().length;
      if (textContentLength < 140) {
        return false;
      }

      // Diminishing-returns accumulation; crossing 20 total ends the scan early.
      score += Math.sqrt(textContentLength - 140);

      if (score > 20) {
        return true;
      }
      return false;
    });
  },

  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document by removing script tags, css, etc.
   *  2. Build readability's DOM tree.
   *  3. Grab the article content from the current dom tree.
   *  4. Replace the current DOM tree with the new one.
   *  5. Read peacefully.
   *
   * @return Object|null {uri, title, byline, dir, content, length, excerpt},
   *                     or null when no article content could be extracted
   **/
  parse: function () {
    // Avoid parsing too large documents, as per configuration option
    if (this._maxElemsToParse > 0) {
      var numTags = this._doc.getElementsByTagName("*").length;
      if (numTags > this._maxElemsToParse) {
        throw new Error("Aborting parsing document; " + numTags + " elements found");
      }
    }

    // DOM implementations without firstElementChild get the slower
    // node-walking traversal variant.
    if (typeof this._doc.documentElement.firstElementChild === "undefined") {
      this._getNextNode = this._getNextNodeNoElementProperties;
    }
    // Remove script tags from the document.
    this._removeScripts(this._doc);

    // FIXME: Disabled multi-page article support for now as it
    // needs more work on infrastructure.

    // Make sure this document is added to the list of parsed pages first,
    // so we don't double up on the first page.
    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;

    // Pull out any possible next page link first.
    // var nextPageLink = this._findNextPageLink(doc.body);

    this._prepDocument();

    // Prefer the title from document metadata; fall back to DOM heuristics.
    var metadata = this._getArticleMetadata();
    var articleTitle = metadata.title || this._getArticleTitle();

    var articleContent = this._grabArticle();
    if (!articleContent)
      return null;

    this.log("Grabbed: " + articleContent.innerHTML);

    this._postProcessContent(articleContent);

    // if (nextPageLink) {
    //   // Append any additional pages after a small timeout so that people
    //   // can start reading without having to wait for this to finish processing.
    //   setTimeout((function() {
    //     this._appendNextPage(nextPageLink);
    //   }).bind(this), 500);
    // }

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This is used for displaying a preview of
    // the article's content.
    if (!metadata.excerpt) {
      var paragraphs = articleContent.getElementsByTagName("p");
      if (paragraphs.length > 0) {
        metadata.excerpt = paragraphs[0].textContent.trim();
      }
    }

    return { uri: this._uri,
             title: articleTitle,
             byline: metadata.byline || this._articleByline,
             dir: this._articleDir,
             content: articleContent.innerHTML,
             length: articleContent.textContent.length,
             excerpt: metadata.excerpt };
  }
};