├── .editorconfig ├── .gitignore ├── .prettierrc ├── README.md ├── package-lock.json ├── package.json ├── renovate.json ├── src ├── distiller.ts ├── index.ts └── third_party │ ├── dom-distiller │ ├── README.md │ └── domdistiller.ts │ ├── readability │ ├── LICENSE.md │ └── readability.ts │ └── turndown-client │ ├── LISENCE │ ├── README.md │ ├── turndown-plugin-gfm.ts │ └── turndown.ts ├── test ├── index.spec.ts └── tsconfig.json ├── tsconfig.json ├── vitest.config.ts ├── worker-configuration.d.ts └── wrangler.toml /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = tab 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.yml] 12 | indent_style = space 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | 3 | logs 4 | _.log 5 | npm-debug.log_ 6 | yarn-debug.log* 7 | yarn-error.log* 8 | lerna-debug.log* 9 | .pnpm-debug.log* 10 | 11 | # Diagnostic reports (https://nodejs.org/api/report.html) 12 | 13 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 14 | 15 | # Runtime data 16 | 17 | pids 18 | _.pid 19 | _.seed 20 | \*.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | 28 | coverage 29 | \*.lcov 30 | 31 | # nyc test coverage 32 | 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 36 | 37 | .grunt 38 | 39 | # Bower dependency directory (https://bower.io/) 40 | 41 | bower_components 42 | 43 | # node-waf configuration 44 | 45 | .lock-wscript 46 | 47 | # Compiled binary addons (https://nodejs.org/api/addons.html) 48 | 49 | build/Release 50 | 51 | # Dependency directories 52 | 53 | 
node_modules/ 54 | jspm_packages/ 55 | 56 | # Snowpack dependency directory (https://snowpack.dev/) 57 | 58 | web_modules/ 59 | 60 | # TypeScript cache 61 | 62 | \*.tsbuildinfo 63 | 64 | # Optional npm cache directory 65 | 66 | .npm 67 | 68 | # Optional eslint cache 69 | 70 | .eslintcache 71 | 72 | # Optional stylelint cache 73 | 74 | .stylelintcache 75 | 76 | # Microbundle cache 77 | 78 | .rpt2_cache/ 79 | .rts2_cache_cjs/ 80 | .rts2_cache_es/ 81 | .rts2_cache_umd/ 82 | 83 | # Optional REPL history 84 | 85 | .node_repl_history 86 | 87 | # Output of 'npm pack' 88 | 89 | \*.tgz 90 | 91 | # Yarn Integrity file 92 | 93 | .yarn-integrity 94 | 95 | # dotenv environment variable files 96 | 97 | .env 98 | .env.development.local 99 | .env.test.local 100 | .env.production.local 101 | .env.local 102 | 103 | # parcel-bundler cache (https://parceljs.org/) 104 | 105 | .cache 106 | .parcel-cache 107 | 108 | # Next.js build output 109 | 110 | .next 111 | out 112 | 113 | # Nuxt.js build / generate output 114 | 115 | .nuxt 116 | dist 117 | 118 | # Gatsby files 119 | 120 | .cache/ 121 | 122 | # Comment in the public line in if your project uses Gatsby and not Next.js 123 | 124 | # https://nextjs.org/blog/next-9-1#public-directory-support 125 | 126 | # public 127 | 128 | # vuepress build output 129 | 130 | .vuepress/dist 131 | 132 | # vuepress v2.x temp and cache directory 133 | 134 | .temp 135 | .cache 136 | 137 | # Docusaurus cache and generated files 138 | 139 | .docusaurus 140 | 141 | # Serverless directories 142 | 143 | .serverless/ 144 | 145 | # FuseBox cache 146 | 147 | .fusebox/ 148 | 149 | # DynamoDB Local files 150 | 151 | .dynamodb/ 152 | 153 | # TernJS port file 154 | 155 | .tern-port 156 | 157 | # Stores VSCode versions used for testing VSCode extensions 158 | 159 | .vscode-test 160 | 161 | # yarn v2 162 | 163 | .yarn/cache 164 | .yarn/unplugged 165 | .yarn/build-state.yml 166 | .yarn/install-state.gz 167 | .pnp.\* 168 | 169 | # wrangler project 170 | 171 | .dev.vars 172 
| .wrangler/ 173 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 140, 3 | "singleQuote": true, 4 | "semi": true, 5 | "useTabs": true 6 | } 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloudflare DOM Distiller 2 | 3 | This repository provides an API implementation for easily retrieving content from target web pages on Cloudflare Workers. 4 | 5 | ## Features 6 | 7 | - **Cloudflare Workers & Browser Rendering**: Utilizes Cloudflare Workers and browser rendering to fetch page information. 8 | - **Readability**: Uses Readability to extract page content and remove unnecessary information. 9 | - **DOM-Distiller**: If you set option `useReadability: false` in a request, uses dom-distiller to extract page content and remove unnecessary information. 10 | - **Turndown**: Converts the extracted HTML to Markdown format for better readability. 11 | 12 | ## Example Usage 13 | 14 | To run the API in development mode: 15 | 16 | ```bash 17 | npx wrangler dev --remote 18 | ``` 19 | 20 | You can make a request to your local server and verify that the content of the target web page is converted to Markdown format: 21 | 22 | ```bash 23 | $ curl -H 'Content-Type: application/json' \ 24 | -X POST http://localhost:8787/distill \ 25 | -d '{"url": "https://blog.samaltman.com/gpt-4o", "markdown": true}' 26 | 27 | {"body":"There ... to the team that poured so much work into making this happen!"} 28 | ``` 29 | 30 | ## Endpoint: `/distill` 31 | 32 | ### Request Format 33 | 34 | - **url**: The URL of the target web page to fetch content from. 35 | - **markdown**: Boolean value to indicate whether the content should be converted to Markdown format. 
36 | 37 | ### Response Format 38 | 39 | - **body**: Returns the content of the web page. 40 | 41 | ## References 42 | 43 | - [mixmark\-io/turndown: 🛏 An HTML to Markdown converter written in JavaScript](https://github.com/mixmark-io/turndown) 44 | - [mozilla/readability: A standalone version of the readability lib](https://github.com/mozilla/readability) 45 | - [chromium/dom\-distiller: Distills the DOM](https://github.com/chromium/dom-distiller) 46 | - [Puppeteer · Browser Rendering docs](https://developers.cloudflare.com/browser-rendering/platform/puppeteer/) 47 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cloudflare-dom-distiller", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "deploy": "wrangler deploy", 7 | "dev": "wrangler dev", 8 | "start": "wrangler dev", 9 | "test": "vitest", 10 | "cf-typegen": "wrangler types" 11 | }, 12 | "devDependencies": { 13 | "@cloudflare/puppeteer": "^0.0.14", 14 | "@cloudflare/vitest-pool-workers": "^0.1.0", 15 | "@cloudflare/workers-types": "^4.20240605.0", 16 | "@types/turndown": "^5.0.4", 17 | "typescript": "^5.0.4", 18 | "vitest": "1.3.0", 19 | "wrangler": "^4.0.0" 20 | }, 21 | "dependencies": { 22 | "@hono/zod-validator": "^0.5.0", 23 | "hono": "^4.5.8", 24 | "turndown": "^7.2.0" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /src/distiller.ts: -------------------------------------------------------------------------------- 1 | import puppeteer from '@cloudflare/puppeteer'; 2 | 3 | // @ts-ignore 4 | 
import { readabilityJsBundle } from './third_party/readability/readability';
import { domdistillerJsBundle } from './third_party/dom-distiller/domdistiller';
import { turndownJsBundle } from './third_party/turndown-client/turndown';
import { turndownPluginGfmJsBundle } from './third_party/turndown-client/turndown-plugin-gfm';

/**
 * Opens `url` in a browser session, extracts the page's main content as HTML
 * and optionally converts that HTML to Markdown.
 *
 * @param browserWorker - Browser Rendering binding used to connect to or launch a browser.
 * @param url - Address of the page to navigate to.
 * @param markdown - When true, the extracted HTML is converted to Markdown inside the page.
 * @param useReadability - When true, extraction uses Readability; otherwise DOM Distiller.
 * @returns The extracted HTML, or its Markdown conversion when `markdown` is true.
 */
export async function scrapeAndDistill(
	browserWorker: puppeteer.BrowserWorker,
	url: string,
	markdown: boolean,
	useReadability: boolean
): Promise<string> {
	const { browser } = await pickRandomSession(browserWorker);
	try {
		const page = await browser.newPage();
		await page.goto(url, { waitUntil: 'networkidle2' });

		// Inject the selected extractor into the page and run it there.
		const content = useReadability ? await extractWithReadability(page) : await extractWithDomDistiller(page);

		// `await` inside the try block so the browser is only closed after the conversion has finished.
		return markdown ? await convertHtmlToMarkdown(page, content) : content;
	} finally {
		// NOTE(review): close() ends the browser session even when it was picked up from an
		// existing one, so it cannot be reused afterwards; confirm whether disconnect() was intended.
		await browser.close();
	}
}

/**
 * Converts an HTML string to GitHub-flavoured Markdown by running Turndown inside the page.
 *
 * @param page - Page into which the Turndown bundles are injected.
 * @param html - HTML to convert; passed to the page as an argument rather than spliced into script source.
 * @returns The Markdown produced by Turndown.
 */
async function convertHtmlToMarkdown(page: puppeteer.Page, html: string): Promise<string> {
	await page.evaluate(turndownJsBundle);
	await page.evaluate(turndownPluginGfmJsBundle);

	return page.evaluate((source: string) => {
		// @ts-ignore
		const turndownService = new TurndownService({
			codeBlockStyle: 'fenced',
			preformattedCode: true,
		});

		// @ts-ignore
		turndownService.use(turndownPluginGfm.gfm);

		// https://github.com/mixmark-io/turndown/issues/192#issuecomment-1242819018
		// @ts-ignore
		const getFirstTag = (node) => node.outerHTML.split('>').shift() + '>';
		// @ts-ignore
		const getExt = (node) => {
			// Simple match where the <pre> has the `highlight-source-js` tags
			const match = getFirstTag(node).match(/highlight-source-[a-z]+/);
			if (match) return match[0].split('-').pop();

			// More complex match where the _parent_ (single) has that.
			// The parent of the <pre> is not a "wrapping" parent, so skip those
			if (node.parentNode.childNodes.length !== 1) return '';

			// Check the parent just in case
			const parent = getFirstTag(node.parentNode).match(/highlight-source-[a-z]+/);
			if (parent) return parent[0].split('-').pop();

			// Nothing was found...
			return '';
		};
		turndownService.addRule('fenceAllPreformattedText', {
			filter: ['pre'],
			// @ts-ignore
			replacement: function (_content, node) {
				const ext = getExt(node);
				// Use the raw text of the <pre> children so the code is emitted verbatim inside the fence.
				const code = [...node.childNodes].map((c) => c.textContent).join('');
				return '\n```' + ext + '\n' + code + '\n```\n\n';
			},
		});
		return turndownService.turndown(source);
	}, html);
}
 79 | 
 80 | async function extractWithDomDistiller(page: puppeteer.Page) {
 81 | 	const distillerScript = domdistillerJsBundle;
 82 | 	console.debug('Injecting DOM Distiller script');
 83 | 	await page.evaluate(distillerScript);
 84 | 
 85 | 	// run the DOM Distiller script
 86 | 	console.debug('Running DOM Distiller');
 87 | 	const distilledContent = await page.evaluate(() => {
 88 | 		// @ts-ignore
 89 | 		return org.chromium.distiller.DomDistiller.apply();
 90 | 	});
 91 | 
 92 | 	console.debug('Distilled content:', distilledContent);
 93 | 
 94 | 	// console.log(distilledContent);
 95 | 	const content = distilledContent[2][1];
 96 | 	return content;
 97 | }
 98 | 
 99 | async function extractWithReadability(page: puppeteer.Page) {
100 | 	const readabilityScript = readabilityJsBundle;
101 | 
102 | 	console.debug('Injecting Readability script');
103 | 	await page.evaluate(readabilityScript);
104 | 
105 | 	// run the Readability script
106 | 	console.debug('Running Readability');
107 | 	const content = await page.evaluate(() => {
108 | 		// @ts-ignore
109 | 		const article = new Readability(document).parse();
110 | 		return article.content;
111 | 	});
112 | 
113 | 	return content;
114 | }
115 | 
// Pick random free session
// Other custom logic could be used instead
// https://developers.cloudflare.com/browser-rendering/get-started/reuse-sessions/
/**
 * Lists the sessions of the given endpoint and picks one at random among those
 * that have no `connectionId`.
 *
 * @param endpoint - Browser Rendering binding whose sessions are listed.
 * @returns The chosen session id, or `undefined` when no session qualifies.
 */
async function getRandomSession(endpoint: puppeteer.BrowserWorker): Promise<string | undefined> {
	const sessions: puppeteer.ActiveSession[] = await puppeteer.sessions(endpoint);
	console.log(`Sessions: ${JSON.stringify(sessions)}`);

	// Sessions carrying a connectionId already have a worker connected to them, so skip those.
	const freeSessionIds = sessions.filter((session) => !session.connectionId).map((session) => session.sessionId);
	if (freeSessionIds.length === 0) {
		return undefined;
	}

	return freeSessionIds[Math.floor(Math.random() * freeSessionIds.length)];
}
137 | 
/**
 * Obtains a browser: tries to connect to a randomly chosen existing session
 * first and launches a new one when that is not possible.
 *
 * @param browserWorker - Browser Rendering binding used to connect or launch.
 * @returns The browser, plus `launched`, which is `true` only when a new session was started.
 */
async function pickRandomSession(browserWorker: puppeteer.BrowserWorker) {
	const candidateId = await getRandomSession(browserWorker);
	let browser;
	let launched;

	try {
		if (candidateId) {
			browser = await puppeteer.connect(browserWorker, candidateId);
		}
	} catch (err) {
		// Best effort: another worker may have connected first, so fall through to launching.
		console.log(`Failed to connect to ${candidateId}. Error ${err}`);
	}

	if (!browser) {
		// Nothing to reuse (or the connection attempt failed): start a new session.
		browser = await puppeteer.launch(browserWorker);
		launched = true;
	}

	return { browser, launched };
}
158 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | import puppeteer, { BrowserWorker } from '@cloudflare/puppeteer';
 2 | 
 3 | import { Hono } from 'hono';
 4 | import { zValidator } from '@hono/zod-validator';
 5 | import { z } from 'zod';
 6 | // @ts-ignore
 7 | import { scrapeAndDistill } from './distiller';
 8 | 
 9 | const DistillRequestSchema = z.object({
10 | 	url: z.string(),
11 | 	markdown: z.boolean(),
12 | 	useReadability: z.boolean().optional(),
13 | });
14 | 
15 | type Request = z.infer;
16 | 
17 | const DistillResponseSchema = z.object({
18 | 	body: z.string(),
19 | });
20 | 
21 | type Response = z.infer;
22 | const app = new Hono<{ Bindings: Bindings }>();
23 | 
24 | type Bindings = {
25 | 	MYBROWSER: BrowserWorker;
26 | 	SERVICE_API_KEY?: string;
27 | };
28 | 
// Bearer authentication, enforced only when SERVICE_API_KEY is configured.
app.use(async (c, next) => {
	const expectedKey = c.env.SERVICE_API_KEY;

	if (expectedKey) {
		const header = c.req.header('Authorization');
		if (!header) {
			return c.text('Authorization header is missing', { status: 401 });
		}

		// The header is expected to look like "<scheme> <credentials>".
		const parts = header.split(' ');
		const scheme = parts[0];
		const token = parts[1];

		if (scheme !== 'Bearer') {
			return c.text('Invalid authorization type', { status: 401 });
		}

		if (token !== expectedKey) {
			return c.text('Invalid API key', { status: 401 });
		}
	}

	// Either no key is configured or the supplied key matched.
	return await next();
});
54 | 
// POST /distill: validates the JSON body, scrapes the requested page and returns the distilled content.
app.post('/distill', zValidator('json', DistillRequestSchema), async (c) => {
	const payload = c.req.valid('json');
	const browserWorker = c.env.MYBROWSER;

	// Answer 429 when no browser can be acquired right now.
	// https://github.com/cloudflare/puppeteer/blob/808f08afdd25ee49a267479f05eecd0a1b3edf0a/src/puppeteer-core.ts#L86
	const { allowedBrowserAcquisitions, timeUntilNextAllowedBrowserAcquisition } = await puppeteer.limits(browserWorker);
	if (allowedBrowserAcquisitions < 1) {
		// NOTE(review): Retry-After is expressed in seconds; confirm the unit of timeUntilNextAllowedBrowserAcquisition.
		return c.text('The browser worker is busy', 429, {
			'Retry-After': timeUntilNextAllowedBrowserAcquisition.toString(),
		});
	}

	// Readability is the default extractor when the request does not choose one.
	const distilled = await scrapeAndDistill(browserWorker, payload.url, payload.markdown, payload.useReadability ?? true);

	const responseBody: Response = { body: distilled };
	return c.json(responseBody);
});
81 | 
82 | export default app;
83 | 


--------------------------------------------------------------------------------
/src/third_party/dom-distiller/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # dom-distiller
3 | 
4 | This directory contains dom-distiller.js, which was generated using a fork of the [chromium/dom-distiller](https://github.com/chromium/dom-distiller) repository.
5 | The license follows the original repository from which it was forked.
6 | 


--------------------------------------------------------------------------------
/src/third_party/readability/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010 Arc90 Inc
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |    http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 


--------------------------------------------------------------------------------
/src/third_party/readability/readability.ts:
--------------------------------------------------------------------------------
   1 | export const readabilityJsBundle=`/*
   2 |  * Copyright (c) 2010 Arc90 Inc
   3 |  *
   4 |  * Licensed under the Apache License, Version 2.0 (the "License");
   5 |  * you may not use this file except in compliance with the License.
   6 |  * You may obtain a copy of the License at
   7 |  *
   8 |  *     http://www.apache.org/licenses/LICENSE-2.0
   9 |  *
  10 |  * Unless required by applicable law or agreed to in writing, software
  11 |  * distributed under the License is distributed on an "AS IS" BASIS,
  12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 |  * See the License for the specific language governing permissions and
  14 |  * limitations under the License.
  15 |  */
  16 | 
  17 | /*
  18 |  * This code is heavily based on Arc90's readability.js (1.7.1) script
  19 |  * available at: http://code.google.com/p/arc90labs-readability
  20 |  */
  21 | 
  22 | /**
  23 |  * Public constructor.
  24 |  * @param {HTMLDocument} doc     The document to parse.
  25 |  * @param {Object}       options The options object.
  26 |  */
  27 | function Readability(doc, options) {
  28 |   // In some older versions, people passed a URI as the first argument. Cope:
  29 |   if (options && options.documentElement) {
  30 |     doc = options;
  31 |     options = arguments[2];
  32 |   } else if (!doc || !doc.documentElement) {
  33 |     throw new Error("First argument to Readability constructor should be a document object.");
  34 |   }
  35 |   options = options || {};
  36 | 
  37 |   this._doc = doc;
  38 |   this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  39 |   this._articleTitle = null;
  40 |   this._articleByline = null;
  41 |   this._articleDir = null;
  42 |   this._articleSiteName = null;
  43 |   this._attempts = [];
  44 | 
  45 |   // Configurable options
  46 |   this._debug = !!options.debug;
  47 |   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  48 |   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  49 |   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  50 |   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  51 |   this._keepClasses = !!options.keepClasses;
  52 |   this._serializer = options.serializer || function(el) {
  53 |     return el.innerHTML;
  54 |   };
  55 |   this._disableJSONLD = !!options.disableJSONLD;
  56 |   this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  57 |   this._linkDensityModifier = options.linkDensityModifier || 0;
  58 | 
  59 |   // Start with all flags set
  60 |   this._flags = this.FLAG_STRIP_UNLIKELYS |
  61 |                 this.FLAG_WEIGHT_CLASSES |
  62 |                 this.FLAG_CLEAN_CONDITIONALLY;
  63 | 
  64 | 
  65 |   // Control whether log messages are sent to the console
  66 |   if (this._debug) {
  67 |     let logNode = function(node) {
  68 |       if (node.nodeType == node.TEXT_NODE) {
  69 |         return \`\${node.nodeName} ("\${node.textContent}")\`;
  70 |       }
  71 |       let attrPairs = Array.from(node.attributes || [], function(attr) {
  72 |         return \`\${attr.name}="\${attr.value}"\`;
  73 |       }).join(" ");
  74 |       return \`<\${node.localName} \${attrPairs}>\`;
  75 |     };
  76 |     this.log = function () {
  77 |       if (typeof console !== "undefined") {
  78 |         let args = Array.from(arguments, arg => {
  79 |           if (arg && arg.nodeType == this.ELEMENT_NODE) {
  80 |             return logNode(arg);
  81 |           }
  82 |           return arg;
  83 |         });
  84 |         args.unshift("Reader: (Readability)");
  85 |         console.log.apply(console, args);
  86 |       } else if (typeof dump !== "undefined") {
  87 |         /* global dump */
  88 |         var msg = Array.prototype.map.call(arguments, function(x) {
  89 |           return (x && x.nodeName) ? logNode(x) : x;
  90 |         }).join(" ");
  91 |         dump("Reader: (Readability) " + msg + "\\n");
  92 |       }
  93 |     };
  94 |   } else {
  95 |     this.log = function () {};
  96 |   }
  97 | }
  98 | 
  99 | Readability.prototype = {
 100 |   FLAG_STRIP_UNLIKELYS: 0x1,
 101 |   FLAG_WEIGHT_CLASSES: 0x2,
 102 |   FLAG_CLEAN_CONDITIONALLY: 0x4,
 103 | 
 104 |   // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
 105 |   ELEMENT_NODE: 1,
 106 |   TEXT_NODE: 3,
 107 | 
 108 |   // Max number of nodes supported by this parser. Default: 0 (no limit)
 109 |   DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 110 | 
 111 |   // The number of top candidates to consider when analysing how
 112 |   // tight the competition is among candidates.
 113 |   DEFAULT_N_TOP_CANDIDATES: 5,
 114 | 
 115 |   // Element tags to score by default.
 116 |   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 117 | 
 118 |   // The default number of chars an article must have in order to return a result
 119 |   DEFAULT_CHAR_THRESHOLD: 500,
 120 | 
 121 |   // All of the regular expressions in use within readability.
 122 |   // Defined up here so we don't instantiate them repeatedly in loops.
 123 |   REGEXPS: {
 124 |     // NOTE: These two regular expressions are duplicated in
 125 |     // Readability-readerable.js. Please keep both copies in sync.
 126 |     unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
 127 |     okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
 128 | 
 129 |     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
 130 |     negative: /-ad-|hidden|^hid\$| hid\$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
 131 |     extraneous: /print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,
 132 |     byline: /byline|author|dateline|writtenby|p-author/i,
 133 |     replaceFonts: /<(\\/?)font[^>]*>/gi,
 134 |     normalize: /\\s{2,}/g,
 135 |     videos: /\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,
 136 |     shareElements: /(\\b|_)(share|sharedaddy)(\\b|_)/i,
 137 |     nextLink: /(next|weiter|continue|>([^\\|]|\$)|»([^\\|]|\$))/i,
 138 |     prevLink: /(prev|earl|old|new|<|«)/i,
 139 |     tokenize: /\\W+/g,
 140 |     whitespace: /^\\s*\$/,
 141 |     hasContent: /\\S\$/,
 142 |     hashUrl: /^#.+/,
 143 |     srcsetUrl: /(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|\$))/g,
 144 |     b64DataUrl: /^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,
 145 |     // Commas as used in Latin, Sindhi, Chinese and various other scripts.
 146 |     // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
 147 |     commas: /\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,
 148 |     // See: https://schema.org/Article
 149 |     jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference\$/,
 150 |     // used to see if a node's content matches words commonly used for ad blocks or loading indicators
 151 |     adWords: /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)\$/iu,
 152 |     loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\\.\\.\\.)?)\$/iu,
 153 |   },
 154 | 
 155 |   UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
 156 | 
 157 |   DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),
 158 | 
 159 |   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 160 | 
 161 |   PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
 162 | 
 163 |   DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
 164 | 
 165 |   // The commented out elements qualify as phrasing content but tend to be
 166 |   // removed by readability when put into paragraphs, so we ignore them here.
 167 |   PHRASING_ELEMS: [
 168 |     // "CANVAS", "IFRAME", "SVG", "VIDEO",
 169 |     "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
 170 |     "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
 171 |     "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
 172 |     "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
 173 |     "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
 174 |   ],
 175 | 
 176 |   // These are the classes that readability sets itself.
 177 |   CLASSES_TO_PRESERVE: [ "page" ],
 178 | 
 179 |   // These are the list of HTML entities that need to be escaped.
 180 |   HTML_ESCAPE_MAP: {
 181 |     "lt": "<",
 182 |     "gt": ">",
 183 |     "amp": "&",
 184 |     "quot": '"',
 185 |     "apos": "'",
 186 |   },
 187 | 
 188 |   /**
 189 |    * Run any post-process modifications to article content as necessary.
 190 |    *
 191 |    * @param Element
 192 |    * @return void
 193 |   **/
 194 |   _postProcessContent: function(articleContent) {
 195 |     // Readability cannot open relative uris so we convert them to absolute uris.
 196 |     this._fixRelativeUris(articleContent);
 197 | 
 198 |     this._simplifyNestedElements(articleContent);
 199 | 
 200 |     if (!this._keepClasses) {
 201 |       // Remove classes.
 202 |       this._cleanClasses(articleContent);
 203 |     }
 204 |   },
 205 | 
 206 |   /**
 207 |    * Iterates over a NodeList, calls \`filterFn\` for each node and removes node
 208 |    * if function returned \`true\`.
 209 |    *
 210 |    * If function is not passed, removes all the nodes in node list.
 211 |    *
 212 |    * @param NodeList nodeList The nodes to operate on
 213 |    * @param Function filterFn the function to use as a filter
 214 |    * @return void
 215 |    */
 216 |   _removeNodes: function(nodeList, filterFn) {
 217 |     // Avoid ever operating on live node lists.
 218 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 219 |       throw new Error("Do not pass live node lists to _removeNodes");
 220 |     }
 221 |     for (var i = nodeList.length - 1; i >= 0; i--) {
 222 |       var node = nodeList[i];
 223 |       var parentNode = node.parentNode;
 224 |       if (parentNode) {
 225 |         if (!filterFn || filterFn.call(this, node, i, nodeList)) {
 226 |           parentNode.removeChild(node);
 227 |         }
 228 |       }
 229 |     }
 230 |   },
 231 | 
 232 |   /**
 233 |    * Iterates over a NodeList, and calls _setNodeTag for each node.
 234 |    *
 235 |    * @param NodeList nodeList The nodes to operate on
 236 |    * @param String newTagName the new tag name to use
 237 |    * @return void
 238 |    */
 239 |   _replaceNodeTags: function(nodeList, newTagName) {
 240 |     // Avoid ever operating on live node lists.
 241 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 242 |       throw new Error("Do not pass live node lists to _replaceNodeTags");
 243 |     }
 244 |     for (const node of nodeList) {
 245 |       this._setNodeTag(node, newTagName);
 246 |     }
 247 |   },
 248 | 
 249 |   /**
 250 |    * Iterate over a NodeList, which doesn't natively fully implement the Array
 251 |    * interface.
 252 |    *
 253 |    * For convenience, the current object context is applied to the provided
 254 |    * iterate function.
 255 |    *
 256 |    * @param  NodeList nodeList The NodeList.
 257 |    * @param  Function fn       The iterate function.
 258 |    * @return void
 259 |    */
 260 |   _forEachNode: function(nodeList, fn) {
 261 |     Array.prototype.forEach.call(nodeList, fn, this);
 262 |   },
 263 | 
 264 |   /**
 265 |    * Iterate over a NodeList, and return the first node that passes
 266 |    * the supplied test function
 267 |    *
 268 |    * For convenience, the current object context is applied to the provided
 269 |    * test function.
 270 |    *
 271 |    * @param  NodeList nodeList The NodeList.
 272 |    * @param  Function fn       The test function.
 273 |    * @return void
 274 |    */
 275 |   _findNode: function(nodeList, fn) {
 276 |     return Array.prototype.find.call(nodeList, fn, this);
 277 |   },
 278 | 
 279 |   /**
 280 |    * Iterate over a NodeList, return true if any of the provided iterate
 281 |    * function calls returns true, false otherwise.
 282 |    *
 283 |    * For convenience, the current object context is applied to the
 284 |    * provided iterate function.
 285 |    *
 286 |    * @param  NodeList nodeList The NodeList.
 287 |    * @param  Function fn       The iterate function.
 288 |    * @return Boolean
 289 |    */
 290 |   _someNode: function(nodeList, fn) {
 291 |     return Array.prototype.some.call(nodeList, fn, this);
 292 |   },
 293 | 
 294 |   /**
 295 |    * Iterate over a NodeList, return true if all of the provided iterate
 296 |    * function calls return true, false otherwise.
 297 |    *
 298 |    * For convenience, the current object context is applied to the
 299 |    * provided iterate function.
 300 |    *
 301 |    * @param  NodeList nodeList The NodeList.
 302 |    * @param  Function fn       The iterate function.
 303 |    * @return Boolean
 304 |    */
 305 |   _everyNode: function(nodeList, fn) {
 306 |     return Array.prototype.every.call(nodeList, fn, this);
 307 |   },
 308 | 
 309 |   /**
 310 |    * Concat all nodelists passed as arguments.
 311 |    *
 312 |    * @return ...NodeList
 313 |    * @return Array
 314 |    */
 315 |   _concatNodeLists: function() {
 316 |     var slice = Array.prototype.slice;
 317 |     var args = slice.call(arguments);
 318 |     var nodeLists = args.map(function(list) {
 319 |       return slice.call(list);
 320 |     });
 321 |     return Array.prototype.concat.apply([], nodeLists);
 322 |   },
 323 | 
 324 |   _getAllNodesWithTag: function(node, tagNames) {
 325 |     if (node.querySelectorAll) {
 326 |       return node.querySelectorAll(tagNames.join(","));
 327 |     }
 328 |     return [].concat.apply([], tagNames.map(function(tag) {
 329 |       var collection = node.getElementsByTagName(tag);
 330 |       return Array.isArray(collection) ? collection : Array.from(collection);
 331 |     }));
 332 |   },
 333 | 
 334 |   /**
 335 |    * Removes the class="" attribute from every element in the given
 336 |    * subtree, except those that match CLASSES_TO_PRESERVE and
 337 |    * the classesToPreserve array from the options object.
 338 |    *
 339 |    * @param Element
 340 |    * @return void
 341 |    */
 342 |   _cleanClasses: function(node) {
 343 |     var classesToPreserve = this._classesToPreserve;
 344 |     var className = (node.getAttribute("class") || "")
 345 |       .split(/\\s+/)
 346 |       .filter(function(cls) {
 347 |         return classesToPreserve.indexOf(cls) != -1;
 348 |       })
 349 |       .join(" ");
 350 | 
 351 |     if (className) {
 352 |       node.setAttribute("class", className);
 353 |     } else {
 354 |       node.removeAttribute("class");
 355 |     }
 356 | 
 357 |     for (node = node.firstElementChild; node; node = node.nextElementSibling) {
 358 |       this._cleanClasses(node);
 359 |     }
 360 |   },
 361 | 
 362 |   /**
 363 |    * Converts each  and  uri in the given element to an absolute URI,
 364 |    * ignoring #ref URIs.
 365 |    *
 366 |    * @param Element
 367 |    * @return void
 368 |    */
 369 |   _fixRelativeUris: function(articleContent) {
 370 |     var baseURI = this._doc.baseURI;
 371 |     var documentURI = this._doc.documentURI;
 372 |     function toAbsoluteURI(uri) {
 373 |       // Leave hash links alone if the base URI matches the document URI:
 374 |       if (baseURI == documentURI && uri.charAt(0) == "#") {
 375 |         return uri;
 376 |       }
 377 | 
 378 |       // Otherwise, resolve against base URI:
 379 |       try {
 380 |         return new URL(uri, baseURI).href;
 381 |       } catch (ex) {
 382 |         // Something went wrong, just return the original:
 383 |       }
 384 |       return uri;
 385 |     }
 386 | 
 387 |     var links = this._getAllNodesWithTag(articleContent, ["a"]);
 388 |     this._forEachNode(links, function(link) {
 389 |       var href = link.getAttribute("href");
 390 |       if (href) {
 391 |         // Remove links with javascript: URIs, since
 392 |         // they won't work after scripts have been removed from the page.
 393 |         if (href.indexOf("javascript:") === 0) {
 394 |           // if the link only contains simple text content, it can be converted to a text node
 395 |           if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
 396 |             var text = this._doc.createTextNode(link.textContent);
 397 |             link.parentNode.replaceChild(text, link);
 398 |           } else {
 399 |             // if the link has multiple children, they should all be preserved
 400 |             var container = this._doc.createElement("span");
 401 |             while (link.firstChild) {
 402 |               container.appendChild(link.firstChild);
 403 |             }
 404 |             link.parentNode.replaceChild(container, link);
 405 |           }
 406 |         } else {
 407 |           link.setAttribute("href", toAbsoluteURI(href));
 408 |         }
 409 |       }
 410 |     });
 411 | 
 412 |     var medias = this._getAllNodesWithTag(articleContent, [
 413 |       "img", "picture", "figure", "video", "audio", "source",
 414 |     ]);
 415 | 
 416 |     this._forEachNode(medias, function(media) {
 417 |       var src = media.getAttribute("src");
 418 |       var poster = media.getAttribute("poster");
 419 |       var srcset = media.getAttribute("srcset");
 420 | 
 421 |       if (src) {
 422 |         media.setAttribute("src", toAbsoluteURI(src));
 423 |       }
 424 | 
 425 |       if (poster) {
 426 |         media.setAttribute("poster", toAbsoluteURI(poster));
 427 |       }
 428 | 
 429 |       if (srcset) {
 430 |         var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
 431 |           return toAbsoluteURI(p1) + (p2 || "") + p3;
 432 |         });
 433 | 
 434 |         media.setAttribute("srcset", newSrcset);
 435 |       }
 436 |     });
 437 |   },
 438 | 
 439 |   _simplifyNestedElements: function(articleContent) {
 440 |     var node = articleContent;
 441 |     // Walk the nodes via _getNextNode; DIV/SECTION nodes are either removed or replaced by their single child.
 442 |     while (node) {
 443 |       if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
 444 |         if (this._isElementWithoutContent(node)) {
 445 |           node = this._removeAndGetNext(node);
 446 |           continue;
 447 |         } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
 448 |           var child = node.children[0]; // NOTE(review): assumes the helper guarantees exactly one element child - confirm
 449 |           for (var i = 0; i < node.attributes.length; i++) {
 450 |             child.setAttribute(node.attributes[i].name, node.attributes[i].value); // wrapper's value wins on a name clash
 451 |           }
 452 |           node.parentNode.replaceChild(child, node);
 453 |           node = child; // the promoted child is examined again on the next iteration
 454 |           continue;
 455 |         }
 456 |       }
 457 |       // Nothing to simplify here: move on to the next node.
 458 |       node = this._getNextNode(node);
 459 |     }
 460 |   },
 461 | 
 462 |   /**
 463 |    * Derive the article title from doc.title, consulting <h1>/<h2> headings in some cases.
 464 |    *
 465 |    * @return string The chosen title; may be the untouched original title.
 466 |    **/
 467 |   _getArticleTitle: function() {
 468 |     var doc = this._doc;
 469 |     var curTitle = "";
 470 |     var origTitle = "";
 471 | 
 472 |     try {
 473 |       curTitle = origTitle = doc.title.trim();
 474 | 
 475 |       // If they had an element with id "title" in their HTML (NOTE(review): a non-string title may already throw on .trim() above - confirm)
 476 |       if (typeof curTitle !== "string")
 477 |         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
 478 |     } catch (e) {/* ignore exceptions setting the title. */}
 479 | 
 480 |     var titleHadHierarchicalSeparators = false;
 481 |     function wordCount(str) {
 482 |       return str.split(/\\s+/).length;
 483 |     }
 484 | 
 485 |     // If there's a separator in the title, first remove the final part
 486 |     if ((/ [\\|\\-\\\\\\/>»] /).test(curTitle)) {
 487 |       titleHadHierarchicalSeparators = / [\\\\\\/>»] /.test(curTitle);
 488 |       curTitle = origTitle.replace(/(.*)[\\|\\-\\\\\\/>»] .*/gi, "\$1");
 489 | 
 490 |       // If the resulting title is too short (fewer than 3 words), remove
 491 |       // the first part instead:
 492 |       if (wordCount(curTitle) < 3)
 493 |         curTitle = origTitle.replace(/[^\\|\\-\\\\\\/>»]*[\\|\\-\\\\\\/>»](.*)/gi, "\$1");
 494 |     } else if (curTitle.indexOf(": ") !== -1) {
 495 |       // Check if we have an heading containing this exact string, so we
 496 |       // could assume it's the full title.
 497 |       var headings = this._concatNodeLists(
 498 |         doc.getElementsByTagName("h1"),
 499 |         doc.getElementsByTagName("h2")
 500 |       );
 501 |       var trimmedTitle = curTitle.trim();
 502 |       var match = this._someNode(headings, function(heading) {
 503 |         return heading.textContent.trim() === trimmedTitle;
 504 |       });
 505 | 
 506 |       // If we don't, let's extract the title out of the original title string.
 507 |       if (!match) {
 508 |         curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
 509 | 
 510 |         // If the title is now too short (fewer than 3 words), try the first colon instead:
 511 |         if (wordCount(curTitle) < 3) {
 512 |           curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
 513 |           // But if we have too many words (more than 5) before the colon there's something weird
 514 |           // with the titles and the H tags so let's just use the original title instead
 515 |         } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
 516 |           curTitle = origTitle;
 517 |         }
 518 |       }
 519 |     } else if (curTitle.length > 150 || curTitle.length < 15) {
 520 |       var hOnes = doc.getElementsByTagName("h1"); // very long or very short title: fall back to a lone <h1>
 521 | 
 522 |       if (hOnes.length === 1)
 523 |         curTitle = this._getInnerText(hOnes[0]);
 524 |     }
 525 | 
 526 |     curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
 527 |     // If we now have 4 words or fewer as our title, and either no
 528 |     // 'hierarchical' separators (\\, /, > or ») were found in the original
 529 |     // title or we decreased the number of words by more than 1 word, use
 530 |     // the original title.
 531 |     var curTitleWordCount = wordCount(curTitle);
 532 |     if (curTitleWordCount <= 4 &&
 533 |         (!titleHadHierarchicalSeparators ||
 534 |          curTitleWordCount != wordCount(origTitle.replace(/[\\|\\-\\\\\\/>»]+/g, "")) - 1)) {
 535 |       curTitle = origTitle;
 536 |     }
 537 | 
 538 |     return curTitle;
 539 |   },
 540 | 
 541 |   /**
 542 |    * Prepare the HTML document for readability to scrape it.
 543 |    * This includes things like stripping javascript, CSS, and handling terrible markup.
 544 |    *
 545 |    * @return void
 546 |    **/
 547 |   _prepDocument: function() {
 548 |     var doc = this._doc;
 549 | 
 550 |     // Remove all style tags in head
 551 |     this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
 552 | 
 553 |     if (doc.body) {
 554 |       this._replaceBrs(doc.body);
 555 |     }
 556 | 
 557 |     this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
 558 |   },
 559 | 
 560 |   /**
 561 |    * Finds the next node, starting from the given node, and ignoring
 562 |    * whitespace in between. If the given node is an element, the same node is
 563 |    * returned.
 564 |    */
 565 |   _nextNode: function (node) {
 566 |     var next = node;
 567 |     while (next
 568 |         && (next.nodeType != this.ELEMENT_NODE)
 569 |         && this.REGEXPS.whitespace.test(next.textContent)) {
 570 |       next = next.nextSibling;
 571 |     }
 572 |     return next;
 573 |   },
 574 | 
 575 |   /**
 576 |    * Replaces 2 or more successive <br> elements with a single <p>.
 577 |    * Whitespace between <br> elements is ignored. For example:
 578 |    *   <div>foo<br>bar<br> <br><br>abc</div>
 579 |    * will become:
 580 |    *   <div>foo<br>bar<p>abc</p></div>
 581 |    */
 582 |   _replaceBrs: function (elem) {
 583 |     this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
 584 |       var next = br.nextSibling;
 585 | 
 586 |       // Whether 2 or more <br> elements have been found and replaced with a
 587 |       // <p> block.
 588 |       var replaced = false;
 589 | 
 590 |       // If we find a <br> chain, remove the <br>s until we hit another node
 591 |       // or non-whitespace. This leaves behind the first <br> in the chain
 592 |       // (which will be replaced with a <p> later).
 593 |       while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
 594 |         replaced = true;
 595 |         var brSibling = next.nextSibling;
 596 |         next.parentNode.removeChild(next);
 597 |         next = brSibling;
 598 |       }
 599 | 
 600 |       // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 601 |       // all sibling nodes as children of the <p> until we hit another <br>
 602 |       // chain.
 603 |       if (replaced) {
 604 |         var p = this._doc.createElement("p");
 605 |         br.parentNode.replaceChild(p, br);
 606 | 
 607 |         next = p.nextSibling;
 608 |         while (next) {
 609 |           // If we've hit another <br><br>, we're done adding children to this <p>.
 610 |           if (next.tagName == "BR") {
 611 |             var nextElem = this._nextNode(next.nextSibling);
 612 |             if (nextElem && nextElem.tagName == "BR")
 613 |               break;
 614 |           }
 615 | 
 616 |           if (!this._isPhrasingContent(next))
 617 |             break;
 618 | 
 619 |           // Otherwise, make this node a child of the new <p>.
 620 |           var sibling = next.nextSibling;
 621 |           p.appendChild(next);
 622 |           next = sibling;
 623 |         }
 624 | 
 625 |         while (p.lastChild && this._isWhitespace(p.lastChild)) {
 626 |           p.removeChild(p.lastChild);
 627 |         }
 628 | 
 629 |         if (p.parentNode.tagName === "P")
 630 |           this._setNodeTag(p.parentNode, "DIV");
 631 |       }
 632 |     });
 633 |   },
 634 | 
 635 |   _setNodeTag: function (node, tag) {
 636 |     this.log("_setNodeTag", node, tag);
 637 |     if (this._docJSDOMParser) {
 638 |       node.localName = tag.toLowerCase();
 639 |       node.tagName = tag.toUpperCase();
 640 |       return node;
 641 |     }
 642 | 
 643 |     var replacement = node.ownerDocument.createElement(tag);
 644 |     while (node.firstChild) {
 645 |       replacement.appendChild(node.firstChild);
 646 |     }
 647 |     node.parentNode.replaceChild(replacement, node);
 648 |     if (node.readability)
 649 |       replacement.readability = node.readability;
 650 | 
 651 |     for (var i = 0; i < node.attributes.length; i++) {
 652 |       try {
 653 |         replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
 654 |       } catch (ex) {
 655 |         /* it's possible for setAttribute() to throw if the attribute name
 656 |          * isn't a valid XML Name. Such attributes can however be parsed from
 657 |          * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
 658 |          * so we can hit them here and then throw. We don't care about such
 659 |          * attributes so we ignore them.
 660 |          */
 661 |       }
 662 |     }
 663 |     return replacement;
 664 |   },
 665 | 
 666 |   /**
 667 |    * Prepare the article node for display. Clean out any inline styles,
 668 |    * iframes, forms, strip extraneous <p> tags, etc.
 669 |    *
 670 |    * @param Element
 671 |    * @return void
 672 |    **/
 673 |   _prepArticle: function(articleContent) {
 674 |     this._cleanStyles(articleContent);
 675 | 
 676 |     // Check for data tables before we continue, to avoid removing items in
 677 |     // those tables, which will often be isolated even though they're
 678 |     // visually linked to other content-ful elements (text, images, etc.).
 679 |     this._markDataTables(articleContent);
 680 | 
 681 |     this._fixLazyImages(articleContent);
 682 | 
 683 |     // Clean out junk from the article content
 684 |     this._cleanConditionally(articleContent, "form");
 685 |     this._cleanConditionally(articleContent, "fieldset");
 686 |     this._clean(articleContent, "object");
 687 |     this._clean(articleContent, "embed");
 688 |     this._clean(articleContent, "footer");
 689 |     this._clean(articleContent, "link");
 690 |     this._clean(articleContent, "aside");
 691 | 
 692 |     // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
 693 |     // which means we don't remove the top candidates even they have "share".
 694 | 
 695 |     var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
 696 | 
 697 |     this._forEachNode(articleContent.children, function (topCandidate) {
 698 |       this._cleanMatchedNodes(topCandidate, function (node, matchString) {
 699 |         return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
 700 |       });
 701 |     });
 702 | 
 703 |     this._clean(articleContent, "iframe");
 704 |     this._clean(articleContent, "input");
 705 |     this._clean(articleContent, "textarea");
 706 |     this._clean(articleContent, "select");
 707 |     this._clean(articleContent, "button");
 708 |     this._cleanHeaders(articleContent);
 709 | 
 710 |     // Do these last as the previous stuff may have removed junk
 711 |     // that will affect these
 712 |     this._cleanConditionally(articleContent, "table");
 713 |     this._cleanConditionally(articleContent, "ul");
 714 |     this._cleanConditionally(articleContent, "div");
 715 | 
 716 |     // replace H1 with H2 as H1 should be only title that is displayed separately
 717 |     this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");
 718 | 
 719 |     // Remove extra paragraphs
 720 |     this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
 721 |       var imgCount = paragraph.getElementsByTagName("img").length;
 722 |       var embedCount = paragraph.getElementsByTagName("embed").length;
 723 |       var objectCount = paragraph.getElementsByTagName("object").length;
 724 |       // At this point, nasty iframes have been removed, only remain embedded video ones.
 725 |       var iframeCount = paragraph.getElementsByTagName("iframe").length;
 726 |       var totalCount = imgCount + embedCount + objectCount + iframeCount;
 727 | 
 728 |       return totalCount === 0 && !this._getInnerText(paragraph, false);
 729 |     });
 730 | 
 731 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
 732 |       var next = this._nextNode(br.nextSibling);
 733 |       if (next && next.tagName == "P")
 734 |         br.parentNode.removeChild(br);
 735 |     });
 736 | 
 737 |     // Remove single-cell tables
 738 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
 739 |       var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
 740 |       if (this._hasSingleTagInsideElement(tbody, "TR")) {
 741 |         var row = tbody.firstElementChild;
 742 |         if (this._hasSingleTagInsideElement(row, "TD")) {
 743 |           var cell = row.firstElementChild;
 744 |           cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
 745 |           table.parentNode.replaceChild(cell, table);
 746 |         }
 747 |       }
 748 |     });
 749 |   },
 750 | 
 751 |   /**
 752 |    * Initialize a node with the readability object. Also checks the
 753 |    * className/id for special names to add to its score.
 754 |    *
 755 |    * @param Element
 756 |    * @return void
 757 |    **/
 758 |   _initializeNode: function(node) {
 759 |     node.readability = {"contentScore": 0};
 760 | 
 761 |     switch (node.tagName) {
 762 |       case "DIV":
 763 |         node.readability.contentScore += 5;
 764 |         break;
 765 | 
 766 |       case "PRE":
 767 |       case "TD":
 768 |       case "BLOCKQUOTE":
 769 |         node.readability.contentScore += 3;
 770 |         break;
 771 | 
 772 |       case "ADDRESS":
 773 |       case "OL":
 774 |       case "UL":
 775 |       case "DL":
 776 |       case "DD":
 777 |       case "DT":
 778 |       case "LI":
 779 |       case "FORM":
 780 |         node.readability.contentScore -= 3;
 781 |         break;
 782 | 
 783 |       case "H1":
 784 |       case "H2":
 785 |       case "H3":
 786 |       case "H4":
 787 |       case "H5":
 788 |       case "H6":
 789 |       case "TH":
 790 |         node.readability.contentScore -= 5;
 791 |         break;
 792 |     }
 793 | 
 794 |     node.readability.contentScore += this._getClassWeight(node);
 795 |   },
 796 | 
 797 |   _removeAndGetNext: function(node) {
 798 |     var nextNode = this._getNextNode(node, true);
 799 |     node.parentNode.removeChild(node);
 800 |     return nextNode;
 801 |   },
 802 | 
 803 |   /**
 804 |    * Traverse the DOM from node to node, starting at the node passed in.
 805 |    * Pass true for the second parameter to indicate this node itself
 806 |    * (and its kids) are going away, and we want the next node over.
 807 |    *
 808 |    * Calling this in a loop will traverse the DOM depth-first.
 809 |    */
 810 |   _getNextNode: function(node, ignoreSelfAndKids) {
 811 |     // First check for kids if those aren't being ignored
 812 |     if (!ignoreSelfAndKids && node.firstElementChild) {
 813 |       return node.firstElementChild;
 814 |     }
 815 |     // Then for siblings...
 816 |     if (node.nextElementSibling) {
 817 |       return node.nextElementSibling;
 818 |     }
 819 |     // And finally, move up the parent chain *and* find a sibling
 820 |     // (because this is depth-first traversal, we will have already
 821 |     // seen the parent nodes themselves).
 822 |     do {
 823 |       node = node.parentNode;
 824 |     } while (node && !node.nextElementSibling);
 825 |     return node && node.nextElementSibling;
 826 |   },
 827 | 
 828 |   // compares second text to first one
 829 |   // 1 = same text, 0 = completely different text
 830 |   // works the way that it splits both texts into words and then finds words that are unique in second text
 831 |   // the result is given by the lower length of unique parts
 832 |   _textSimilarity: function(textA, textB) {
 833 |     var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 834 |     var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 835 |     if (!tokensA.length || !tokensB.length) {
 836 |       return 0;
 837 |     }
 838 |     var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
 839 |     var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
 840 |     return 1 - distanceB;
 841 |   },
 842 | 
 843 |   _checkByline: function(node, matchString) {
 844 |     if (this._articleByline) {
 845 |       return false;
 846 |     }
 847 | 
 848 |     if (node.getAttribute !== undefined) {
 849 |       var rel = node.getAttribute("rel");
 850 |       var itemprop = node.getAttribute("itemprop");
 851 |     }
 852 | 
 853 |     if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
 854 |       this._articleByline = node.textContent.trim();
 855 |       return true;
 856 |     }
 857 | 
 858 |     return false;
 859 |   },
 860 | 
 861 |   _getNodeAncestors: function(node, maxDepth) {
 862 |     maxDepth = maxDepth || 0;
 863 |     var i = 0, ancestors = [];
 864 |     while (node.parentNode) {
 865 |       ancestors.push(node.parentNode);
 866 |       if (maxDepth && ++i === maxDepth)
 867 |         break;
 868 |       node = node.parentNode;
 869 |     }
 870 |     return ancestors;
 871 |   },
 872 | 
 873 |   /***
 874 |    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 875 |    * most likely to be the stuff a user wants to read.
Then return it wrapped up in a div. 876 | * 877 | * @param page a document to run upon. Needs to be a full document, complete with body. 878 | * @return Element 879 | **/ 880 | _grabArticle: function (page) { 881 | this.log("**** grabArticle ****"); 882 | var doc = this._doc; 883 | var isPaging = page !== null; 884 | page = page ? page : this._doc.body; 885 | 886 | // We can't grab an article if we don't have a page! 887 | if (!page) { 888 | this.log("No body found in document. Abort."); 889 | return null; 890 | } 891 | 892 | var pageCacheHtml = page.innerHTML; 893 | 894 | while (true) { 895 | this.log("Starting grabArticle loop"); 896 | var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); 897 | 898 | // First, node prepping. Trash nodes that look cruddy (like ones with the 899 | // class name "comment", etc), and turn divs into P tags where they have been 900 | // used inappropriately (as in, where they contain no other block level elements.) 901 | var elementsToScore = []; 902 | var node = this._doc.documentElement; 903 | 904 | let shouldRemoveTitleHeader = true; 905 | 906 | while (node) { 907 | 908 | if (node.tagName === "HTML") { 909 | this._articleLang = node.getAttribute("lang"); 910 | } 911 | 912 | var matchString = node.className + " " + node.id; 913 | 914 | if (!this._isProbablyVisible(node)) { 915 | this.log("Removing hidden node - " + matchString); 916 | node = this._removeAndGetNext(node); 917 | continue; 918 | } 919 | 920 | // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" 921 | if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { 922 | node = this._removeAndGetNext(node); 923 | continue; 924 | } 925 | 926 | // Check to see if this node is a byline, and remove it if it is. 
927 | if (this._checkByline(node, matchString)) { 928 | node = this._removeAndGetNext(node); 929 | continue; 930 | } 931 | 932 | if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { 933 | this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); 934 | shouldRemoveTitleHeader = false; 935 | node = this._removeAndGetNext(node); 936 | continue; 937 | } 938 | 939 | // Remove unlikely candidates 940 | if (stripUnlikelyCandidates) { 941 | if (this.REGEXPS.unlikelyCandidates.test(matchString) && 942 | !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 943 | !this._hasAncestorTag(node, "table") && 944 | !this._hasAncestorTag(node, "code") && 945 | node.tagName !== "BODY" && 946 | node.tagName !== "A") { 947 | this.log("Removing unlikely candidate - " + matchString); 948 | node = this._removeAndGetNext(node); 949 | continue; 950 | } 951 | 952 | if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { 953 | this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); 954 | node = this._removeAndGetNext(node); 955 | continue; 956 | } 957 | } 958 | 959 | // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). 960 | if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || 961 | node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || 962 | node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && 963 | this._isElementWithoutContent(node)) { 964 | node = this._removeAndGetNext(node); 965 | continue; 966 | } 967 | 968 | if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { 969 | elementsToScore.push(node); 970 | } 971 | 972 | // Turn all divs that don't have children block level elements into p's 973 | if (node.tagName === "DIV") { 974 | // Put phrasing content into paragraphs. 
975 | var p = null; 976 | var childNode = node.firstChild; 977 | while (childNode) { 978 | var nextSibling = childNode.nextSibling; 979 | if (this._isPhrasingContent(childNode)) { 980 | if (p !== null) { 981 | p.appendChild(childNode); 982 | } else if (!this._isWhitespace(childNode)) { 983 | p = doc.createElement("p"); 984 | node.replaceChild(p, childNode); 985 | p.appendChild(childNode); 986 | } 987 | } else if (p !== null) { 988 | while (p.lastChild && this._isWhitespace(p.lastChild)) { 989 | p.removeChild(p.lastChild); 990 | } 991 | p = null; 992 | } 993 | childNode = nextSibling; 994 | } 995 | 996 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV 997 | // element. DIVs with only a P element inside and no text content can be 998 | // safely converted into plain P elements to avoid confusing the scoring 999 | // algorithm with DIVs with are, in practice, paragraphs. 1000 | if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { 1001 | var newNode = node.children[0]; 1002 | node.parentNode.replaceChild(newNode, node); 1003 | node = newNode; 1004 | elementsToScore.push(node); 1005 | } else if (!this._hasChildBlockElement(node)) { 1006 | node = this._setNodeTag(node, "P"); 1007 | elementsToScore.push(node); 1008 | } 1009 | } 1010 | node = this._getNextNode(node); 1011 | } 1012 | 1013 | /** 1014 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. 1015 | * Then add their score to their parent node. 1016 | * 1017 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 1018 | **/ 1019 | var candidates = []; 1020 | this._forEachNode(elementsToScore, function(elementToScore) { 1021 | if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") 1022 | return; 1023 | 1024 | // If this paragraph is less than 25 characters, don't even count it. 
1025 | var innerText = this._getInnerText(elementToScore); 1026 | if (innerText.length < 25) 1027 | return; 1028 | 1029 | // Exclude nodes with no ancestor. 1030 | var ancestors = this._getNodeAncestors(elementToScore, 5); 1031 | if (ancestors.length === 0) 1032 | return; 1033 | 1034 | var contentScore = 0; 1035 | 1036 | // Add a point for the paragraph itself as a base. 1037 | contentScore += 1; 1038 | 1039 | // Add points for any commas within this paragraph. 1040 | contentScore += innerText.split(this.REGEXPS.commas).length; 1041 | 1042 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 1043 | contentScore += Math.min(Math.floor(innerText.length / 100), 3); 1044 | 1045 | // Initialize and score ancestors. 1046 | this._forEachNode(ancestors, function(ancestor, level) { 1047 | if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") 1048 | return; 1049 | 1050 | if (typeof(ancestor.readability) === "undefined") { 1051 | this._initializeNode(ancestor); 1052 | candidates.push(ancestor); 1053 | } 1054 | 1055 | // Node score divider: 1056 | // - parent: 1 (no division) 1057 | // - grandparent: 2 1058 | // - great grandparent+: ancestor level * 3 1059 | if (level === 0) 1060 | var scoreDivider = 1; 1061 | else if (level === 1) 1062 | scoreDivider = 2; 1063 | else 1064 | scoreDivider = level * 3; 1065 | ancestor.readability.contentScore += contentScore / scoreDivider; 1066 | }); 1067 | }); 1068 | 1069 | // After we've calculated scores, loop through all of the possible 1070 | // candidate nodes we found and find the one with the highest score. 1071 | var topCandidates = []; 1072 | for (var c = 0, cl = candidates.length; c < cl; c += 1) { 1073 | var candidate = candidates[c]; 1074 | 1075 | // Scale the final candidates score based on link density. Good content 1076 | // should have a relatively small link density (5% or less) and be mostly 1077 | // unaffected by this operation. 
1078 | var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); 1079 | candidate.readability.contentScore = candidateScore; 1080 | 1081 | this.log("Candidate:", candidate, "with score " + candidateScore); 1082 | 1083 | for (var t = 0; t < this._nbTopCandidates; t++) { 1084 | var aTopCandidate = topCandidates[t]; 1085 | 1086 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { 1087 | topCandidates.splice(t, 0, candidate); 1088 | if (topCandidates.length > this._nbTopCandidates) 1089 | topCandidates.pop(); 1090 | break; 1091 | } 1092 | } 1093 | } 1094 | 1095 | var topCandidate = topCandidates[0] || null; 1096 | var neededToCreateTopCandidate = false; 1097 | var parentOfTopCandidate; 1098 | 1099 | // If we still have no top candidate, just use the body as a last resort. 1100 | // We also have to copy the body node so it is something we can modify. 1101 | if (topCandidate === null || topCandidate.tagName === "BODY") { 1102 | // Move all of the page's children into topCandidate 1103 | topCandidate = doc.createElement("DIV"); 1104 | neededToCreateTopCandidate = true; 1105 | // Move everything (not just elements, also text nodes etc.) into the container 1106 | // so we even include text directly in the body: 1107 | while (page.firstChild) { 1108 | this.log("Moving child out:", page.firstChild); 1109 | topCandidate.appendChild(page.firstChild); 1110 | } 1111 | 1112 | page.appendChild(topCandidate); 1113 | 1114 | this._initializeNode(topCandidate); 1115 | } else if (topCandidate) { 1116 | // Find a better top candidate node if it contains (at least three) nodes which belong to \`topCandidates\` array 1117 | // and whose scores are quite closed with current \`topCandidate\` node. 
1118 | var alternativeCandidateAncestors = []; 1119 | for (var i = 1; i < topCandidates.length; i++) { 1120 | if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { 1121 | alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); 1122 | } 1123 | } 1124 | var MINIMUM_TOPCANDIDATES = 3; 1125 | if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { 1126 | parentOfTopCandidate = topCandidate.parentNode; 1127 | while (parentOfTopCandidate.tagName !== "BODY") { 1128 | var listsContainingThisAncestor = 0; 1129 | for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { 1130 | listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); 1131 | } 1132 | if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { 1133 | topCandidate = parentOfTopCandidate; 1134 | break; 1135 | } 1136 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 1137 | } 1138 | } 1139 | if (!topCandidate.readability) { 1140 | this._initializeNode(topCandidate); 1141 | } 1142 | 1143 | // Because of our bonus system, parents of candidates might have scores 1144 | // themselves. They get half of the node. There won't be nodes with higher 1145 | // scores than our topCandidate, but if we see the score going *up* in the first 1146 | // few steps up the tree, that's a decent sign that there might be more content 1147 | // lurking in other places that we want to unify in. The sibling stuff 1148 | // below does some of that - but only if we've looked high enough up the DOM 1149 | // tree. 1150 | parentOfTopCandidate = topCandidate.parentNode; 1151 | var lastScore = topCandidate.readability.contentScore; 1152 | // The scores shouldn't get too low. 
1153 | var scoreThreshold = lastScore / 3; 1154 | while (parentOfTopCandidate.tagName !== "BODY") { 1155 | if (!parentOfTopCandidate.readability) { 1156 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 1157 | continue; 1158 | } 1159 | var parentScore = parentOfTopCandidate.readability.contentScore; 1160 | if (parentScore < scoreThreshold) 1161 | break; 1162 | if (parentScore > lastScore) { 1163 | // Alright! We found a better parent to use. 1164 | topCandidate = parentOfTopCandidate; 1165 | break; 1166 | } 1167 | lastScore = parentOfTopCandidate.readability.contentScore; 1168 | parentOfTopCandidate = parentOfTopCandidate.parentNode; 1169 | } 1170 | 1171 | // If the top candidate is the only child, use parent instead. This will help sibling 1172 | // joining logic when adjacent content is actually located in parent's sibling node. 1173 | parentOfTopCandidate = topCandidate.parentNode; 1174 | while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { 1175 | topCandidate = parentOfTopCandidate; 1176 | parentOfTopCandidate = topCandidate.parentNode; 1177 | } 1178 | if (!topCandidate.readability) { 1179 | this._initializeNode(topCandidate); 1180 | } 1181 | } 1182 | 1183 | // Now that we have the top candidate, look through its siblings for content 1184 | // that might also be related. Things like preambles, content split by ads 1185 | // that we removed, etc. 1186 | var articleContent = doc.createElement("DIV"); 1187 | if (isPaging) 1188 | articleContent.id = "readability-content"; 1189 | 1190 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 1191 | // Keep potential top candidate's parent node to try to get text direction of it later. 
1192 | parentOfTopCandidate = topCandidate.parentNode; 1193 | var siblings = parentOfTopCandidate.children; 1194 | 1195 | for (var s = 0, sl = siblings.length; s < sl; s++) { 1196 | var sibling = siblings[s]; 1197 | var append = false; 1198 | 1199 | this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); 1200 | this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); 1201 | 1202 | if (sibling === topCandidate) { 1203 | append = true; 1204 | } else { 1205 | var contentBonus = 0; 1206 | 1207 | // Give a bonus if sibling nodes and top candidates have the example same classname 1208 | if (sibling.className === topCandidate.className && topCandidate.className !== "") 1209 | contentBonus += topCandidate.readability.contentScore * 0.2; 1210 | 1211 | if (sibling.readability && 1212 | ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { 1213 | append = true; 1214 | } else if (sibling.nodeName === "P") { 1215 | var linkDensity = this._getLinkDensity(sibling); 1216 | var nodeContent = this._getInnerText(sibling); 1217 | var nodeLength = nodeContent.length; 1218 | 1219 | if (nodeLength > 80 && linkDensity < 0.25) { 1220 | append = true; 1221 | } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && 1222 | nodeContent.search(/\\.( |\$)/) !== -1) { 1223 | append = true; 1224 | } 1225 | } 1226 | } 1227 | 1228 | if (append) { 1229 | this.log("Appending node:", sibling); 1230 | 1231 | if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { 1232 | // We have a node that isn't a common block level element, like a form or td tag. 1233 | // Turn it into a div so it doesn't get filtered out later by accident. 
1234 | this.log("Altering sibling:", sibling, "to div."); 1235 | 1236 | sibling = this._setNodeTag(sibling, "DIV"); 1237 | } 1238 | 1239 | articleContent.appendChild(sibling); 1240 | // Fetch children again to make it compatible 1241 | // with DOM parsers without live collection support. 1242 | siblings = parentOfTopCandidate.children; 1243 | // siblings is a reference to the children array, and 1244 | // sibling is removed from the array when we call appendChild(). 1245 | // As a result, we must revisit this index since the nodes 1246 | // have been shifted. 1247 | s -= 1; 1248 | sl -= 1; 1249 | } 1250 | } 1251 | 1252 | if (this._debug) 1253 | this.log("Article content pre-prep: " + articleContent.innerHTML); 1254 | // So we have all of the content that we need. Now we clean it up for presentation. 1255 | this._prepArticle(articleContent); 1256 | if (this._debug) 1257 | this.log("Article content post-prep: " + articleContent.innerHTML); 1258 | 1259 | if (neededToCreateTopCandidate) { 1260 | // We already created a fake div thing, and there wouldn't have been any siblings left 1261 | // for the previous loop, so there's no point trying to create a new div, and then 1262 | // move all the children over. Just assign IDs and class names here. No need to append 1263 | // because that already happened anyway. 1264 | topCandidate.id = "readability-page-1"; 1265 | topCandidate.className = "page"; 1266 | } else { 1267 | var div = doc.createElement("DIV"); 1268 | div.id = "readability-page-1"; 1269 | div.className = "page"; 1270 | while (articleContent.firstChild) { 1271 | div.appendChild(articleContent.firstChild); 1272 | } 1273 | articleContent.appendChild(div); 1274 | } 1275 | 1276 | if (this._debug) 1277 | this.log("Article content after paging: " + articleContent.innerHTML); 1278 | 1279 | var parseSuccessful = true; 1280 | 1281 | // Now that we've gone through the full algorithm, check to see if 1282 | // we got any meaningful content. 
If we didn't, we may need to re-run 1283 | // grabArticle with different flags set. This gives us a higher likelihood of 1284 | // finding the content, and the sieve approach gives us a higher likelihood of 1285 | // finding the -right- content. 1286 | var textLength = this._getInnerText(articleContent, true).length; 1287 | if (textLength < this._charThreshold) { 1288 | parseSuccessful = false; 1289 | page.innerHTML = pageCacheHtml; 1290 | 1291 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 1292 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 1293 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1294 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 1295 | this._removeFlag(this.FLAG_WEIGHT_CLASSES); 1296 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1297 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 1298 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 1299 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1300 | } else { 1301 | this._attempts.push({articleContent: articleContent, textLength: textLength}); 1302 | // No luck after removing flags, just return the longest text we found during the different loops 1303 | this._attempts.sort(function (a, b) { 1304 | return b.textLength - a.textLength; 1305 | }); 1306 | 1307 | // But first check if we actually have something 1308 | if (!this._attempts[0].textLength) { 1309 | return null; 1310 | } 1311 | 1312 | articleContent = this._attempts[0].articleContent; 1313 | parseSuccessful = true; 1314 | } 1315 | } 1316 | 1317 | if (parseSuccessful) { 1318 | // Find out text direction from ancestors of final top candidate. 
1319 | var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); 1320 | this._someNode(ancestors, function(ancestor) { 1321 | if (!ancestor.tagName) 1322 | return false; 1323 | var articleDir = ancestor.getAttribute("dir"); 1324 | if (articleDir) { 1325 | this._articleDir = articleDir; 1326 | return true; 1327 | } 1328 | return false; 1329 | }); 1330 | return articleContent; 1331 | } 1332 | } 1333 | }, 1334 | 1335 | /** 1336 | * Check whether the input string could be a byline. 1337 | * This verifies that the input is a string, and that the length 1338 | * is less than 100 chars. 1339 | * 1340 | * @param possibleByline {string} - a string to check whether its a byline. 1341 | * @return Boolean - whether the input string is a byline. 1342 | */ 1343 | _isValidByline: function(byline) { 1344 | if (typeof byline == "string" || byline instanceof String) { 1345 | byline = byline.trim(); 1346 | return (byline.length > 0) && (byline.length < 100); 1347 | } 1348 | return false; 1349 | }, 1350 | 1351 | /** 1352 | * Converts some of the common HTML entities in string to their corresponding characters. 1353 | * 1354 | * @param str {string} - a string to unescape. 1355 | * @return string without HTML entity. 1356 | */ 1357 | _unescapeHtmlEntities: function(str) { 1358 | if (!str) { 1359 | return str; 1360 | } 1361 | 1362 | var htmlEscapeMap = this.HTML_ESCAPE_MAP; 1363 | return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { 1364 | return htmlEscapeMap[tag]; 1365 | }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { 1366 | var num = parseInt(hex || numStr, hex ? 16 : 10); 1367 | return String.fromCharCode(num); 1368 | }); 1369 | }, 1370 | 1371 | /** 1372 | * Try to extract metadata from JSON-LD object. 1373 | * For now, only Schema.org objects of type Article or its subtypes are supported. 
1374 | * @return Object with any metadata that could be extracted (possibly none) 1375 | */ 1376 | _getJSONLD: function (doc) { 1377 | var scripts = this._getAllNodesWithTag(doc, ["script"]); 1378 | 1379 | var metadata; 1380 | 1381 | this._forEachNode(scripts, function(jsonLdElement) { 1382 | if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { 1383 | try { 1384 | // Strip CDATA markers if present 1385 | var content = jsonLdElement.textContent.replace(/^\\s*\\s*\$/g, ""); 1386 | var parsed = JSON.parse(content); 1387 | if ( 1388 | !parsed["@context"] || 1389 | !parsed["@context"].match(/^https?\\:\\/\\/schema\\.org\\/?\$/) 1390 | ) { 1391 | return; 1392 | } 1393 | 1394 | if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { 1395 | parsed = parsed["@graph"].find(function(it) { 1396 | return (it["@type"] || "").match( 1397 | this.REGEXPS.jsonLdArticleTypes 1398 | ); 1399 | }); 1400 | } 1401 | 1402 | if ( 1403 | !parsed || 1404 | !parsed["@type"] || 1405 | !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1406 | ) { 1407 | return; 1408 | } 1409 | 1410 | metadata = {}; 1411 | 1412 | if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { 1413 | // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz 1414 | // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either 1415 | // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. 
1416 | 1417 | var title = this._getArticleTitle(); 1418 | var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; 1419 | var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; 1420 | 1421 | if (headlineMatches && !nameMatches) { 1422 | metadata.title = parsed.headline; 1423 | } else { 1424 | metadata.title = parsed.name; 1425 | } 1426 | } else if (typeof parsed.name === "string") { 1427 | metadata.title = parsed.name.trim(); 1428 | } else if (typeof parsed.headline === "string") { 1429 | metadata.title = parsed.headline.trim(); 1430 | } 1431 | if (parsed.author) { 1432 | if (typeof parsed.author.name === "string") { 1433 | metadata.byline = parsed.author.name.trim(); 1434 | } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { 1435 | metadata.byline = parsed.author 1436 | .filter(function(author) { 1437 | return author && typeof author.name === "string"; 1438 | }) 1439 | .map(function(author) { 1440 | return author.name.trim(); 1441 | }) 1442 | .join(", "); 1443 | } 1444 | } 1445 | if (typeof parsed.description === "string") { 1446 | metadata.excerpt = parsed.description.trim(); 1447 | } 1448 | if ( 1449 | parsed.publisher && 1450 | typeof parsed.publisher.name === "string" 1451 | ) { 1452 | metadata.siteName = parsed.publisher.name.trim(); 1453 | } 1454 | if (typeof parsed.datePublished === "string") { 1455 | metadata.datePublished = parsed.datePublished.trim(); 1456 | } 1457 | return; 1458 | } catch (err) { 1459 | this.log(err.message); 1460 | } 1461 | } 1462 | }); 1463 | return metadata ? metadata : {}; 1464 | }, 1465 | 1466 | /** 1467 | * Attempts to get excerpt and byline metadata for the article. 1468 | * 1469 | * @param {Object} jsonld — object containing any metadata that 1470 | * could be extracted from JSON-LD object. 
1471 | * 1472 | * @return Object with optional "excerpt" and "byline" properties 1473 | */ 1474 | _getArticleMetadata: function(jsonld) { 1475 | var metadata = {}; 1476 | var values = {}; 1477 | var metaElements = this._doc.getElementsByTagName("meta"); 1478 | 1479 | // property is a space-separated list of values 1480 | var propertyPattern = /\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi; 1481 | 1482 | // name is a single value 1483 | var namePattern = /^\\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\\s*[-\\.:]\\s*)?(author|creator|pub-date|description|title|site_name)\\s*\$/i; 1484 | 1485 | // Find description tags. 1486 | this._forEachNode(metaElements, function(element) { 1487 | var elementName = element.getAttribute("name"); 1488 | var elementProperty = element.getAttribute("property"); 1489 | var content = element.getAttribute("content"); 1490 | if (!content) { 1491 | return; 1492 | } 1493 | var matches = null; 1494 | var name = null; 1495 | 1496 | if (elementProperty) { 1497 | matches = elementProperty.match(propertyPattern); 1498 | if (matches) { 1499 | // Convert to lowercase, and remove any whitespace 1500 | // so we can match below. 1501 | name = matches[0].toLowerCase().replace(/\\s/g, ""); 1502 | // multiple authors 1503 | values[name] = content.trim(); 1504 | } 1505 | } 1506 | if (!matches && elementName && namePattern.test(elementName)) { 1507 | name = elementName; 1508 | if (content) { 1509 | // Convert to lowercase, remove any whitespace, and convert dots 1510 | // to colons so we can match below. 
1511 | name = name.toLowerCase().replace(/\\s/g, "").replace(/\\./g, ":"); 1512 | values[name] = content.trim(); 1513 | } 1514 | } 1515 | }); 1516 | 1517 | // get title 1518 | metadata.title = jsonld.title || 1519 | values["dc:title"] || 1520 | values["dcterm:title"] || 1521 | values["og:title"] || 1522 | values["weibo:article:title"] || 1523 | values["weibo:webpage:title"] || 1524 | values["title"] || 1525 | values["twitter:title"] || 1526 | values["parsely-title"]; 1527 | 1528 | if (!metadata.title) { 1529 | metadata.title = this._getArticleTitle(); 1530 | } 1531 | 1532 | // get author 1533 | metadata.byline = jsonld.byline || 1534 | values["dc:creator"] || 1535 | values["dcterm:creator"] || 1536 | values["author"] || 1537 | values["parsely-author"]; 1538 | 1539 | // get description 1540 | metadata.excerpt = jsonld.excerpt || 1541 | values["dc:description"] || 1542 | values["dcterm:description"] || 1543 | values["og:description"] || 1544 | values["weibo:article:description"] || 1545 | values["weibo:webpage:description"] || 1546 | values["description"] || 1547 | values["twitter:description"]; 1548 | 1549 | // get site name 1550 | metadata.siteName = jsonld.siteName || 1551 | values["og:site_name"]; 1552 | 1553 | // get article published time 1554 | metadata.publishedTime = jsonld.datePublished || 1555 | values["article:published_time"] || 1556 | values["parsely-pub-date"] || 1557 | null; 1558 | 1559 | // in many sites the meta value is escaped with HTML entities, 1560 | // so here we need to unescape it 1561 | metadata.title = this._unescapeHtmlEntities(metadata.title); 1562 | metadata.byline = this._unescapeHtmlEntities(metadata.byline); 1563 | metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); 1564 | metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); 1565 | metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); 1566 | 1567 | return metadata; 1568 | }, 1569 | 1570 | /** 1571 | * Check if node is image, or if 
node contains exactly only one image 1572 | * whether as a direct child or as its descendants. 1573 | * 1574 | * @param Element 1575 | **/ 1576 | _isSingleImage: function(node) { 1577 | if (node.tagName === "IMG") { 1578 | return true; 1579 | } 1580 | 1581 | if (node.children.length !== 1 || node.textContent.trim() !== "") { 1582 | return false; 1583 | } 1584 | 1585 | return this._isSingleImage(node.children[0]); 1586 | }, 1587 | 1588 | /** 1589 | * Find all