├── .editorconfig
├── .gitignore
├── .prettierrc
├── README.md
├── package-lock.json
├── package.json
├── renovate.json
├── src
    ├── distiller.ts
    ├── index.ts
    └── third_party
    │   ├── dom-distiller
    │       ├── README.md
    │       └── domdistiller.ts
    │   ├── readability
    │       ├── LICENSE.md
    │       └── readability.ts
    │   └── turndown-client
    │       ├── LISENCE
    │       ├── README.md
    │       ├── turndown-plugin-gfm.ts
    │       └── turndown.ts
├── test
    ├── index.spec.ts
    └── tsconfig.json
├── tsconfig.json
├── vitest.config.ts
├── worker-configuration.d.ts
└── wrangler.toml


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | root = true
 3 | 
 4 | [*]
 5 | indent_style = tab
 6 | end_of_line = lf
 7 | charset = utf-8
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | 
11 | [*.yml]
12 | indent_style = space
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | 
  3 | logs
  4 | *.log
  5 | npm-debug.log*
  6 | yarn-debug.log*
  7 | yarn-error.log*
  8 | lerna-debug.log*
  9 | .pnpm-debug.log*
 10 | 
 11 | # Diagnostic reports (https://nodejs.org/api/report.html)
 12 | 
 13 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 14 | 
 15 | # Runtime data
 16 | 
 17 | pids
 18 | *.pid
 19 | *.seed
 20 | *.pid.lock
 21 | 
 22 | # Directory for instrumented libs generated by jscoverage/JSCover
 23 | 
 24 | lib-cov
 25 | 
 26 | # Coverage directory used by tools like istanbul
 27 | 
 28 | coverage
 29 | *.lcov
 30 | 
 31 | # nyc test coverage
 32 | 
 33 | .nyc_output
 34 | 
 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 36 | 
 37 | .grunt
 38 | 
 39 | # Bower dependency directory (https://bower.io/)
 40 | 
 41 | bower_components
 42 | 
 43 | # node-waf configuration
 44 | 
 45 | .lock-wscript
 46 | 
 47 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 48 | 
 49 | build/Release
 50 | 
 51 | # Dependency directories
 52 | 
 53 | node_modules/
 54 | jspm_packages/
 55 | 
 56 | # Snowpack dependency directory (https://snowpack.dev/)
 57 | 
 58 | web_modules/
 59 | 
 60 | # TypeScript cache
 61 | 
 62 | *.tsbuildinfo
 63 | 
 64 | # Optional npm cache directory
 65 | 
 66 | .npm
 67 | 
 68 | # Optional eslint cache
 69 | 
 70 | .eslintcache
 71 | 
 72 | # Optional stylelint cache
 73 | 
 74 | .stylelintcache
 75 | 
 76 | # Microbundle cache
 77 | 
 78 | .rpt2_cache/
 79 | .rts2_cache_cjs/
 80 | .rts2_cache_es/
 81 | .rts2_cache_umd/
 82 | 
 83 | # Optional REPL history
 84 | 
 85 | .node_repl_history
 86 | 
 87 | # Output of 'npm pack'
 88 | 
 89 | *.tgz
 90 | 
 91 | # Yarn Integrity file
 92 | 
 93 | .yarn-integrity
 94 | 
 95 | # dotenv environment variable files
 96 | 
 97 | .env
 98 | .env.development.local
 99 | .env.test.local
100 | .env.production.local
101 | .env.local
102 | 
103 | # parcel-bundler cache (https://parceljs.org/)
104 | 
105 | .cache
106 | .parcel-cache
107 | 
108 | # Next.js build output
109 | 
110 | .next
111 | out
112 | 
113 | # Nuxt.js build / generate output
114 | 
115 | .nuxt
116 | dist
117 | 
118 | # Gatsby files
119 | 
120 | .cache/
121 | 
122 | # Comment in the public line in if your project uses Gatsby and not Next.js
123 | 
124 | # https://nextjs.org/blog/next-9-1#public-directory-support
125 | 
126 | # public
127 | 
128 | # vuepress build output
129 | 
130 | .vuepress/dist
131 | 
132 | # vuepress v2.x temp and cache directory
133 | 
134 | .temp
135 | .cache
136 | 
137 | # Docusaurus cache and generated files
138 | 
139 | .docusaurus
140 | 
141 | # Serverless directories
142 | 
143 | .serverless/
144 | 
145 | # FuseBox cache
146 | 
147 | .fusebox/
148 | 
149 | # DynamoDB Local files
150 | 
151 | .dynamodb/
152 | 
153 | # TernJS port file
154 | 
155 | .tern-port
156 | 
157 | # Stores VSCode versions used for testing VSCode extensions
158 | 
159 | .vscode-test
160 | 
161 | # yarn v2
162 | 
163 | .yarn/cache
164 | .yarn/unplugged
165 | .yarn/build-state.yml
166 | .yarn/install-state.gz
167 | .pnp.*
168 | 
169 | # wrangler project
170 | 
171 | .dev.vars
172 | .wrangler/
173 | 


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | 	"printWidth": 140,
3 | 	"singleQuote": true,
4 | 	"semi": true,
5 | 	"useTabs": true
6 | }
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cloudflare DOM Distiller
 2 | 
 3 | This repository provides an API implementation for easily retrieving content from target web pages on Cloudflare Workers.
 4 | 
 5 | ## Features
 6 | 
 7 | - **Cloudflare Workers & Browser Rendering**: Utilizes Cloudflare Workers and browser rendering to fetch page information.
 8 | - **Readability**: Uses Readability to extract page content and remove unnecessary information.
 9 | - **DOM-Distiller**: When a request sets `useReadability: false`, dom-distiller is used instead of Readability to extract page content and remove unnecessary information.
10 | - **Turndown**: Converts the extracted HTML to Markdown format for better readability.
11 | 
12 | ## Example Usage
13 | 
14 | To run the API in development mode:
15 | 
16 | ```bash
17 | npx wrangler dev --remote
18 | ```
19 | 
20 | You can make a request to your local server and verify that the content of the target web page is converted to Markdown format:
21 | 
22 | ```bash
23 | $ curl -H 'Content-Type: application/json' \
24 |  -X POST http://localhost:8787/distill \
25 |  -d '{"url": "https://blog.samaltman.com/gpt-4o", "markdown": true}'
26 | 
27 | {"body":"There ... to the team that poured so much work into making this happen!"}
28 | ```
29 | 
30 | ## Endpoint: `/distill`
31 | 
32 | ### Request Format
33 | 
34 | - **url**: The URL of the target web page to fetch content from.
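35 | - **markdown**: Boolean value to indicate whether the content should be converted to Markdown format.
   | - **useReadability** (optional): Boolean value selecting the extractor. Defaults to `true` (Readability); set it to `false` to use dom-distiller instead.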
36 | 
37 | ### Response Format
38 | 
39 | - **body**: Returns the content of the web page.
40 | 
41 | ## References
42 | 
43 | - [mixmark\-io/turndown: 🛏 An HTML to Markdown converter written in JavaScript](https://github.com/mixmark-io/turndown)
44 | - [mozilla/readability: A standalone version of the readability lib](https://github.com/mozilla/readability)
45 | - [chromium/dom\-distiller: Distills the DOM](https://github.com/chromium/dom-distiller)
46 | - [Puppeteer · Browser Rendering docs](https://developers.cloudflare.com/browser-rendering/platform/puppeteer/)
47 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "cloudflare-dom-distiller",
 3 |   "version": "0.0.0",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "deploy": "wrangler deploy",
 7 |     "dev": "wrangler dev",
 8 |     "start": "wrangler dev",
 9 |     "test": "vitest",
10 |     "cf-typegen": "wrangler types"
11 |   },
12 |   "devDependencies": {
13 |     "@cloudflare/puppeteer": "^0.0.14",
14 |     "@cloudflare/vitest-pool-workers": "^0.1.0",
15 |     "@cloudflare/workers-types": "^4.20240605.0",
16 |     "@types/turndown": "^5.0.4",
17 |     "typescript": "^5.0.4",
18 |     "vitest": "1.3.0",
19 |     "wrangler": "^4.0.0"
20 |   },
21 |   "dependencies": {
22 |     "@hono/zod-validator": "^0.5.0",
23 |     "hono": "^4.5.8",
24 |     "turndown": "^7.2.0"
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 |   "extends": [
4 |     "config:recommended"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/src/distiller.ts:
--------------------------------------------------------------------------------
  1 | import puppeteer from '@cloudflare/puppeteer';
  2 | 
  3 | // @ts-ignore
  4 | import { readabilityJsBundle } from './third_party/readability/readability';
  5 | import { domdistillerJsBundle } from './third_party/dom-distiller/domdistiller';
  6 | import { turndownJsBundle } from './third_party/turndown-client/turndown';
  7 | import { turndownPluginGfmJsBundle } from './third_party/turndown-client/turndown-plugin-gfm';
  8 | // @ts-ignore
  9 | import { readabilityJsBundle } from './third_party/readability/readability';
 10 | 
 11 | /**
 12 |  * Render `url` in a Browser Rendering session and return its main content.
 13 |  *
 14 |  * @param browserWorker Browser Rendering binding used to obtain a session.
 15 |  * @param url Address passed to `page.goto`.
 16 |  * @param markdown When true, the extracted HTML is converted to Markdown inside the page.
 17 |  * @param useReadability When true extract with Readability, otherwise with DOM Distiller.
 18 |  * @returns The extracted HTML, or its Markdown rendering when `markdown` is true.
 19 |  */
 20 | export async function scrapeAndDistill(
 21 | 	browserWorker: puppeteer.BrowserWorker,
 22 | 	url: string,
 23 | 	markdown: boolean,
 24 | 	useReadability: boolean
 25 | ): Promise<string> {
 26 | 	const { browser } = await pickRandomSession(browserWorker);
 27 | 	try {
 28 | 		const page = await browser.newPage();
 29 | 		await page.goto(url, { waitUntil: 'networkidle2' });
 30 | 
 31 | 		// Extract the article HTML with the requested engine.
 32 | 		const content = useReadability ? await extractWithReadability(page) : await extractWithDomDistiller(page);
 33 | 		if (!markdown) {
 34 | 			return content;
 35 | 		}
 36 | 
 37 | 		// Inject Turndown and its GFM plugin; they are expected to define the globals used below.
 38 | 		await page.evaluate(turndownJsBundle);
 39 | 		await page.evaluate(turndownPluginGfmJsBundle);
 40 | 
 41 | 		// Hand the HTML over as an evaluate() argument instead of building a script string from it,
 42 | 		// so no source text is generated from page-derived data and no global is left in the page.
 43 | 		return await page.evaluate((html: string) => {
 44 | 			// @ts-ignore: TurndownService only exists inside the page
 45 | 			const turndownService = new TurndownService({
 46 | 				codeBlockStyle: 'fenced',
 47 | 				preformattedCode: true,
 48 | 			});
 49 | 			// @ts-ignore: turndownPluginGfm only exists inside the page
 50 | 			turndownService.use(turndownPluginGfm.gfm);
 51 | 
 52 | 			// https://github.com/mixmark-io/turndown/issues/192#issuecomment-1242819018
 53 | 			const languageClass = /highlight-source-[a-z]+/;
 54 | 			const getFirstTag = (node: any) => node.outerHTML.split('>').shift() + '>';
 55 | 			const getExt = (node: any) => {
 56 | 				// Simple match where the <pre> itself carries a `highlight-source-<lang>` marker.
 57 | 				const match = getFirstTag(node).match(languageClass);
 58 | 				if (match) return match[0].split('-').pop();
 59 | 				// Otherwise only look at the parent when the <pre> is its single child.
 60 | 				if (node.parentNode.childNodes.length !== 1) return '';
 61 | 				const parent = getFirstTag(node.parentNode).match(languageClass);
 62 | 				return parent ? parent[0].split('-').pop() : '';
 63 | 			};
 64 | 
 65 | 			turndownService.addRule('fenceAllPreformattedText', {
 66 | 				filter: ['pre'],
 67 | 				replacement: function (_content: string, node: any) {
 68 | 					const ext = getExt(node);
 69 | 					const code = [...node.childNodes].map((c) => c.textContent).join('');
 70 | 					return '\n```' + ext + '\n' + code + '\n```\n\n';
 71 | 				},
 72 | 			});
 73 | 			return turndownService.turndown(html);
 74 | 		}, content);
 75 | 	} finally {
 76 | 		await browser.close();
 77 | 	}
 78 | }
 79 | 
 80 | /**
 81 |  * Extract a page's main content with the injected DOM Distiller bundle.
 82 |  *
 83 |  * @param page Page on which the distiller bundle is evaluated and run.
 84 |  * @returns The value found at `[2][1]` of the distiller result.
 85 |  */
 86 | async function extractWithDomDistiller(page: puppeteer.Page) {
 87 | 	console.debug('Injecting DOM Distiller script');
 88 | 	await page.evaluate(domdistillerJsBundle);
 89 | 
 90 | 	console.debug('Running DOM Distiller');
 91 | 	// @ts-ignore: `org` only exists inside the page once the bundle has been evaluated
 92 | 	const result = await page.evaluate(() => org.chromium.distiller.DomDistiller.apply());
 93 | 	console.debug('Distilled content:', result);
 94 | 
 95 | 	// NOTE(review): [2][1] is presumably the distilled HTML -- confirm against the bundle's output format
 96 | 	return result[2][1];
 97 | }
 98 | 
 99 | /**
100 |  * Extract a page's main content with the injected Readability bundle.
101 |  *
102 |  * @param page Page on which the Readability bundle is evaluated and run.
103 |  * @returns The `content` of the article produced by `Readability.parse()`.
104 |  * @throws Error when `parse()` produced no article content for the page.
105 |  */
106 | async function extractWithReadability(page: puppeteer.Page) {
107 | 	console.debug('Injecting Readability script');
108 | 	await page.evaluate(readabilityJsBundle);
109 | 	console.debug('Running Readability');
110 | 	// @ts-ignore: `Readability` and `document` only exist inside the page
111 | 	const content = await page.evaluate(() => new Readability(document).parse()?.content);
112 | 	if (content == null) throw new Error('Readability could not extract an article from the page');
113 | 	return content;
114 | }
115 | 
116 | /**
117 |  * Pick a random session that currently has no worker connected to it.
118 |  * Other selection logic could be used instead, see
119 |  * https://developers.cloudflare.com/browser-rendering/get-started/reuse-sessions/
120 |  *
121 |  * @param endpoint Browser Rendering binding whose sessions are listed.
122 |  * @returns The id of one free session, or `undefined` when there is none.
123 |  */
124 | async function getRandomSession(endpoint: puppeteer.BrowserWorker): Promise<string | undefined> {
125 | 	const active: puppeteer.ActiveSession[] = await puppeteer.sessions(endpoint);
126 | 	console.log(`Sessions: ${JSON.stringify(active)}`);
127 | 
128 | 	const freeIds: string[] = [];
129 | 	for (const session of active) {
130 | 		// a connectionId means a worker is already attached to that session
131 | 		if (!session.connectionId) freeIds.push(session.sessionId);
132 | 	}
133 | 	if (freeIds.length === 0) return undefined;
134 | 
135 | 	return freeIds[Math.floor(Math.random() * freeIds.length)];
136 | }
137 | 
138 | /**
139 |  * Reuse a random free session when one can be connected to, otherwise launch a new one.
140 |  * @returns `browser`, plus `launched` which is `true` only when a new session was launched.
141 |  */
142 | async function pickRandomSession(browserWorker: puppeteer.BrowserWorker) {
143 | 	let browser, launched;
144 | 	const sessionId = await getRandomSession(browserWorker);
145 | 	try {
146 | 		if (sessionId) browser = await puppeteer.connect(browserWorker, sessionId);
147 | 	} catch (e) {
148 | 		// another worker may have connected to this session first
149 | 		console.log(`Failed to connect to ${sessionId}. Error ${e}`);
150 | 	}
151 | 	if (!browser) {
152 | 		// nothing reusable (no free session, or the connect failed): start a fresh session
153 | 		browser = await puppeteer.launch(browserWorker);
154 | 		launched = true;
155 | 	}
156 | 	return { browser, launched };
157 | }
158 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | import puppeteer, { BrowserWorker } from '@cloudflare/puppeteer';
 2 | 
 3 | import { Hono } from 'hono';
 4 | import { zValidator } from '@hono/zod-validator';
 5 | import { z } from 'zod';
 6 | // @ts-ignore
 7 | import { scrapeAndDistill } from './distiller';
 8 | 
 9 | // Body accepted by POST /distill. `url` is handed to the headless browser, so it
10 | // must be an absolute http(s) URL; anything else fails validation.
11 | const DistillRequestSchema = z.object({
12 | 	url: z.string().url().regex(/^https?:\/\//i, 'url must use http or https'),
13 | 	markdown: z.boolean(),
14 | 	useReadability: z.boolean().optional(), // the route treats a missing value as true
15 | });
16 | type Request = z.infer<typeof DistillRequestSchema>;
17 | 
18 | // Body returned by POST /distill.
19 | const DistillResponseSchema = z.object({ body: z.string() });
20 | type Response = z.infer<typeof DistillResponseSchema>;
21 | 
22 | // Bindings this Worker reads from `c.env`.
23 | type Bindings = {
24 | 	MYBROWSER: BrowserWorker; // passed to puppeteer to list, connect to and launch sessions
25 | 	SERVICE_API_KEY?: string; // when set, requests must present it as a Bearer token
26 | };
27 | const app = new Hono<{ Bindings: Bindings }>();
28 | 
29 | // Optional bearer-token gate: enforced only when SERVICE_API_KEY is configured.
30 | app.use(async (c, next) => {
31 | 	const expectedKey = c.env.SERVICE_API_KEY;
32 | 	if (!expectedKey) {
33 | 		// no key configured, so every request is let through
34 | 		return await next();
35 | 	}
36 | 
37 | 	const unauthorized = (message: string) => c.text(message, { status: 401 });
38 | 	const header = c.req.header('Authorization');
39 | 	if (!header) {
40 | 		return unauthorized('Authorization header is missing');
41 | 	}
42 | 
43 | 	// the header is split on spaces into "<scheme> <credentials>"
44 | 	const [scheme, token] = header.split(' ');
45 | 	if (scheme !== 'Bearer') {
46 | 		return unauthorized('Invalid authorization type');
47 | 	}
48 | 	if (token !== expectedKey) {
49 | 		return unauthorized('Invalid API key');
50 | 	}
51 | 
52 | 	return await next();
53 | });
54 | 
55 | // POST /distill: render the requested page and respond with its distilled content.
56 | app.post('/distill', zValidator('json', DistillRequestSchema), async (c) => {
57 | 	const payload = c.req.valid('json');
58 | 	const browserWorker = c.env.MYBROWSER;
59 | 
60 | 	// Answer 429 while no browser can be acquired, see
61 | 	// https://github.com/cloudflare/puppeteer/blob/808f08afdd25ee49a267479f05eecd0a1b3edf0a/src/puppeteer-core.ts#L86
62 | 	const { allowedBrowserAcquisitions, timeUntilNextAllowedBrowserAcquisition } = await puppeteer.limits(browserWorker);
63 | 	if (allowedBrowserAcquisitions < 1) {
64 | 		// NOTE(review): Retry-After is expressed in seconds -- confirm the unit of this limit value
65 | 		const headers = { 'Retry-After': timeUntilNextAllowedBrowserAcquisition.toString() };
66 | 		return c.text('The browser worker is busy', 429, headers);
67 | 	}
68 | 
69 | 	// Readability is the extractor unless the caller explicitly opts out.
70 | 	const distilled = await scrapeAndDistill(
71 | 		browserWorker,
72 | 		payload.url,
73 | 		payload.markdown,
74 | 		payload.useReadability ?? true
75 | 	);
76 | 
77 | 	const res: Response = { body: distilled };
78 | 
79 | 	return c.json(res);
80 | });
81 | 
82 | export default app;
83 | 


--------------------------------------------------------------------------------
/src/third_party/dom-distiller/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # dom-distiller
3 | 
4 | This directory contains domdistiller.ts, which exports the dom-distiller script bundle (`domdistillerJsBundle`) generated using the repository <https://github.com/ainoya/dom-distiller>, a fork of <https://github.com/chromium/dom-distiller>.
5 | The license follows the original repository from which it was forked.
6 | 


--------------------------------------------------------------------------------
/src/third_party/readability/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010 Arc90 Inc
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |    http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 


--------------------------------------------------------------------------------
/src/third_party/readability/readability.ts:
--------------------------------------------------------------------------------
   1 | export const readabilityJsBundle=`/*
   2 |  * Copyright (c) 2010 Arc90 Inc
   3 |  *
   4 |  * Licensed under the Apache License, Version 2.0 (the "License");
   5 |  * you may not use this file except in compliance with the License.
   6 |  * You may obtain a copy of the License at
   7 |  *
   8 |  *     http://www.apache.org/licenses/LICENSE-2.0
   9 |  *
  10 |  * Unless required by applicable law or agreed to in writing, software
  11 |  * distributed under the License is distributed on an "AS IS" BASIS,
  12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 |  * See the License for the specific language governing permissions and
  14 |  * limitations under the License.
  15 |  */
  16 | 
  17 | /*
  18 |  * This code is heavily based on Arc90's readability.js (1.7.1) script
  19 |  * available at: http://code.google.com/p/arc90labs-readability
  20 |  */
  21 | 
  22 | /**
  23 |  * Public constructor.
  24 |  * @param {HTMLDocument} doc     The document to parse.
  25 |  * @param {Object}       options The options object.
  26 |  */
  27 | function Readability(doc, options) {
  28 |   // In some older versions, people passed a URI as the first argument. Cope:
  29 |   if (options && options.documentElement) {
  30 |     doc = options;
  31 |     options = arguments[2];
  32 |   } else if (!doc || !doc.documentElement) {
  33 |     throw new Error("First argument to Readability constructor should be a document object.");
  34 |   }
  35 |   options = options || {};
  36 | 
  37 |   this._doc = doc;
  38 |   this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  39 |   this._articleTitle = null;
  40 |   this._articleByline = null;
  41 |   this._articleDir = null;
  42 |   this._articleSiteName = null;
  43 |   this._attempts = [];
  44 | 
  45 |   // Configurable options
  46 |   this._debug = !!options.debug;
  47 |   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  48 |   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  49 |   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  50 |   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  51 |   this._keepClasses = !!options.keepClasses;
  52 |   this._serializer = options.serializer || function(el) {
  53 |     return el.innerHTML;
  54 |   };
  55 |   this._disableJSONLD = !!options.disableJSONLD;
  56 |   this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  57 |   this._linkDensityModifier = options.linkDensityModifier || 0;
  58 | 
  59 |   // Start with all flags set
  60 |   this._flags = this.FLAG_STRIP_UNLIKELYS |
  61 |                 this.FLAG_WEIGHT_CLASSES |
  62 |                 this.FLAG_CLEAN_CONDITIONALLY;
  63 | 
  64 | 
  65 |   // Control whether log messages are sent to the console
  66 |   if (this._debug) {
  67 |     let logNode = function(node) {
  68 |       if (node.nodeType == node.TEXT_NODE) {
  69 |         return \`\${node.nodeName} ("\${node.textContent}")\`;
  70 |       }
  71 |       let attrPairs = Array.from(node.attributes || [], function(attr) {
  72 |         return \`\${attr.name}="\${attr.value}"\`;
  73 |       }).join(" ");
  74 |       return \`<\${node.localName} \${attrPairs}>\`;
  75 |     };
  76 |     this.log = function () {
  77 |       if (typeof console !== "undefined") {
  78 |         let args = Array.from(arguments, arg => {
  79 |           if (arg && arg.nodeType == this.ELEMENT_NODE) {
  80 |             return logNode(arg);
  81 |           }
  82 |           return arg;
  83 |         });
  84 |         args.unshift("Reader: (Readability)");
  85 |         console.log.apply(console, args);
  86 |       } else if (typeof dump !== "undefined") {
  87 |         /* global dump */
  88 |         var msg = Array.prototype.map.call(arguments, function(x) {
  89 |           return (x && x.nodeName) ? logNode(x) : x;
  90 |         }).join(" ");
  91 |         dump("Reader: (Readability) " + msg + "\\n");
  92 |       }
  93 |     };
  94 |   } else {
  95 |     this.log = function () {};
  96 |   }
  97 | }
  98 | 
  99 | Readability.prototype = {
 100 |   FLAG_STRIP_UNLIKELYS: 0x1,
 101 |   FLAG_WEIGHT_CLASSES: 0x2,
 102 |   FLAG_CLEAN_CONDITIONALLY: 0x4,
 103 | 
 104 |   // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
 105 |   ELEMENT_NODE: 1,
 106 |   TEXT_NODE: 3,
 107 | 
 108 |   // Max number of nodes supported by this parser. Default: 0 (no limit)
 109 |   DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 110 | 
 111 |   // The number of top candidates to consider when analysing how
 112 |   // tight the competition is among candidates.
 113 |   DEFAULT_N_TOP_CANDIDATES: 5,
 114 | 
 115 |   // Element tags to score by default.
 116 |   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 117 | 
 118 |   // The default number of chars an article must have in order to return a result
 119 |   DEFAULT_CHAR_THRESHOLD: 500,
 120 | 
 121 |   // All of the regular expressions in use within readability.
 122 |   // Defined up here so we don't instantiate them repeatedly in loops.
 123 |   REGEXPS: {
 124 |     // NOTE: These two regular expressions are duplicated in
 125 |     // Readability-readerable.js. Please keep both copies in sync.
 126 |     unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
 127 |     okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
 128 | 
 129 |     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
 130 |     negative: /-ad-|hidden|^hid\$| hid\$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
 131 |     extraneous: /print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,
 132 |     byline: /byline|author|dateline|writtenby|p-author/i,
 133 |     replaceFonts: /<(\\/?)font[^>]*>/gi,
 134 |     normalize: /\\s{2,}/g,
 135 |     videos: /\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,
 136 |     shareElements: /(\\b|_)(share|sharedaddy)(\\b|_)/i,
 137 |     nextLink: /(next|weiter|continue|>([^\\|]|\$)|»([^\\|]|\$))/i,
 138 |     prevLink: /(prev|earl|old|new|<|«)/i,
 139 |     tokenize: /\\W+/g,
 140 |     whitespace: /^\\s*\$/,
 141 |     hasContent: /\\S\$/,
 142 |     hashUrl: /^#.+/,
 143 |     srcsetUrl: /(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|\$))/g,
 144 |     b64DataUrl: /^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,
 145 |     // Commas as used in Latin, Sindhi, Chinese and various other scripts.
 146 |     // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
 147 |     commas: /\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,
 148 |     // See: https://schema.org/Article
 149 |     jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference\$/,
 150 |     // used to see if a node's content matches words commonly used for ad blocks or loading indicators
 151 |     adWords: /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)\$/iu,
 152 |     loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\\.\\.\\.)?)\$/iu,
 153 |   },
 154 | 
 155 |   UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
 156 | 
 157 |   DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),
 158 | 
 159 |   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 160 | 
 161 |   PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
 162 | 
 163 |   DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
 164 | 
 165 |   // The commented out elements qualify as phrasing content but tend to be
 166 |   // removed by readability when put into paragraphs, so we ignore them here.
 167 |   PHRASING_ELEMS: [
 168 |     // "CANVAS", "IFRAME", "SVG", "VIDEO",
 169 |     "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
 170 |     "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
 171 |     "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
 172 |     "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
 173 |     "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
 174 |   ],
 175 | 
 176 |   // These are the classes that readability sets itself.
 177 |   CLASSES_TO_PRESERVE: [ "page" ],
 178 | 
 179 |   // These are the list of HTML entities that need to be escaped.
 180 |   HTML_ESCAPE_MAP: {
 181 |     "lt": "<",
 182 |     "gt": ">",
 183 |     "amp": "&",
 184 |     "quot": '"',
 185 |     "apos": "'",
 186 |   },
 187 | 
 188 |   /**
 189 |    * Run any post-process modifications to article content as necessary.
 190 |    *
 191 |    * @param Element
 192 |    * @return void
 193 |   **/
 194 |   _postProcessContent: function(articleContent) {
 195 |     // Readability cannot open relative uris so we convert them to absolute uris.
 196 |     this._fixRelativeUris(articleContent);
 197 | 
 198 |     this._simplifyNestedElements(articleContent);
 199 | 
 200 |     if (!this._keepClasses) {
 201 |       // Remove classes.
 202 |       this._cleanClasses(articleContent);
 203 |     }
 204 |   },
 205 | 
 206 |   /**
 207 |    * Iterates over a NodeList, calls \`filterFn\` for each node and removes node
 208 |    * if function returned \`true\`.
 209 |    *
 210 |    * If function is not passed, removes all the nodes in node list.
 211 |    *
 212 |    * @param NodeList nodeList The nodes to operate on
 213 |    * @param Function filterFn the function to use as a filter
 214 |    * @return void
 215 |    */
 216 |   _removeNodes: function(nodeList, filterFn) {
 217 |     // Avoid ever operating on live node lists.
 218 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 219 |       throw new Error("Do not pass live node lists to _removeNodes");
 220 |     }
 221 |     for (var i = nodeList.length - 1; i >= 0; i--) {
 222 |       var node = nodeList[i];
 223 |       var parentNode = node.parentNode;
 224 |       if (parentNode) {
 225 |         if (!filterFn || filterFn.call(this, node, i, nodeList)) {
 226 |           parentNode.removeChild(node);
 227 |         }
 228 |       }
 229 |     }
 230 |   },
 231 | 
 232 |   /**
 233 |    * Iterates over a NodeList, and calls _setNodeTag for each node.
 234 |    *
 235 |    * @param NodeList nodeList The nodes to operate on
 236 |    * @param String newTagName the new tag name to use
 237 |    * @return void
 238 |    */
 239 |   _replaceNodeTags: function(nodeList, newTagName) {
 240 |     // Avoid ever operating on live node lists.
 241 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 242 |       throw new Error("Do not pass live node lists to _replaceNodeTags");
 243 |     }
 244 |     for (const node of nodeList) {
 245 |       this._setNodeTag(node, newTagName);
 246 |     }
 247 |   },
 248 | 
 249 |   /**
 250 |    * Iterate over a NodeList, which doesn't natively fully implement the Array
 251 |    * interface.
 252 |    *
 253 |    * For convenience, the current object context is applied to the provided
 254 |    * iterate function.
 255 |    *
 256 |    * @param  NodeList nodeList The NodeList.
 257 |    * @param  Function fn       The iterate function.
 258 |    * @return void
 259 |    */
 260 |   _forEachNode: function(nodeList, fn) {
 261 |     Array.prototype.forEach.call(nodeList, fn, this);
 262 |   },
 263 | 
 264 |   /**
 265 |    * Iterate over a NodeList, and return the first node that passes
 266 |    * the supplied test function
 267 |    *
 268 |    * For convenience, the current object context is applied to the provided
 269 |    * test function.
 270 |    *
 271 |    * @param  NodeList nodeList The NodeList.
 272 |    * @param  Function fn       The test function.
 273 |    * @return void
 274 |    */
 275 |   _findNode: function(nodeList, fn) {
 276 |     return Array.prototype.find.call(nodeList, fn, this);
 277 |   },
 278 | 
 279 |   /**
 280 |    * Iterate over a NodeList, return true if any of the provided iterate
 281 |    * function calls returns true, false otherwise.
 282 |    *
 283 |    * For convenience, the current object context is applied to the
 284 |    * provided iterate function.
 285 |    *
 286 |    * @param  NodeList nodeList The NodeList.
 287 |    * @param  Function fn       The iterate function.
 288 |    * @return Boolean
 289 |    */
 290 |   _someNode: function(nodeList, fn) {
 291 |     return Array.prototype.some.call(nodeList, fn, this);
 292 |   },
 293 | 
 294 |   /**
 295 |    * Iterate over a NodeList, return true if all of the provided iterate
 296 |    * function calls return true, false otherwise.
 297 |    *
 298 |    * For convenience, the current object context is applied to the
 299 |    * provided iterate function.
 300 |    *
 301 |    * @param  NodeList nodeList The NodeList.
 302 |    * @param  Function fn       The iterate function.
 303 |    * @return Boolean
 304 |    */
 305 |   _everyNode: function(nodeList, fn) {
 306 |     return Array.prototype.every.call(nodeList, fn, this);
 307 |   },
 308 | 
 309 |   /**
 310 |    * Concat all nodelists passed as arguments.
 311 |    *
 312 |    * @return ...NodeList
 313 |    * @return Array
 314 |    */
 315 |   _concatNodeLists: function() {
 316 |     var slice = Array.prototype.slice;
 317 |     var args = slice.call(arguments);
 318 |     var nodeLists = args.map(function(list) {
 319 |       return slice.call(list);
 320 |     });
 321 |     return Array.prototype.concat.apply([], nodeLists);
 322 |   },
 323 | 
 324 |   _getAllNodesWithTag: function(node, tagNames) {
 325 |     if (node.querySelectorAll) {
 326 |       return node.querySelectorAll(tagNames.join(","));
 327 |     }
 328 |     return [].concat.apply([], tagNames.map(function(tag) {
 329 |       var collection = node.getElementsByTagName(tag);
 330 |       return Array.isArray(collection) ? collection : Array.from(collection);
 331 |     }));
 332 |   },
 333 | 
 334 |   /**
 335 |    * Removes the class="" attribute from every element in the given
 336 |    * subtree, except those that match CLASSES_TO_PRESERVE and
 337 |    * the classesToPreserve array from the options object.
 338 |    *
 339 |    * @param Element
 340 |    * @return void
 341 |    */
 342 |   _cleanClasses: function(node) {
 343 |     var classesToPreserve = this._classesToPreserve;
 344 |     var className = (node.getAttribute("class") || "")
 345 |       .split(/\\s+/)
 346 |       .filter(function(cls) {
 347 |         return classesToPreserve.indexOf(cls) != -1;
 348 |       })
 349 |       .join(" ");
 350 | 
 351 |     if (className) {
 352 |       node.setAttribute("class", className);
 353 |     } else {
 354 |       node.removeAttribute("class");
 355 |     }
 356 | 
 357 |     for (node = node.firstElementChild; node; node = node.nextElementSibling) {
 358 |       this._cleanClasses(node);
 359 |     }
 360 |   },
 361 | 
 362 |   /**
 363 |    * Rewrites link hrefs and media src/poster/srcset URIs inside the given
 364 |    * element as absolute URIs, and unwraps links that use javascript: URIs.
 365 |    *
 366 |    * @param Element articleContent subtree whose URIs are rewritten in place
 367 |    * @return void
 368 |    */
 369 |   _fixRelativeUris: function(articleContent) {
 370 |     var baseURI = this._doc.baseURI;
 371 |     var documentURI = this._doc.documentURI;
 372 |     function toAbsoluteURI(uri) {  // resolves one URI against baseURI
 373 |       // Leave hash links alone if the base URI matches the document URI:
 374 |       if (baseURI == documentURI && uri.charAt(0) == "#") {
 375 |         return uri;
 376 |       }
 377 | 
 378 |       // Otherwise, resolve against base URI:
 379 |       try {
 380 |         return new URL(uri, baseURI).href;
 381 |       } catch (ex) {
 382 |         // The URL constructor rejected the input; fall through and return it as-is.
 383 |       }
 384 |       return uri;
 385 |     }
 386 | 
 387 |     var links = this._getAllNodesWithTag(articleContent, ["a"]);
 388 |     this._forEachNode(links, function(link) {
 389 |       var href = link.getAttribute("href");
 390 |       if (href) {  // links with a missing or empty href are left untouched
 391 |         // Remove links with javascript: URIs, since
 392 |         // they won't work after scripts have been removed from the page.
 393 |         if (href.indexOf("javascript:") === 0) {
 394 |           // If the link's only child is a text node, replace the link with a plain text node.
 395 |           if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
 396 |             var text = this._doc.createTextNode(link.textContent);
 397 |             link.parentNode.replaceChild(text, link);
 398 |           } else {
 399 |             // Otherwise move all of the link's children into a <span> that takes its place.
 400 |             var container = this._doc.createElement("span");
 401 |             while (link.firstChild) {
 402 |               container.appendChild(link.firstChild);
 403 |             }
 404 |             link.parentNode.replaceChild(container, link);
 405 |           }
 406 |         } else {
 407 |           link.setAttribute("href", toAbsoluteURI(href));
 408 |         }
 409 |       }
 410 |     });
 411 | 
 412 |     var medias = this._getAllNodesWithTag(articleContent, [
 413 |       "img", "picture", "figure", "video", "audio", "source",
 414 |     ]);
 415 | 
 416 |     this._forEachNode(medias, function(media) {
 417 |       var src = media.getAttribute("src");
 418 |       var poster = media.getAttribute("poster");
 419 |       var srcset = media.getAttribute("srcset");
 420 |       // Each attribute is rewritten only when it is present and non-empty.
 421 |       if (src) {
 422 |         media.setAttribute("src", toAbsoluteURI(src));
 423 |       }
 424 | 
 425 |       if (poster) {
 426 |         media.setAttribute("poster", toAbsoluteURI(poster));
 427 |       }
 428 | 
 429 |       if (srcset) {  // p1 is made absolute; p2 (may be missing) and p3 are re-appended unchanged
 430 |         var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
 431 |           return toAbsoluteURI(p1) + (p2 || "") + p3;
 432 |         });
 433 | 
 434 |         media.setAttribute("srcset", newSrcset);
 435 |       }
 436 |     });
 437 |   },
 438 | 
 439 |   _simplifyNestedElements: function(articleContent) {  // collapses empty or single-child DIV/SECTION wrappers
 440 |     var node = articleContent;
 441 |     // Depth-first walk; only attached DIV/SECTION nodes whose id does not start with "readability" are touched.
 442 |     while (node) {
 443 |       if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
 444 |         if (this._isElementWithoutContent(node)) {
 445 |           node = this._removeAndGetNext(node);  // drop the empty wrapper, resume after it
 446 |           continue;
 447 |         } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
 448 |           var child = node.children[0];
 449 |           for (var i = 0; i < node.attributes.length; i++) {  // copy the wrapper's attributes onto its only child
 450 |             child.setAttribute(node.attributes[i].name, node.attributes[i].value);
 451 |           }
 452 |           node.parentNode.replaceChild(child, node);  // the child takes the wrapper's place
 453 |           node = child;  // examine the promoted child itself on the next pass
 454 |           continue;
 455 |         }
 456 |       }
 457 | 
 458 |       node = this._getNextNode(node);
 459 |     }
 460 |   },
 461 | 
 462 |   /**
 463 |    * Get the article title as an H1.
 464 |    *
 465 |    * @return string
 466 |    **/
 467 |   _getArticleTitle: function() {
 468 |     var doc = this._doc;
 469 |     var curTitle = "";
 470 |     var origTitle = "";
 471 | 
 472 |     try {
 473 |       curTitle = origTitle = doc.title.trim();
 474 | 
 475 |       // If they had an element with id "title" in their HTML
 476 |       if (typeof curTitle !== "string")
 477 |         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
 478 |     } catch (e) {/* ignore exceptions setting the title. */}
 479 | 
 480 |     var titleHadHierarchicalSeparators = false;
 481 |     function wordCount(str) {
 482 |       return str.split(/\\s+/).length;
 483 |     }
 484 | 
 485 |     // If there's a separator in the title, first remove the final part
 486 |     if ((/ [\\|\\-\\\\\\/>»] /).test(curTitle)) {
 487 |       titleHadHierarchicalSeparators = / [\\\\\\/>»] /.test(curTitle);
 488 |       curTitle = origTitle.replace(/(.*)[\\|\\-\\\\\\/>»] .*/gi, "\$1");
 489 | 
 490 |       // If the resulting title is too short (3 words or fewer), remove
 491 |       // the first part instead:
 492 |       if (wordCount(curTitle) < 3)
 493 |         curTitle = origTitle.replace(/[^\\|\\-\\\\\\/>»]*[\\|\\-\\\\\\/>»](.*)/gi, "\$1");
 494 |     } else if (curTitle.indexOf(": ") !== -1) {
 495 |       // Check if we have an heading containing this exact string, so we
 496 |       // could assume it's the full title.
 497 |       var headings = this._concatNodeLists(
 498 |         doc.getElementsByTagName("h1"),
 499 |         doc.getElementsByTagName("h2")
 500 |       );
 501 |       var trimmedTitle = curTitle.trim();
 502 |       var match = this._someNode(headings, function(heading) {
 503 |         return heading.textContent.trim() === trimmedTitle;
 504 |       });
 505 | 
 506 |       // If we don't, let's extract the title out of the original title string.
 507 |       if (!match) {
 508 |         curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
 509 | 
 510 |         // If the title is now too short, try the first colon instead:
 511 |         if (wordCount(curTitle) < 3) {
 512 |           curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
 513 |           // But if we have too many words before the colon there's something weird
 514 |           // with the titles and the H tags so let's just use the original title instead
 515 |         } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
 516 |           curTitle = origTitle;
 517 |         }
 518 |       }
 519 |     } else if (curTitle.length > 150 || curTitle.length < 15) {
 520 |       var hOnes = doc.getElementsByTagName("h1");
 521 | 
 522 |       if (hOnes.length === 1)
 523 |         curTitle = this._getInnerText(hOnes[0]);
 524 |     }
 525 | 
 526 |     curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
 527 |     // If we now have 4 words or fewer as our title, and either no
 528 |     // 'hierarchical' separators (\\, /, > or ») were found in the original
 529 |     // title or we decreased the number of words by more than 1 word, use
 530 |     // the original title.
 531 |     var curTitleWordCount = wordCount(curTitle);
 532 |     if (curTitleWordCount <= 4 &&
 533 |         (!titleHadHierarchicalSeparators ||
 534 |          curTitleWordCount != wordCount(origTitle.replace(/[\\|\\-\\\\\\/>»]+/g, "")) - 1)) {
 535 |       curTitle = origTitle;
 536 |     }
 537 | 
 538 |     return curTitle;
 539 |   },
 540 | 
 541 |   /**
 542 |    * Prepare the HTML document for readability to scrape it.
 543 |    * This includes things like stripping javascript, CSS, and handling terrible markup.
 544 |    *
 545 |    * @return void
 546 |    **/
 547 |   _prepDocument: function() {
 548 |     var doc = this._doc;
 549 | 
 550 |     // Remove all style tags in head
 551 |     this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
 552 | 
 553 |     if (doc.body) {
 554 |       this._replaceBrs(doc.body);
 555 |     }
 556 | 
 557 |     this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
 558 |   },
 559 | 
 560 |   /**
 561 |    * Finds the next node, starting from the given node, and ignoring
 562 |    * whitespace in between. If the given node is an element, the same node is
 563 |    * returned.
 564 |    */
 565 |   _nextNode: function (node) {
 566 |     var next = node;
 567 |     while (next
 568 |         && (next.nodeType != this.ELEMENT_NODE)
 569 |         && this.REGEXPS.whitespace.test(next.textContent)) {
 570 |       next = next.nextSibling;
 571 |     }
 572 |     return next;
 573 |   },
 574 | 
 575 |   /**
 576 |    * Replaces 2 or more successive <br> elements with a single <p>.
 577 |    * Whitespace between <br> elements is ignored. For example:
 578 |    *   <div>foo<br>bar<br> <br><br>abc</div>
 579 |    * will become:
 580 |    *   <div>foo<br>bar<p>abc</p></div>
 581 |    */
 582 |   _replaceBrs: function (elem) {  // elem: root of the subtree scanned for <br> chains
 583 |     this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
 584 |       var next = br.nextSibling;
 585 | 
 586 |       // Whether 2 or more <br> elements have been found and replaced with a
 587 |       // <p> block.
 588 |       var replaced = false;
 589 | 
 590 |       // If we find a <br> chain, remove the <br>s until we hit another node
 591 |       // or non-whitespace. This leaves behind the first <br> in the chain
 592 |       // (which will be replaced with a <p> later).
 593 |       while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
 594 |         replaced = true;
 595 |         var brSibling = next.nextSibling;
 596 |         next.parentNode.removeChild(next);
 597 |         next = brSibling;
 598 |       }
 599 | 
 600 |       // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 601 |       // all sibling nodes as children of the <p> until we hit another <br>
 602 |       // chain.
 603 |       if (replaced) {
 604 |         var p = this._doc.createElement("p");
 605 |         br.parentNode.replaceChild(p, br);
 606 | 
 607 |         next = p.nextSibling;
 608 |         while (next) {
 609 |           // If we've hit another <br><br>, we're done adding children to this <p>.
 610 |           if (next.tagName == "BR") {
 611 |             var nextElem = this._nextNode(next.nextSibling);
 612 |             if (nextElem && nextElem.tagName == "BR")
 613 |               break;
 614 |           }
 615 |           // Anything that is not phrasing content also ends the paragraph.
 616 |           if (!this._isPhrasingContent(next))
 617 |             break;
 618 | 
 619 |           // Otherwise, make this node a child of the new <p>.
 620 |           var sibling = next.nextSibling;
 621 |           p.appendChild(next);
 622 |           next = sibling;
 623 |         }
 624 |         // Drop trailing whitespace nodes from the new <p>.
 625 |         while (p.lastChild && this._isWhitespace(p.lastChild)) {
 626 |           p.removeChild(p.lastChild);
 627 |         }
 628 |         // If the new <p> ended up directly inside a <p>, retag that parent as DIV.
 629 |         if (p.parentNode.tagName === "P")
 630 |           this._setNodeTag(p.parentNode, "DIV");
 631 |       }
 632 |     });
 633 |   },
 634 | 
 635 |   _setNodeTag: function (node, tag) {
 636 |     this.log("_setNodeTag", node, tag);
 637 |     if (this._docJSDOMParser) {
 638 |       node.localName = tag.toLowerCase();
 639 |       node.tagName = tag.toUpperCase();
 640 |       return node;
 641 |     }
 642 | 
 643 |     var replacement = node.ownerDocument.createElement(tag);
 644 |     while (node.firstChild) {
 645 |       replacement.appendChild(node.firstChild);
 646 |     }
 647 |     node.parentNode.replaceChild(replacement, node);
 648 |     if (node.readability)
 649 |       replacement.readability = node.readability;
 650 | 
 651 |     for (var i = 0; i < node.attributes.length; i++) {
 652 |       try {
 653 |         replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
 654 |       } catch (ex) {
 655 |         /* it's possible for setAttribute() to throw if the attribute name
 656 |          * isn't a valid XML Name. Such attributes can however be parsed from
 657 |          * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
 658 |          * so we can hit them here and then throw. We don't care about such
 659 |          * attributes so we ignore them.
 660 |          */
 661 |       }
 662 |     }
 663 |     return replacement;
 664 |   },
 665 | 
 666 |   /**
 667 |    * Prepare the article node for display. Clean out any inline styles,
 668 |    * iframes, forms, strip extraneous <p> tags, etc.
 669 |    *
 670 |    * @param Element articleContent the article subtree, cleaned in place
 671 |    * @return void
 672 |    **/
 673 |   _prepArticle: function(articleContent) {
 674 |     this._cleanStyles(articleContent);
 675 | 
 676 |     // Check for data tables before we continue, to avoid removing items in
 677 |     // those tables, which will often be isolated even though they're
 678 |     // visually linked to other content-ful elements (text, images, etc.).
 679 |     this._markDataTables(articleContent);
 680 | 
 681 |     this._fixLazyImages(articleContent);
 682 | 
 683 |     // Clean out junk from the article content
 684 |     this._cleanConditionally(articleContent, "form");
 685 |     this._cleanConditionally(articleContent, "fieldset");
 686 |     this._clean(articleContent, "object");
 687 |     this._clean(articleContent, "embed");
 688 |     this._clean(articleContent, "footer");
 689 |     this._clean(articleContent, "link");
 690 |     this._clean(articleContent, "aside");
 691 | 
 692 |     // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
 693 |     // which means we don't remove the top candidates even if they have "share".
 694 |     // "Little content" here means text shorter than DEFAULT_CHAR_THRESHOLD characters.
 695 |     var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
 696 | 
 697 |     this._forEachNode(articleContent.children, function (topCandidate) {
 698 |       this._cleanMatchedNodes(topCandidate, function (node, matchString) {
 699 |         return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
 700 |       });
 701 |     });
 702 | 
 703 |     this._clean(articleContent, "iframe");
 704 |     this._clean(articleContent, "input");
 705 |     this._clean(articleContent, "textarea");
 706 |     this._clean(articleContent, "select");
 707 |     this._clean(articleContent, "button");
 708 |     this._cleanHeaders(articleContent);
 709 | 
 710 |     // Do these last as the previous stuff may have removed junk
 711 |     // that will affect these
 712 |     this._cleanConditionally(articleContent, "table");
 713 |     this._cleanConditionally(articleContent, "ul");
 714 |     this._cleanConditionally(articleContent, "div");
 715 | 
 716 |     // Replace H1 with H2, as the only H1 should be the title, which is displayed separately.
 717 |     this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");
 718 | 
 719 |     // Remove paragraphs that hold no img/embed/object/iframe and no text.
 720 |     this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
 721 |       var imgCount = paragraph.getElementsByTagName("img").length;
 722 |       var embedCount = paragraph.getElementsByTagName("embed").length;
 723 |       var objectCount = paragraph.getElementsByTagName("object").length;
 724 |       // At this point, nasty iframes have been removed; only embedded video ones remain.
 725 |       var iframeCount = paragraph.getElementsByTagName("iframe").length;
 726 |       var totalCount = imgCount + embedCount + objectCount + iframeCount;
 727 | 
 728 |       return totalCount === 0 && !this._getInnerText(paragraph, false);
 729 |     });
 730 |     // Remove any <br> whose next non-whitespace sibling is a <p>.
 731 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
 732 |       var next = this._nextNode(br.nextSibling);
 733 |       if (next && next.tagName == "P")
 734 |         br.parentNode.removeChild(br);
 735 |     });
 736 | 
 737 |     // Replace single-cell tables with their cell, retagged as P (phrasing content only) or DIV.
 738 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
 739 |       var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
 740 |       if (this._hasSingleTagInsideElement(tbody, "TR")) {
 741 |         var row = tbody.firstElementChild;
 742 |         if (this._hasSingleTagInsideElement(row, "TD")) {
 743 |           var cell = row.firstElementChild;
 744 |           cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
 745 |           table.parentNode.replaceChild(cell, table);
 746 |         }
 747 |       }
 748 |     });
 749 |   },
 750 | 
 751 |   /**
 752 |    * Initialize a node with the readability object. Also checks the
 753 |    * className/id for special names to add to its score.
 754 |    *
 755 |    * @param Element
 756 |    * @return void
 757 |   **/
 758 |   _initializeNode: function(node) {
 759 |     node.readability = {"contentScore": 0};
 760 | 
 761 |     switch (node.tagName) {
 762 |       case "DIV":
 763 |         node.readability.contentScore += 5;
 764 |         break;
 765 | 
 766 |       case "PRE":
 767 |       case "TD":
 768 |       case "BLOCKQUOTE":
 769 |         node.readability.contentScore += 3;
 770 |         break;
 771 | 
 772 |       case "ADDRESS":
 773 |       case "OL":
 774 |       case "UL":
 775 |       case "DL":
 776 |       case "DD":
 777 |       case "DT":
 778 |       case "LI":
 779 |       case "FORM":
 780 |         node.readability.contentScore -= 3;
 781 |         break;
 782 | 
 783 |       case "H1":
 784 |       case "H2":
 785 |       case "H3":
 786 |       case "H4":
 787 |       case "H5":
 788 |       case "H6":
 789 |       case "TH":
 790 |         node.readability.contentScore -= 5;
 791 |         break;
 792 |     }
 793 | 
 794 |     node.readability.contentScore += this._getClassWeight(node);
 795 |   },
 796 | 
 797 |   _removeAndGetNext: function(node) {
 798 |     var nextNode = this._getNextNode(node, true);
 799 |     node.parentNode.removeChild(node);
 800 |     return nextNode;
 801 |   },
 802 | 
 803 |   /**
 804 |    * Traverse the DOM from node to node, starting at the node passed in.
 805 |    * Pass true for the second parameter to indicate this node itself
 806 |    * (and its kids) are going away, and we want the next node over.
 807 |    *
 808 |    * Calling this in a loop will traverse the DOM depth-first.
 809 |    */
 810 |   _getNextNode: function(node, ignoreSelfAndKids) {
 811 |     // First check for kids if those aren't being ignored
 812 |     if (!ignoreSelfAndKids && node.firstElementChild) {
 813 |       return node.firstElementChild;
 814 |     }
 815 |     // Then for siblings...
 816 |     if (node.nextElementSibling) {
 817 |       return node.nextElementSibling;
 818 |     }
 819 |     // And finally, move up the parent chain *and* find a sibling
 820 |     // (because this is depth-first traversal, we will have already
 821 |     // seen the parent nodes themselves).
 822 |     do {
 823 |       node = node.parentNode;
 824 |     } while (node && !node.nextElementSibling);
 825 |     return node && node.nextElementSibling;
 826 |   },
 827 | 
 828 |   // compares second text to first one
 829 |   // 1 = same text, 0 = completely different text
 830 |   // works the way that it splits both texts into words and then finds words that are unique in second text
 831 |   // the result is given by the lower length of unique parts
 832 |   _textSimilarity: function(textA, textB) {
 833 |     var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 834 |     var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 835 |     if (!tokensA.length || !tokensB.length) {
 836 |       return 0;
 837 |     }
 838 |     var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
 839 |     var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
 840 |     return 1 - distanceB;
 841 |   },
 842 | 
 843 |   _checkByline: function(node, matchString) {
 844 |     if (this._articleByline) {
 845 |       return false;
 846 |     }
 847 | 
 848 |     if (node.getAttribute !== undefined) {
 849 |       var rel = node.getAttribute("rel");
 850 |       var itemprop = node.getAttribute("itemprop");
 851 |     }
 852 | 
 853 |     if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
 854 |       this._articleByline = node.textContent.trim();
 855 |       return true;
 856 |     }
 857 | 
 858 |     return false;
 859 |   },
 860 | 
 861 |   _getNodeAncestors: function(node, maxDepth) {
 862 |     maxDepth = maxDepth || 0;
 863 |     var i = 0, ancestors = [];
 864 |     while (node.parentNode) {
 865 |       ancestors.push(node.parentNode);
 866 |       if (maxDepth && ++i === maxDepth)
 867 |         break;
 868 |       node = node.parentNode;
 869 |     }
 870 |     return ancestors;
 871 |   },
 872 | 
 873 |   /***
 874 |    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 875 |    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 876 |    *
 877 |    * @param page a document to run upon. Needs to be a full document, complete with body.
 878 |    * @return Element
 879 |   **/
 880 |   _grabArticle: function (page) {
 881 |     this.log("**** grabArticle ****");
 882 |     var doc = this._doc;
 883 |     var isPaging = page !== null;
 884 |     page = page ? page : this._doc.body;
 885 | 
 886 |     // We can't grab an article if we don't have a page!
 887 |     if (!page) {
 888 |       this.log("No body found in document. Abort.");
 889 |       return null;
 890 |     }
 891 | 
 892 |     var pageCacheHtml = page.innerHTML;
 893 | 
 894 |     while (true) {
 895 |       this.log("Starting grabArticle loop");
 896 |       var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
 897 | 
 898 |       // First, node prepping. Trash nodes that look cruddy (like ones with the
 899 |       // class name "comment", etc), and turn divs into P tags where they have been
 900 |       // used inappropriately (as in, where they contain no other block level elements.)
 901 |       var elementsToScore = [];
 902 |       var node = this._doc.documentElement;
 903 | 
 904 |       let shouldRemoveTitleHeader = true;
 905 | 
 906 |       while (node) {
 907 | 
 908 |         if (node.tagName === "HTML") {
 909 |           this._articleLang = node.getAttribute("lang");
 910 |         }
 911 | 
 912 |         var matchString = node.className + " " + node.id;
 913 | 
 914 |         if (!this._isProbablyVisible(node)) {
 915 |           this.log("Removing hidden node - " + matchString);
 916 |           node = this._removeAndGetNext(node);
 917 |           continue;
 918 |         }
 919 | 
 920 |         // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
 921 |         if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") {
 922 |           node = this._removeAndGetNext(node);
 923 |           continue;
 924 |         }
 925 | 
 926 |         // Check to see if this node is a byline, and remove it if it is.
 927 |         if (this._checkByline(node, matchString)) {
 928 |           node = this._removeAndGetNext(node);
 929 |           continue;
 930 |         }
 931 | 
 932 |         if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
 933 |           this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim());
 934 |           shouldRemoveTitleHeader = false;
 935 |           node = this._removeAndGetNext(node);
 936 |           continue;
 937 |         }
 938 | 
 939 |         // Remove unlikely candidates
 940 |         if (stripUnlikelyCandidates) {
 941 |           if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
 942 |               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
 943 |               !this._hasAncestorTag(node, "table") &&
 944 |               !this._hasAncestorTag(node, "code") &&
 945 |               node.tagName !== "BODY" &&
 946 |               node.tagName !== "A") {
 947 |             this.log("Removing unlikely candidate - " + matchString);
 948 |             node = this._removeAndGetNext(node);
 949 |             continue;
 950 |           }
 951 | 
 952 |           if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
 953 |             this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString);
 954 |             node = this._removeAndGetNext(node);
 955 |             continue;
 956 |           }
 957 |         }
 958 | 
 959 |         // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
 960 |         if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
 961 |              node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
 962 |              node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
 963 |             this._isElementWithoutContent(node)) {
 964 |           node = this._removeAndGetNext(node);
 965 |           continue;
 966 |         }
 967 | 
 968 |         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
 969 |           elementsToScore.push(node);
 970 |         }
 971 | 
 972 |         // Turn all divs that don't have children block level elements into p's
 973 |         if (node.tagName === "DIV") {
 974 |           // Put phrasing content into paragraphs.
 975 |           var p = null;
 976 |           var childNode = node.firstChild;
 977 |           while (childNode) {
 978 |             var nextSibling = childNode.nextSibling;
 979 |             if (this._isPhrasingContent(childNode)) {
 980 |               if (p !== null) {
 981 |                 p.appendChild(childNode);
 982 |               } else if (!this._isWhitespace(childNode)) {
 983 |                 p = doc.createElement("p");
 984 |                 node.replaceChild(p, childNode);
 985 |                 p.appendChild(childNode);
 986 |               }
 987 |             } else if (p !== null) {
 988 |               while (p.lastChild && this._isWhitespace(p.lastChild)) {
 989 |                 p.removeChild(p.lastChild);
 990 |               }
 991 |               p = null;
 992 |             }
 993 |             childNode = nextSibling;
 994 |           }
 995 | 
 996 |           // Sites like http://mobile.slate.com encloses each paragraph with a DIV
 997 |           // element. DIVs with only a P element inside and no text content can be
 998 |           // safely converted into plain P elements to avoid confusing the scoring
 999 |           // algorithm with DIVs with are, in practice, paragraphs.
1000 |           if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
1001 |             var newNode = node.children[0];
1002 |             node.parentNode.replaceChild(newNode, node);
1003 |             node = newNode;
1004 |             elementsToScore.push(node);
1005 |           } else if (!this._hasChildBlockElement(node)) {
1006 |             node = this._setNodeTag(node, "P");
1007 |             elementsToScore.push(node);
1008 |           }
1009 |         }
1010 |         node = this._getNextNode(node);
1011 |       }
1012 | 
1013 |       /**
1014 |        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1015 |        * Then add their score to their parent node.
1016 |        *
1017 |        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1018 |       **/
1019 |       var candidates = [];
1020 |       this._forEachNode(elementsToScore, function(elementToScore) {
1021 |         if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
1022 |           return;
1023 | 
1024 |         // If this paragraph is less than 25 characters, don't even count it.
1025 |         var innerText = this._getInnerText(elementToScore);
1026 |         if (innerText.length < 25)
1027 |           return;
1028 | 
1029 |         // Exclude nodes with no ancestor.
1030 |         var ancestors = this._getNodeAncestors(elementToScore, 5);
1031 |         if (ancestors.length === 0)
1032 |           return;
1033 | 
1034 |         var contentScore = 0;
1035 | 
1036 |         // Add a point for the paragraph itself as a base.
1037 |         contentScore += 1;
1038 | 
1039 |         // Add points for any commas within this paragraph.
1040 |         contentScore += innerText.split(this.REGEXPS.commas).length;
1041 | 
1042 |         // For every 100 characters in this paragraph, add another point. Up to 3 points.
1043 |         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1044 | 
1045 |         // Initialize and score ancestors.
1046 |         this._forEachNode(ancestors, function(ancestor, level) {
1047 |           if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
1048 |             return;
1049 | 
1050 |           if (typeof(ancestor.readability) === "undefined") {
1051 |             this._initializeNode(ancestor);
1052 |             candidates.push(ancestor);
1053 |           }
1054 | 
1055 |           // Node score divider:
1056 |           // - parent:             1 (no division)
1057 |           // - grandparent:        2
1058 |           // - great grandparent+: ancestor level * 3
1059 |           if (level === 0)
1060 |             var scoreDivider = 1;
1061 |           else if (level === 1)
1062 |             scoreDivider = 2;
1063 |           else
1064 |             scoreDivider = level * 3;
1065 |           ancestor.readability.contentScore += contentScore / scoreDivider;
1066 |         });
1067 |       });
1068 | 
1069 |       // After we've calculated scores, loop through all of the possible
1070 |       // candidate nodes we found and find the one with the highest score.
1071 |       var topCandidates = [];
1072 |       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1073 |         var candidate = candidates[c];
1074 | 
1075 |         // Scale the final candidates score based on link density. Good content
1076 |         // should have a relatively small link density (5% or less) and be mostly
1077 |         // unaffected by this operation.
1078 |         var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
1079 |         candidate.readability.contentScore = candidateScore;
1080 | 
1081 |         this.log("Candidate:", candidate, "with score " + candidateScore);
1082 | 
1083 |         for (var t = 0; t < this._nbTopCandidates; t++) {
1084 |           var aTopCandidate = topCandidates[t];
1085 | 
1086 |           if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
1087 |             topCandidates.splice(t, 0, candidate);
1088 |             if (topCandidates.length > this._nbTopCandidates)
1089 |               topCandidates.pop();
1090 |             break;
1091 |           }
1092 |         }
1093 |       }
1094 | 
1095 |       var topCandidate = topCandidates[0] || null;
1096 |       var neededToCreateTopCandidate = false;
1097 |       var parentOfTopCandidate;
1098 | 
1099 |       // If we still have no top candidate, just use the body as a last resort.
1100 |       // We also have to copy the body node so it is something we can modify.
1101 |       if (topCandidate === null || topCandidate.tagName === "BODY") {
1102 |         // Move all of the page's children into topCandidate
1103 |         topCandidate = doc.createElement("DIV");
1104 |         neededToCreateTopCandidate = true;
1105 |         // Move everything (not just elements, also text nodes etc.) into the container
1106 |         // so we even include text directly in the body:
1107 |         while (page.firstChild) {
1108 |           this.log("Moving child out:", page.firstChild);
1109 |           topCandidate.appendChild(page.firstChild);
1110 |         }
1111 | 
1112 |         page.appendChild(topCandidate);
1113 | 
1114 |         this._initializeNode(topCandidate);
1115 |       } else if (topCandidate) {
1116 |         // Find a better top candidate node if it contains (at least three) nodes which belong to \`topCandidates\` array
1117 |         // and whose scores are quite closed with current \`topCandidate\` node.
1118 |         var alternativeCandidateAncestors = [];
1119 |         for (var i = 1; i < topCandidates.length; i++) {
1120 |           if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
1121 |             alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
1122 |           }
1123 |         }
1124 |         var MINIMUM_TOPCANDIDATES = 3;
1125 |         if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1126 |           parentOfTopCandidate = topCandidate.parentNode;
1127 |           while (parentOfTopCandidate.tagName !== "BODY") {
1128 |             var listsContainingThisAncestor = 0;
1129 |             for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
1130 |               listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
1131 |             }
1132 |             if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1133 |               topCandidate = parentOfTopCandidate;
1134 |               break;
1135 |             }
1136 |             parentOfTopCandidate = parentOfTopCandidate.parentNode;
1137 |           }
1138 |         }
1139 |         if (!topCandidate.readability) {
1140 |           this._initializeNode(topCandidate);
1141 |         }
1142 | 
1143 |         // Because of our bonus system, parents of candidates might have scores
1144 |         // themselves. They get half of the node. There won't be nodes with higher
1145 |         // scores than our topCandidate, but if we see the score going *up* in the first
1146 |         // few steps up the tree, that's a decent sign that there might be more content
1147 |         // lurking in other places that we want to unify in. The sibling stuff
1148 |         // below does some of that - but only if we've looked high enough up the DOM
1149 |         // tree.
1150 |         parentOfTopCandidate = topCandidate.parentNode;
1151 |         var lastScore = topCandidate.readability.contentScore;
1152 |         // The scores shouldn't get too low.
1153 |         var scoreThreshold = lastScore / 3;
1154 |         while (parentOfTopCandidate.tagName !== "BODY") {
1155 |           if (!parentOfTopCandidate.readability) {
1156 |             parentOfTopCandidate = parentOfTopCandidate.parentNode;
1157 |             continue;
1158 |           }
1159 |           var parentScore = parentOfTopCandidate.readability.contentScore;
1160 |           if (parentScore < scoreThreshold)
1161 |             break;
1162 |           if (parentScore > lastScore) {
1163 |             // Alright! We found a better parent to use.
1164 |             topCandidate = parentOfTopCandidate;
1165 |             break;
1166 |           }
1167 |           lastScore = parentOfTopCandidate.readability.contentScore;
1168 |           parentOfTopCandidate = parentOfTopCandidate.parentNode;
1169 |         }
1170 | 
1171 |         // If the top candidate is the only child, use parent instead. This will help sibling
1172 |         // joining logic when adjacent content is actually located in parent's sibling node.
1173 |         parentOfTopCandidate = topCandidate.parentNode;
1174 |         while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
1175 |           topCandidate = parentOfTopCandidate;
1176 |           parentOfTopCandidate = topCandidate.parentNode;
1177 |         }
1178 |         if (!topCandidate.readability) {
1179 |           this._initializeNode(topCandidate);
1180 |         }
1181 |       }
1182 | 
1183 |       // Now that we have the top candidate, look through its siblings for content
1184 |       // that might also be related. Things like preambles, content split by ads
1185 |       // that we removed, etc.
1186 |       var articleContent = doc.createElement("DIV");
1187 |       if (isPaging)
1188 |         articleContent.id = "readability-content";
1189 | 
1190 |       var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
1191 |       // Keep potential top candidate's parent node to try to get text direction of it later.
1192 |       parentOfTopCandidate = topCandidate.parentNode;
1193 |       var siblings = parentOfTopCandidate.children;
1194 | 
1195 |       for (var s = 0, sl = siblings.length; s < sl; s++) {
1196 |         var sibling = siblings[s];
1197 |         var append = false;
1198 | 
1199 |         this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
1200 |         this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
1201 | 
1202 |         if (sibling === topCandidate) {
1203 |           append = true;
1204 |         } else {
1205 |           var contentBonus = 0;
1206 | 
1207 |           // Give a bonus if sibling nodes and top candidates have the example same classname
1208 |           if (sibling.className === topCandidate.className && topCandidate.className !== "")
1209 |             contentBonus += topCandidate.readability.contentScore * 0.2;
1210 | 
1211 |           if (sibling.readability &&
1212 |               ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
1213 |             append = true;
1214 |           } else if (sibling.nodeName === "P") {
1215 |             var linkDensity = this._getLinkDensity(sibling);
1216 |             var nodeContent = this._getInnerText(sibling);
1217 |             var nodeLength = nodeContent.length;
1218 | 
1219 |             if (nodeLength > 80 && linkDensity < 0.25) {
1220 |               append = true;
1221 |             } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
1222 |                        nodeContent.search(/\\.( |\$)/) !== -1) {
1223 |               append = true;
1224 |             }
1225 |           }
1226 |         }
1227 | 
1228 |         if (append) {
1229 |           this.log("Appending node:", sibling);
1230 | 
1231 |           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
1232 |             // We have a node that isn't a common block level element, like a form or td tag.
1233 |             // Turn it into a div so it doesn't get filtered out later by accident.
1234 |             this.log("Altering sibling:", sibling, "to div.");
1235 | 
1236 |             sibling = this._setNodeTag(sibling, "DIV");
1237 |           }
1238 | 
1239 |           articleContent.appendChild(sibling);
1240 |           // Fetch children again to make it compatible
1241 |           // with DOM parsers without live collection support.
1242 |           siblings = parentOfTopCandidate.children;
1243 |           // siblings is a reference to the children array, and
1244 |           // sibling is removed from the array when we call appendChild().
1245 |           // As a result, we must revisit this index since the nodes
1246 |           // have been shifted.
1247 |           s -= 1;
1248 |           sl -= 1;
1249 |         }
1250 |       }
1251 | 
1252 |       if (this._debug)
1253 |         this.log("Article content pre-prep: " + articleContent.innerHTML);
1254 |       // So we have all of the content that we need. Now we clean it up for presentation.
1255 |       this._prepArticle(articleContent);
1256 |       if (this._debug)
1257 |         this.log("Article content post-prep: " + articleContent.innerHTML);
1258 | 
1259 |       if (neededToCreateTopCandidate) {
1260 |         // We already created a fake div thing, and there wouldn't have been any siblings left
1261 |         // for the previous loop, so there's no point trying to create a new div, and then
1262 |         // move all the children over. Just assign IDs and class names here. No need to append
1263 |         // because that already happened anyway.
1264 |         topCandidate.id = "readability-page-1";
1265 |         topCandidate.className = "page";
1266 |       } else {
1267 |         var div = doc.createElement("DIV");
1268 |         div.id = "readability-page-1";
1269 |         div.className = "page";
1270 |         while (articleContent.firstChild) {
1271 |           div.appendChild(articleContent.firstChild);
1272 |         }
1273 |         articleContent.appendChild(div);
1274 |       }
1275 | 
1276 |       if (this._debug)
1277 |         this.log("Article content after paging: " + articleContent.innerHTML);
1278 | 
1279 |       var parseSuccessful = true;
1280 | 
1281 |       // Now that we've gone through the full algorithm, check to see if
1282 |       // we got any meaningful content. If we didn't, we may need to re-run
1283 |       // grabArticle with different flags set. This gives us a higher likelihood of
1284 |       // finding the content, and the sieve approach gives us a higher likelihood of
1285 |       // finding the -right- content.
1286 |       var textLength = this._getInnerText(articleContent, true).length;
1287 |       if (textLength < this._charThreshold) {
1288 |         parseSuccessful = false;
1289 |         page.innerHTML = pageCacheHtml;
1290 | 
1291 |         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1292 |           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1293 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1294 |         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1295 |           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1296 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1297 |         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1298 |           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1299 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1300 |         } else {
1301 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1302 |           // No luck after removing flags, just return the longest text we found during the different loops
1303 |           this._attempts.sort(function (a, b) {
1304 |             return b.textLength - a.textLength;
1305 |           });
1306 | 
1307 |           // But first check if we actually have something
1308 |           if (!this._attempts[0].textLength) {
1309 |             return null;
1310 |           }
1311 | 
1312 |           articleContent = this._attempts[0].articleContent;
1313 |           parseSuccessful = true;
1314 |         }
1315 |       }
1316 | 
1317 |       if (parseSuccessful) {
1318 |         // Find out text direction from ancestors of final top candidate.
1319 |         var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
1320 |         this._someNode(ancestors, function(ancestor) {
1321 |           if (!ancestor.tagName)
1322 |             return false;
1323 |           var articleDir = ancestor.getAttribute("dir");
1324 |           if (articleDir) {
1325 |             this._articleDir = articleDir;
1326 |             return true;
1327 |           }
1328 |           return false;
1329 |         });
1330 |         return articleContent;
1331 |       }
1332 |     }
1333 |   },
1334 | 
1335 |   /**
1336 |    * Check whether the input string could be a byline.
1337 |    * This verifies that the input is a string, and that the length
1338 |    * is less than 100 chars.
1339 |    *
1340 |    * @param possibleByline {string} - a string to check whether its a byline.
1341 |    * @return Boolean - whether the input string is a byline.
1342 |    */
1343 |   _isValidByline: function(byline) {
1344 |     if (typeof byline == "string" || byline instanceof String) {
1345 |       byline = byline.trim();
1346 |       return (byline.length > 0) && (byline.length < 100);
1347 |     }
1348 |     return false;
1349 |   },
1350 | 
1351 |   /**
1352 |    * Converts some of the common HTML entities in string to their corresponding characters.
1353 |    *
1354 |    * @param str {string} - a string to unescape.
1355 |    * @return string without HTML entity.
1356 |    */
1357 |   _unescapeHtmlEntities: function(str) {
1358 |     if (!str) {
1359 |       return str;
1360 |     }
1361 | 
1362 |     var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1363 |     return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
1364 |       return htmlEscapeMap[tag];
1365 |     }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
1366 |       var num = parseInt(hex || numStr, hex ? 16 : 10);
1367 |       return String.fromCharCode(num);
1368 |     });
1369 |   },
1370 | 
1371 |   /**
1372 |    * Try to extract metadata from JSON-LD object.
1373 |    * For now, only Schema.org objects of type Article or its subtypes are supported.
1374 |    * @return Object with any metadata that could be extracted (possibly none)
1375 |    */
1376 |   _getJSONLD: function (doc) {
1377 |     var scripts = this._getAllNodesWithTag(doc, ["script"]);
1378 | 
1379 |     var metadata;
1380 | 
1381 |     this._forEachNode(scripts, function(jsonLdElement) {
1382 |       if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") {
1383 |         try {
1384 |           // Strip CDATA markers if present
1385 |           var content = jsonLdElement.textContent.replace(/^\\s*<!\\[CDATA\\[|\\]\\]>\\s*\$/g, "");
1386 |           var parsed = JSON.parse(content);
1387 |           if (
1388 |             !parsed["@context"] ||
1389 |             !parsed["@context"].match(/^https?\\:\\/\\/schema\\.org\\/?\$/)
1390 |           ) {
1391 |             return;
1392 |           }
1393 | 
1394 |           if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
1395 |             parsed = parsed["@graph"].find(function(it) {
1396 |               return (it["@type"] || "").match(
1397 |                 this.REGEXPS.jsonLdArticleTypes
1398 |               );
1399 |             });
1400 |           }
1401 | 
1402 |           if (
1403 |             !parsed ||
1404 |             !parsed["@type"] ||
1405 |             !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1406 |           ) {
1407 |             return;
1408 |           }
1409 | 
1410 |           metadata = {};
1411 | 
1412 |           if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
1413 |             // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1414 |             // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1415 |             // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1416 | 
1417 |             var title = this._getArticleTitle();
1418 |             var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1419 |             var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1420 | 
1421 |             if (headlineMatches && !nameMatches) {
1422 |               metadata.title = parsed.headline;
1423 |             } else {
1424 |               metadata.title = parsed.name;
1425 |             }
1426 |           } else if (typeof parsed.name === "string") {
1427 |             metadata.title = parsed.name.trim();
1428 |           } else if (typeof parsed.headline === "string") {
1429 |             metadata.title = parsed.headline.trim();
1430 |           }
1431 |           if (parsed.author) {
1432 |             if (typeof parsed.author.name === "string") {
1433 |               metadata.byline = parsed.author.name.trim();
1434 |             } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
1435 |               metadata.byline = parsed.author
1436 |                 .filter(function(author) {
1437 |                   return author && typeof author.name === "string";
1438 |                 })
1439 |                 .map(function(author) {
1440 |                   return author.name.trim();
1441 |                 })
1442 |                 .join(", ");
1443 |             }
1444 |           }
1445 |           if (typeof parsed.description === "string") {
1446 |             metadata.excerpt = parsed.description.trim();
1447 |           }
1448 |           if (
1449 |             parsed.publisher &&
1450 |             typeof parsed.publisher.name === "string"
1451 |           ) {
1452 |             metadata.siteName = parsed.publisher.name.trim();
1453 |           }
1454 |           if (typeof parsed.datePublished === "string") {
1455 |             metadata.datePublished = parsed.datePublished.trim();
1456 |           }
1457 |           return;
1458 |         } catch (err) {
1459 |           this.log(err.message);
1460 |         }
1461 |       }
1462 |     });
1463 |     return metadata ? metadata : {};
1464 |   },
1465 | 
1466 |   /**
1467 |    * Attempts to get excerpt and byline metadata for the article.
1468 |    *
1469 |    * @param {Object} jsonld — object containing any metadata that
1470 |    * could be extracted from JSON-LD object.
1471 |    *
1472 |    * @return Object with optional "excerpt" and "byline" properties
1473 |    */
1474 |   _getArticleMetadata: function(jsonld) {
1475 |     var metadata = {};
1476 |     var values = {};
1477 |     var metaElements = this._doc.getElementsByTagName("meta");
1478 | 
1479 |     // property is a space-separated list of values
1480 |     var propertyPattern = /\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi;
1481 | 
1482 |     // name is a single value
1483 |     var namePattern = /^\\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\\s*[-\\.:]\\s*)?(author|creator|pub-date|description|title|site_name)\\s*\$/i;
1484 | 
1485 |     // Find description tags.
1486 |     this._forEachNode(metaElements, function(element) {
1487 |       var elementName = element.getAttribute("name");
1488 |       var elementProperty = element.getAttribute("property");
1489 |       var content = element.getAttribute("content");
1490 |       if (!content) {
1491 |         return;
1492 |       }
1493 |       var matches = null;
1494 |       var name = null;
1495 | 
1496 |       if (elementProperty) {
1497 |         matches = elementProperty.match(propertyPattern);
1498 |         if (matches) {
1499 |           // Convert to lowercase, and remove any whitespace
1500 |           // so we can match below.
1501 |           name = matches[0].toLowerCase().replace(/\\s/g, "");
1502 |           // multiple authors
1503 |           values[name] = content.trim();
1504 |         }
1505 |       }
1506 |       if (!matches && elementName && namePattern.test(elementName)) {
1507 |         name = elementName;
1508 |         if (content) {
1509 |           // Convert to lowercase, remove any whitespace, and convert dots
1510 |           // to colons so we can match below.
1511 |           name = name.toLowerCase().replace(/\\s/g, "").replace(/\\./g, ":");
1512 |           values[name] = content.trim();
1513 |         }
1514 |       }
1515 |     });
1516 | 
1517 |     // get title
1518 |     metadata.title = jsonld.title ||
1519 |                      values["dc:title"] ||
1520 |                      values["dcterm:title"] ||
1521 |                      values["og:title"] ||
1522 |                      values["weibo:article:title"] ||
1523 |                      values["weibo:webpage:title"] ||
1524 |                      values["title"] ||
1525 |                      values["twitter:title"] ||
1526 |                      values["parsely-title"];
1527 | 
1528 |     if (!metadata.title) {
1529 |       metadata.title = this._getArticleTitle();
1530 |     }
1531 | 
1532 |     // get author
1533 |     metadata.byline = jsonld.byline ||
1534 |                       values["dc:creator"] ||
1535 |                       values["dcterm:creator"] ||
1536 |                       values["author"] ||
1537 |                       values["parsely-author"];
1538 | 
1539 |     // get description
1540 |     metadata.excerpt = jsonld.excerpt ||
1541 |                        values["dc:description"] ||
1542 |                        values["dcterm:description"] ||
1543 |                        values["og:description"] ||
1544 |                        values["weibo:article:description"] ||
1545 |                        values["weibo:webpage:description"] ||
1546 |                        values["description"] ||
1547 |                        values["twitter:description"];
1548 | 
1549 |     // get site name
1550 |     metadata.siteName = jsonld.siteName ||
1551 |                         values["og:site_name"];
1552 | 
1553 |     // get article published time
1554 |     metadata.publishedTime = jsonld.datePublished ||
1555 |                              values["article:published_time"] ||
1556 |                              values["parsely-pub-date"] ||
1557 |                              null;
1558 | 
1559 |     // in many sites the meta value is escaped with HTML entities,
1560 |     // so here we need to unescape it
1561 |     metadata.title = this._unescapeHtmlEntities(metadata.title);
1562 |     metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1563 |     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1564 |     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1565 |     metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1566 | 
1567 |     return metadata;
1568 |   },
1569 | 
1570 |   /**
1571 |    * Check if node is image, or if node contains exactly only one image
1572 |    * whether as a direct child or as its descendants.
1573 |    *
1574 |    * @param Element
1575 |   **/
1576 |   _isSingleImage: function(node) {
1577 |     if (node.tagName === "IMG") {
1578 |       return true;
1579 |     }
1580 | 
1581 |     if (node.children.length !== 1 || node.textContent.trim() !== "") {
1582 |       return false;
1583 |     }
1584 | 
1585 |     return this._isSingleImage(node.children[0]);
1586 |   },
1587 | 
1588 |   /**
1589 |    * Unwrap images hidden in <noscript> fallbacks. First, every <img> without any attribute that
1590 |    * could reference an image is removed. Then, for each <noscript> holding a single image whose
1591 |    * previous element sibling is also a single image, that sibling is replaced by the <noscript>
1592 |    * image (helps on sites such as Medium). The <noscript> element itself is not removed here.
1593 |    *
1594 |    * @param doc - document whose img and noscript elements are processed in place.
1595 |   **/
1596 |   _unwrapNoscriptImages: function(doc) {
1597 |     // Remove every img that has no src/srcset/data-src/data-srcset attribute and no attribute value
1598 |     // naming an image file, so that a placeholder img is not replaced by a noscript img below.
1599 |     var imgs = Array.from(doc.getElementsByTagName("img"));
1600 |     this._forEachNode(imgs, function(img) {
1601 |       for (var i = 0; i < img.attributes.length; i++) {
1602 |         var attr = img.attributes[i];
1603 |         switch (attr.name) {
1604 |           case "src":
1605 |           case "srcset":
1606 |           case "data-src":
1607 |           case "data-srcset":
1608 |             return;
1609 |         }
1610 | 
1611 |         if (/\\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1612 |           return;
1613 |         }
1614 |       }
1615 | 
1616 |       img.parentNode.removeChild(img);
1617 |     });
1618 | 
1619 |     // Next, visit each noscript and try to extract its image.
1620 |     var noscripts = Array.from(doc.getElementsByTagName("noscript"));
1621 |     this._forEachNode(noscripts, function(noscript) {
1622 |       // Parse the noscript markup into a scratch div and require that it holds a single image.
1623 |       var tmp = doc.createElement("div");
1624 |       tmp.innerHTML = noscript.innerHTML;
1625 |       if (!this._isSingleImage(tmp)) {
1626 |         return;
1627 |       }
1628 | 
1629 |       // If the previous element sibling is itself a single image, replace it with the noscript
1630 |       // content. Its src/srcset and image-like attribute values are copied onto the new image
1631 |       // first; a name the new image already uses is stored under a "data-old-" prefix instead.
1632 |       var prevElement = noscript.previousElementSibling;
1633 |       if (prevElement && this._isSingleImage(prevElement)) {
1634 |         var prevImg = prevElement;
1635 |         if (prevImg.tagName !== "IMG") {
1636 |           prevImg = prevElement.getElementsByTagName("img")[0];
1637 |         }
1638 | 
1639 |         var newImg = tmp.getElementsByTagName("img")[0];
1640 |         for (var i = 0; i < prevImg.attributes.length; i++) {
1641 |           var attr = prevImg.attributes[i];
1642 |           if (attr.value === "") {
1643 |             continue;
1644 |           }
1645 | 
1646 |           if (attr.name === "src" || attr.name === "srcset" || /\\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1647 |             if (newImg.getAttribute(attr.name) === attr.value) {
1648 |               continue;
1649 |             }
1650 | 
1651 |             var attrName = attr.name;
1652 |             if (newImg.hasAttribute(attrName)) {
1653 |               attrName = "data-old-" + attrName;
1654 |             }
1655 | 
1656 |             newImg.setAttribute(attrName, attr.value);
1657 |           }
1658 |         }
1659 | 
1660 |         noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1661 |       }
1662 |     });
1663 |   },
1664 | 
1665 |   /**
1666 |    * Removes script tags from the document.
1667 |    *
1668 |    * @param Element
1669 |   **/
1670 |   _removeScripts: function(doc) {
1671 |     this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
1672 |   },
1673 | 
1674 |   /**
1675 |    * Check if this node has only whitespace and a single element with given tag
1676 |    * Returns false if the DIV node contains non-empty text nodes
1677 |    * or if it contains no element with given tag or more than 1 element.
1678 |    *
1679 |    * @param Element
1680 |    * @param string tag of child element
1681 |   **/
1682 |   _hasSingleTagInsideElement: function(element, tag) {
1683 |     // There should be exactly 1 element child with given tag
1684 |     if (element.children.length != 1 || element.children[0].tagName !== tag) {
1685 |       return false;
1686 |     }
1687 | 
1688 |     // And there should be no text nodes with real content
1689 |     return !this._someNode(element.childNodes, function(node) {
1690 |       return node.nodeType === this.TEXT_NODE &&
1691 |              this.REGEXPS.hasContent.test(node.textContent);
1692 |     });
1693 |   },
1694 | 
1695 |   _isElementWithoutContent: function(node) {
1696 |     return node.nodeType === this.ELEMENT_NODE &&
1697 |       node.textContent.trim().length == 0 &&
1698 |       (node.children.length == 0 ||
1699 |        node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
1700 |   },
1701 | 
1702 |   /**
1703 |    * Determine whether element has any children block level elements.
1704 |    *
1705 |    * @param Element
1706 |    */
1707 |   _hasChildBlockElement: function (element) {
1708 |     return this._someNode(element.childNodes, function(node) {
1709 |       return this.DIV_TO_P_ELEMS.has(node.tagName) ||
1710 |              this._hasChildBlockElement(node);
1711 |     });
1712 |   },
1713 | 
1714 |   /***
1715 |    * Determine if a node qualifies as phrasing content.
1716 |    * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
1717 |   **/
1718 |   _isPhrasingContent: function(node) {
1719 |     return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
1720 |       ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
1721 |         this._everyNode(node.childNodes, this._isPhrasingContent));
1722 |   },
1723 | 
1724 |   _isWhitespace: function(node) {
1725 |     return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
1726 |            (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
1727 |   },
1728 | 
1729 |   /**
1730 |    * Get the inner text of a node - cross browser compatibly.
1731 |    * This also strips out any excess whitespace to be found.
1732 |    *
1733 |    * @param Element
1734 |    * @param Boolean normalizeSpaces (default: true)
1735 |    * @return string
1736 |   **/
1737 |   _getInnerText: function(e, normalizeSpaces) { // Returns the trimmed textContent of e, optionally normalized.
1738 |     normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; // only an omitted argument defaults to true; an explicit false is respected
1739 |     var textContent = e.textContent.trim();
1740 | 
1741 |     if (normalizeSpaces) {
1742 |       return textContent.replace(this.REGEXPS.normalize, " "); // matches of REGEXPS.normalize are replaced with one space
1743 |     }
1744 |     return textContent;
1745 |   },
1746 | 
1747 |   /**
1748 |    * Get the number of times a string s appears in the node e.
1749 |    *
1750 |    * @param Element
1751 |    * @param string - what to split on. Default is ","
1752 |    * @return number (integer)
1753 |   **/
1754 |   _getCharCount: function(e, s) { // Counts how often s occurs in the normalized inner text of e.
1755 |     s = s || ","; // a falsy s (undefined or the empty string) falls back to a comma
1756 |     return this._getInnerText(e).split(s).length - 1; // n occurrences split the text into n + 1 pieces
1757 |   },
1758 | 
1759 |   /**
1760 |    * Remove the style attribute on every e and under.
1761 |    * TODO: Test if getElementsByTagName(*) is faster.
1762 |    *
1763 |    * @param Element
1764 |    * @return void
1765 |   **/
1766 |   _cleanStyles: function(e) { // Strips presentational attributes from e and, recursively, from its element descendants.
1767 |     if (!e || e.tagName.toLowerCase() === "svg") // a missing node or an svg element is skipped together with its whole subtree
1768 |       return;
1769 | 
1770 |     // Remove \`style\` and deprecated presentational attributes
1771 |     for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
1772 |       e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
1773 |     }
1774 | 
1775 |     if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { // width and height are removed only from the tags in this list
1776 |       e.removeAttribute("width");
1777 |       e.removeAttribute("height");
1778 |     }
1779 | 
1780 |     var cur = e.firstElementChild; // walk element children only; text and comment nodes carry no attributes
1781 |     while (cur !== null) {
1782 |       this._cleanStyles(cur);
1783 |       cur = cur.nextElementSibling;
1784 |     }
1785 |   },
1786 | 
1787 |   /**
1788 |    * Get the density of links as a fraction of the content.
1789 |    * This is the amount of text that is inside a link divided by the total text in the node.
1790 |    *
1791 |    * @param Element
1792 |    * @return number (float)
1793 |   **/
1794 |   _getLinkDensity: function(element) { // Returns weighted link text length divided by total text length of element.
1795 |     var textLength = this._getInnerText(element).length;
1796 |     if (textLength === 0) // no text at all: report 0 rather than dividing by zero
1797 |       return 0;
1798 | 
1799 |     var linkLength = 0;
1800 | 
1801 |     // XXX implement _reduceNodeList?
1802 |     this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
1803 |       var href = linkNode.getAttribute("href");
1804 |       var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; // links whose href matches REGEXPS.hashUrl count at 0.3 of their text length
1805 |       linkLength += this._getInnerText(linkNode).length * coefficient;
1806 |     });
1807 | 
1808 |     return linkLength / textLength;
1809 |   },
1810 | 
1811 |   /**
1812 |    * Get an element's class/id weight. Uses regular expressions to tell if this
1813 |    * element looks good or bad.
1814 |    *
1815 |    * @param Element
1816 |    * @return number (Integer)
1817 |   **/
1818 |   _getClassWeight: function(e) { // Scores e from its className and id; the result lies between -50 and +50.
1819 |     if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) // weighting disabled: every element scores 0
1820 |       return 0;
1821 | 
1822 |     var weight = 0;
1823 | 
1824 |     // Look for a special classname
1825 |     if (typeof(e.className) === "string" && e.className !== "") { // the typeof guard skips className values that are not plain strings
1826 |       if (this.REGEXPS.negative.test(e.className))
1827 |         weight -= 25;
1828 | 
1829 |       if (this.REGEXPS.positive.test(e.className)) // not an else: a name matching both patterns nets to 0
1830 |         weight += 25;
1831 |     }
1832 | 
1833 |     // Look for a special ID
1834 |     if (typeof(e.id) === "string" && e.id !== "") {
1835 |       if (this.REGEXPS.negative.test(e.id))
1836 |         weight -= 25;
1837 | 
1838 |       if (this.REGEXPS.positive.test(e.id))
1839 |         weight += 25;
1840 |     }
1841 | 
1842 |     return weight;
1843 |   },
1844 | 
1845 |   /**
1846 |    * Clean a node of all elements of type "tag".
1847 |    * (Unless it's a youtube/vimeo video. People love movies.)
1848 |    *
1849 |    * @param Element
1850 |    * @param string tag to clean
1851 |    * @return void
1852 |    **/
1853 |   _clean: function(e, tag) {
1854 |     var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
1855 | 
1856 |     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
1857 |       // Allow youtube and vimeo videos through as people usually want to see those.
1858 |       if (isEmbed) {
1859 |         // First, check the elements attributes to see if any of them contain youtube or vimeo
1860 |         for (var i = 0; i < element.attributes.length; i++) {
1861 |           if (this._allowedVideoRegex.test(element.attributes[i].value)) {
1862 |             return false;
1863 |           }
1864 |         }
1865 | 
1866 |         // For embed with <object> tag, check inner HTML as well.
1867 |         if (element.tagName === "object" && this._allowedVideoRegex.test(element.innerHTML)) {
1868 |           return false;
1869 |         }
1870 |       }
1871 | 
1872 |       return true;
1873 |     });
1874 |   },
1875 | 
1876 |   /**
1877 |    * Check whether any ancestor of the given node has a tag name matching the
1878 |    * provided one.
1879 |    * @param  HTMLElement node
1880 |    * @param  String      tagName
1881 |    * @param  Number      maxDepth
1882 |    * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
1883 |    * @return Boolean
1884 |    */
1885 |   _hasAncestorTag: function(node, tagName, maxDepth, filterFn) { // Walks up the parentNode chain looking for tagName.
1886 |     maxDepth = maxDepth || 3; // undefined or 0 means 3; a negative value disables the depth limit below
1887 |     tagName = tagName.toUpperCase(); // compared against parentNode.tagName, so callers may pass lower-case names
1888 |     var depth = 0;
1889 |     while (node.parentNode) {
1890 |       if (maxDepth > 0 && depth > maxDepth) // depth starts at 0, so up to maxDepth + 1 ancestors are examined
1891 |         return false;
1892 |       if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) // an ancestor rejected by filterFn is skipped and the walk continues
1893 |         return true;
1894 |       node = node.parentNode;
1895 |       depth++;
1896 |     }
1897 |     return false;
1898 |   },
1899 | 
1900 |   /**
1901 |    * Return an object indicating how many rows and columns this table has.
1902 |    */
1903 |   _getRowAndColumnCount: function(table) { // Returns {rows, columns}: summed row spans and the widest row measured in td cells.
1904 |     var rows = 0;
1905 |     var columns = 0;
1906 |     var trs = table.getElementsByTagName("tr");
1907 |     for (var i = 0; i < trs.length; i++) {
1908 |       var rowspan = trs[i].getAttribute("rowspan") || 0;
1909 |       if (rowspan) {
1910 |         rowspan = parseInt(rowspan, 10);
1911 |       }
1912 |       rows += (rowspan || 1); // a missing, zero or unparsable rowspan counts as 1
1913 | 
1914 |       // Now look for column-related info
1915 |       var columnsInThisRow = 0;
1916 |       var cells = trs[i].getElementsByTagName("td"); // only td cells are counted; th cells are not
1917 |       for (var j = 0; j < cells.length; j++) {
1918 |         var colspan = cells[j].getAttribute("colspan") || 0;
1919 |         if (colspan) {
1920 |           colspan = parseInt(colspan, 10);
1921 |         }
1922 |         columnsInThisRow += (colspan || 1); // a missing, zero or unparsable colspan counts as 1
1923 |       }
1924 |       columns = Math.max(columns, columnsInThisRow);
1925 |     }
1926 |     return {rows: rows, columns: columns};
1927 |   },
1928 | 
1929 |   /**
1930 |    * Look for 'data' (as opposed to 'layout') tables, for which we use
1931 |    * similar checks as
1932 |    * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
1933 |    */
1934 |   _markDataTables: function(root) { // Sets _readabilityDataTable (true = data, false = layout) on every table under root; the first matching rule wins.
1935 |     var tables = root.getElementsByTagName("table");
1936 |     for (var i = 0; i < tables.length; i++) {
1937 |       var table = tables[i];
1938 |       var role = table.getAttribute("role");
1939 |       if (role == "presentation") {
1940 |         table._readabilityDataTable = false;
1941 |         continue;
1942 |       }
1943 |       var datatable = table.getAttribute("datatable");
1944 |       if (datatable == "0") {
1945 |         table._readabilityDataTable = false;
1946 |         continue;
1947 |       }
1948 |       var summary = table.getAttribute("summary");
1949 |       if (summary) { // any non-empty summary attribute marks a data table
1950 |         table._readabilityDataTable = true;
1951 |         continue;
1952 |       }
1953 | 
1954 |       var caption = table.getElementsByTagName("caption")[0];
1955 |       if (caption && caption.childNodes.length > 0) { // a caption that has at least one child node
1956 |         table._readabilityDataTable = true;
1957 |         continue;
1958 |       }
1959 | 
1960 |       // If the table has a descendant with any of these tags, consider a data table:
1961 |       var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
1962 |       var descendantExists = function(tag) {
1963 |         return !!table.getElementsByTagName(tag)[0];
1964 |       };
1965 |       if (dataTableDescendants.some(descendantExists)) {
1966 |         this.log("Data table because found data-y descendant");
1967 |         table._readabilityDataTable = true;
1968 |         continue;
1969 |       }
1970 | 
1971 |       // Nested tables indicate a layout table:
1972 |       if (table.getElementsByTagName("table")[0]) {
1973 |         table._readabilityDataTable = false;
1974 |         continue;
1975 |       }
1976 | 
1977 |       var sizeInfo = this._getRowAndColumnCount(table);
1978 | 
1979 |       if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
1980 |         // single column/row tables are commonly used for page layout purposes.
1981 |         table._readabilityDataTable = false;
1982 |         continue;
1983 |       }
1984 | 
1985 |       if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { // large in either dimension: treat as data
1986 |         table._readabilityDataTable = true;
1987 |         continue;
1988 |       }
1989 |       // Now just go by size entirely:
1990 |       table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
1991 |     }
1992 |   },
1993 | 
1994 |   /* convert images and figures that have properties like data-src into images that can be loaded without JS */
1995 |   _fixLazyImages: function (root) { // Two passes per element: drop tiny base64 placeholder src values, then copy image URLs from other attributes into src/srcset.
1996 |     this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
1997 |       // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
1998 |       // So, here we check if the data uri is too short, just might as well remove it.
1999 |       if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
2000 |         // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
2001 |         var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
2002 |         if (parts[1] === "image/svg+xml") {
2003 |           return;
2004 |         }
2005 | 
2006 |         // Make sure this element has other attributes which contains image.
2007 |         // If it doesn't, then this src is important and shouldn't be removed.
2008 |         var srcCouldBeRemoved = false;
2009 |         for (var i = 0; i < elem.attributes.length; i++) {
2010 |           var attr = elem.attributes[i];
2011 |           if (attr.name === "src") {
2012 |             continue;
2013 |           }
2014 | 
2015 |           if (/\\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2016 |             srcCouldBeRemoved = true;
2017 |             break;
2018 |           }
2019 |         }
2020 | 
2021 |         // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
2022 |         // it will be too small, therefore it might be placeholder image.
2023 |         if (srcCouldBeRemoved) {
2024 |           var b64starts = elem.src.search(/base64\\s*/i) + 7; // 7 skips the word base64 plus one following character
2025 |           var b64length = elem.src.length - b64starts;
2026 |           if (b64length < 133) {
2027 |             elem.removeAttribute("src");
2028 |           }
2029 |         }
2030 |       }
2031 | 
2032 |       // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2033 |       if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) { // already has a source and is not marked lazy: leave it alone
2034 |         return;
2035 |       }
2036 | 
2037 |       for (var j = 0; j < elem.attributes.length; j++) {
2038 |         attr = elem.attributes[j]; // reuses the function-scoped var attr declared in the placeholder check above
2039 |         if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") {
2040 |           continue;
2041 |         }
2042 |         var copyTo = null;
2043 |         if (/\\.(jpg|jpeg|png|webp)\\s+\\d/.test(attr.value)) { // image URL followed by whitespace and a digit: treated as a srcset candidate list
2044 |           copyTo = "srcset";
2045 |         } else if (/^\\s*\\S+\\.(jpg|jpeg|png|webp)\\S*\\s*\$/.test(attr.value)) { // a single whitespace-free image URL: treated as src
2046 |           copyTo = "src";
2047 |         }
2048 |         if (copyTo) {
2049 |           //if this is an img or picture, set the attribute directly
2050 |           if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
2051 |             elem.setAttribute(copyTo, attr.value);
2052 |           } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
2053 |             //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2054 |             //see the nytimes-3 testcase for an example
2055 |             var img = this._doc.createElement("img");
2056 |             img.setAttribute(copyTo, attr.value);
2057 |             elem.appendChild(img);
2058 |           }
2059 |         }
2060 |       }
2061 |     });
2062 |   },
2063 | 
2064 |   _getTextDensity: function(e, tags) { // Ratio of text inside descendants matching tags to the total text of e.
2065 |     var textLength = this._getInnerText(e, true).length;
2066 |     if (textLength === 0) { // no text at all: report 0 rather than dividing by zero
2067 |       return 0;
2068 |     }
2069 |     var childrenLength = 0;
2070 |     var children = this._getAllNodesWithTag(e, tags);
2071 |     this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length); // text of a matching node nested in another matching node is added once per match, so the ratio can exceed 1
2072 |     return childrenLength / textLength;
2073 |   },
2074 | 
2075 |   /**
2076 |    * Clean an element of all tags of type "tag" if they look fishy.
2077 |    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2078 |    *
2079 |    * @return void
2080 |    **/
2081 |   _cleanConditionally: function(e, tag) {
2082 |     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
2083 |       return;
2084 | 
2085 |     // Gather counts for other typical elements embedded within.
2086 |     // Traverse backwards so we can remove nodes at the same time
2087 |     // without effecting the traversal.
2088 |     //
2089 |     // TODO: Consider taking into account original contentScore here.
2090 |     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
2091 |       // First check if this node IS data table, in which case don't remove it.
2092 |       var isDataTable = function(t) {
2093 |         return t._readabilityDataTable;
2094 |       };
2095 | 
2096 |       var isList = tag === "ul" || tag === "ol";
2097 |       if (!isList) {
2098 |         var listLength = 0;
2099 |         var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
2100 |         this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length);
2101 |         isList = listLength / this._getInnerText(node).length > 0.9;
2102 |       }
2103 | 
2104 |       if (tag === "table" && isDataTable(node)) {
2105 |         return false;
2106 |       }
2107 | 
2108 |       // Next check if we're inside a data table, in which case don't remove it as well.
2109 |       if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
2110 |         return false;
2111 |       }
2112 | 
2113 |       if (this._hasAncestorTag(node, "code")) {
2114 |         return false;
2115 |       }
2116 | 
2117 |       // keep element if it has a data tables
2118 |       if ([...node.getElementsByTagName("table")].some( tbl => tbl._readabilityDataTable)) {
2119 |         return false;
2120 |       }
2121 | 
2122 |       var weight = this._getClassWeight(node);
2123 | 
2124 |       this.log("Cleaning Conditionally", node);
2125 | 
2126 |       var contentScore = 0;
2127 | 
2128 |       if (weight + contentScore < 0) {
2129 |         return true;
2130 |       }
2131 | 
2132 |       if (this._getCharCount(node, ",") < 10) {
2133 |         // If there are not very many commas, and the number of
2134 |         // non-paragraph elements is more than paragraphs or other
2135 |         // ominous signs, remove the element.
2136 |         var p = node.getElementsByTagName("p").length;
2137 |         var img = node.getElementsByTagName("img").length;
2138 |         var li = node.getElementsByTagName("li").length - 100;
2139 |         var input = node.getElementsByTagName("input").length;
2140 |         var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]);
2141 | 
2142 |         var embedCount = 0;
2143 |         var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
2144 | 
2145 |         for (var i = 0; i < embeds.length; i++) {
2146 |           // If this embed has attribute that matches video regex, don't delete it.
2147 |           for (var j = 0; j < embeds[i].attributes.length; j++) {
2148 |             if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2149 |               return false;
2150 |             }
2151 |           }
2152 | 
2153 |           // For embed with <object> tag, check inner HTML as well.
2154 |           if (embeds[i].tagName === "object" && this._allowedVideoRegex.test(embeds[i].innerHTML)) {
2155 |             return false;
2156 |           }
2157 | 
2158 |           embedCount++;
2159 |         }
2160 | 
2161 |         var innerText = this._getInnerText(node);
2162 | 
2163 |         // toss any node whose inner text contains nothing but suspicious words
2164 |         if (this.REGEXPS.adWords.test(innerText) || this.REGEXPS.loadingWords.test(innerText)) {
2165 |           return true;
2166 |         }
2167 | 
2168 |         var contentLength = innerText.length;
2169 |         var linkDensity = this._getLinkDensity(node);
2170 |         var textishTags = ["SPAN", "LI", "TD"].concat(Array.from(this.DIV_TO_P_ELEMS));
2171 |         var textDensity = this._getTextDensity(node, textishTags);
2172 |         var isFigureChild = this._hasAncestorTag(node, "figure");
2173 | 
2174 |         // apply shadiness checks, then check for exceptions
2175 |         const shouldRemoveNode = () => {
2176 |           const errs = [];
2177 |           if (!isFigureChild && img > 1 && p / img < 0.5) {
2178 |             errs.push(\`Bad p to img ratio (img=\${img}, p=\${p})\`);
2179 |           }
2180 |           if (!isList && li > p) {
2181 |             errs.push(\`Too many li's outside of a list. (li=\${li} > p=\${p})\`);
2182 |           }
2183 |           if (input > Math.floor(p/3)) {
2184 |             errs.push(\`Too many inputs per p. (input=\${input}, p=\${p})\`);
2185 |           }
2186 |           if (!isList && !isFigureChild && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && linkDensity > 0) {
2187 |             errs.push(\`Suspiciously short. (headingDensity=\${headingDensity}, img=\${img}, linkDensity=\${linkDensity})\`);
2188 |           }
2189 |           if (!isList && weight < 25 && linkDensity > (0.2 + this._linkDensityModifier)) {
2190 |             errs.push(\`Low weight and a little linky. (linkDensity=\${linkDensity})\`);
2191 |           }
2192 |           if (weight >= 25 && linkDensity > (0.5 + this._linkDensityModifier)) {
2193 |             errs.push(\`High weight and mostly links. (linkDensity=\${linkDensity})\`);
2194 |           }
2195 |           if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
2196 |             errs.push(\`Suspicious embed. (embedCount=\${embedCount}, contentLength=\${contentLength})\`);
2197 |           }
2198 |           if (img === 0 && textDensity === 0) {
2199 |             errs.push(\`No useful content. (img=\${img}, textDensity=\${textDensity})\`);
2200 |           }
2201 | 
2202 |           if (errs.length > 0) {
2203 |             this.log("Checks failed", errs);
2204 |             return true;
2205 |           }
2206 | 
2207 |           return false;
2208 |         };
2209 | 
2210 |         var haveToRemove = shouldRemoveNode();
2211 | 
2212 |         // Allow simple lists of images to remain in pages
2213 |         if (isList && haveToRemove) {
2214 |           for (var x = 0; x < node.children.length; x++) {
2215 |             let child = node.children[x];
2216 |             // Don't filter in lists with li's that contain more than one child
2217 |             if (child.children.length > 1) {
2218 |               return haveToRemove;
2219 |             }
2220 |           }
2221 |           let li_count = node.getElementsByTagName("li").length;
2222 |           // Only allow the list to remain if every li contains an image
2223 |           if (img == li_count) {
2224 |             return false;
2225 |           }
2226 |         }
2227 |         return haveToRemove;
2228 |       }
2229 |       return false;
2230 |     });
2231 |   },
2232 | 
2233 |   /**
2234 |    * Clean out elements that match the specified conditions
2235 |    *
2236 |    * @param Element
2237 |    * @param Function determines whether a node should be removed
2238 |    * @return void
2239 |    **/
2240 |   _cleanMatchedNodes: function(e, filter) { // Removes each node visited after e for which filter returns a truthy value.
2241 |     var endOfSearchMarkerNode = this._getNextNode(e, true); // NOTE(review): presumably the first node after the subtree of e; _getNextNode is defined outside this view
2242 |     var next = this._getNextNode(e);
2243 |     while (next && next != endOfSearchMarkerNode) {
2244 |       if (filter.call(this, next, next.className + " " + next.id)) { // filter runs with this bound to the current object and receives the node plus its class and id text
2245 |         next = this._removeAndGetNext(next);
2246 |       } else {
2247 |         next = this._getNextNode(next);
2248 |       }
2249 |     }
2250 |   },
2251 | 
2252 |   /**
2253 |    * Clean out spurious headers from an Element.
2254 |    *
2255 |    * @param Element
2256 |    * @return void
2257 |   **/
2258 |   _cleanHeaders: function(e) { // Removes h1 and h2 elements under e whose class weight is negative.
2259 |     let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
2260 |     this._removeNodes(headingNodes, function(node) {
2261 |       let shouldRemove = this._getClassWeight(node) < 0; // never true while FLAG_WEIGHT_CLASSES is inactive, because the weight is then 0
2262 |       if (shouldRemove) {
2263 |         this.log("Removing header with low class weight:", node);
2264 |       }
2265 |       return shouldRemove;
2266 |     });
2267 |   },
2268 | 
2269 |   /**
2270 |    * Check if this node is an H1 or H2 element whose content is mostly
2271 |    * the same as the article title.
2272 |    *
2273 |    * @param Element  the node to check.
2274 |    * @return boolean indicating whether this is a title-like header.
2275 |    */
2276 |   _headerDuplicatesTitle: function(node) { // True for an H1/H2 whose text similarity to this._articleTitle exceeds 0.75.
2277 |     if (node.tagName != "H1" && node.tagName != "H2") {
2278 |       return false;
2279 |     }
2280 |     var heading = this._getInnerText(node, false); // false: trimmed text without whitespace normalization
2281 |     this.log("Evaluating similarity of header:", heading, this._articleTitle);
2282 |     return this._textSimilarity(this._articleTitle, heading) > 0.75;
2283 |   },
2284 | 
2285 |   _flagIsActive: function(flag) { // Tests flag against the this._flags bit field.
2286 |     return (this._flags & flag) > 0; // true when the bitwise AND of the two is positive
2287 |   },
2288 | 
2289 |   _removeFlag: function(flag) { // Clears the bits of flag in this._flags.
2290 |     this._flags = this._flags & ~flag; // AND with the complement leaves all other bits untouched
2291 |   },
2292 | 
2293 |   _isProbablyVisible: function(node) { // Checks inline display and visibility styles, the hidden attribute and aria-hidden; stylesheet rules are not consulted.
2294 |     // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
2295 |     return (!node.style || node.style.display != "none")
2296 |       && (!node.style || node.style.visibility != "hidden")
2297 |       && !node.hasAttribute("hidden")
2298 |       //check for "fallback-image" so that wikimedia math images are displayed
2299 |       && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
2300 |   },
2301 | 
2302 |   /**
2303 |    * Runs readability.
2304 |    *
2305 |    * Workflow:
2306 |    *  1. Prep the document by removing script tags, css, etc.
2307 |    *  2. Build readability's DOM tree.
2308 |    *  3. Grab the article content from the current dom tree.
2309 |    *  4. Replace the current DOM tree with the new one.
2310 |    *  5. Read peacefully.
2311 |    *
2312 |    * @return void
2313 |    **/
2314 |   parse: function () { // Returns an article object, or null when _grabArticle finds no content; throws if the document exceeds _maxElemsToParse.
2315 |     // Avoid parsing too large documents, as per configuration option
2316 |     if (this._maxElemsToParse > 0) { // a value of 0 or less disables the size limit
2317 |       var numTags = this._doc.getElementsByTagName("*").length;
2318 |       if (numTags > this._maxElemsToParse) {
2319 |         throw new Error("Aborting parsing document; " + numTags + " elements found");
2320 |       }
2321 |     }
2322 | 
2323 |     // Unwrap image from noscript
2324 |     this._unwrapNoscriptImages(this._doc);
2325 | 
2326 |     // Extract JSON-LD metadata before removing scripts
2327 |     var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2328 | 
2329 |     // Remove script tags from the document.
2330 |     this._removeScripts(this._doc);
2331 | 
2332 |     this._prepDocument();
2333 | 
2334 |     var metadata = this._getArticleMetadata(jsonLd);
2335 |     this._articleTitle = metadata.title; // stored on the instance; _headerDuplicatesTitle reads it
2336 | 
2337 |     var articleContent = this._grabArticle();
2338 |     if (!articleContent)
2339 |       return null;
2340 | 
2341 |     this.log("Grabbed: " + articleContent.innerHTML);
2342 | 
2343 |     this._postProcessContent(articleContent);
2344 | 
2345 |     // If we haven't found an excerpt in the article's metadata, use the article's
2346 |     // first paragraph as the excerpt. This is used for displaying a preview of
2347 |     // the article's content.
2348 |     if (!metadata.excerpt) {
2349 |       var paragraphs = articleContent.getElementsByTagName("p");
2350 |       if (paragraphs.length > 0) {
2351 |         metadata.excerpt = paragraphs[0].textContent.trim();
2352 |       }
2353 |     }
2354 | 
2355 |     var textContent = articleContent.textContent;
2356 |     return {
2357 |       title: this._articleTitle,
2358 |       byline: metadata.byline || this._articleByline, // metadata wins; the instance value is the fallback
2359 |       dir: this._articleDir,
2360 |       lang: this._articleLang,
2361 |       content: this._serializer(articleContent),
2362 |       textContent: textContent,
2363 |       length: textContent.length, // length of the text content, not of the serialized content
2364 |       excerpt: metadata.excerpt,
2365 |       siteName: metadata.siteName || this._articleSiteName,
2366 |       publishedTime: metadata.publishedTime,
2367 |     };
2368 |   },
2369 | };
2370 | 
2371 | if (typeof module === "object") { // export only where a module object exists; otherwise nothing is exported here
2372 |   /* global module */
2373 |   module.exports = Readability;
2374 | }
2375 | /*
2376 |  * Copyright (c) 2010 Arc90 Inc
2377 |  *
2378 |  * Licensed under the Apache License, Version 2.0 (the "License");
2379 |  * you may not use this file except in compliance with the License.
2380 |  * You may obtain a copy of the License at
2381 |  *
2382 |  *     http://www.apache.org/licenses/LICENSE-2.0
2383 |  *
2384 |  * Unless required by applicable law or agreed to in writing, software
2385 |  * distributed under the License is distributed on an "AS IS" BASIS,
2386 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2387 |  * See the License for the specific language governing permissions and
2388 |  * limitations under the License.
2389 |  */
2390 | 
2391 | /*
2392 |  * This code is heavily based on Arc90's readability.js (1.7.1) script
2393 |  * available at: http://code.google.com/p/arc90labs-readability
2394 |  */
2395 | 
2396 | var REGEXPS = { // Patterns that isProbablyReaderable tests against a node's className and id.
2397 |   // NOTE: These two regular expressions are duplicated in
2398 |   // Readability.js. Please keep both copies in sync.
2399 |   unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
2400 |   okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, // a match here overrides an unlikelyCandidates match in isProbablyReaderable
2401 | };
2402 | 
2403 | function isNodeVisible(node) { // Default visibility check: inline display style, the hidden attribute and aria-hidden.
2404 |   // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
2405 |   return (!node.style || node.style.display != "none")
2406 |     && !node.hasAttribute("hidden")
2407 |     //check for "fallback-image" so that wikimedia math images are displayed
2408 |     && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
2409 | }
2410 | 
2411 | /**
2412 |  * Decides whether or not the document is reader-able without parsing the whole thing.
2413 |  * @param {Object} options Configuration object.
2414 |  * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable.
2415 |  * @param {number} [options.minScore=20] The minimum cumulative 'score' used to determine if the document is readerable.
2416 |  * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible.
2417 |  * @return {boolean} Whether or not we suspect Readability.parse() will succeed at returning an article object.
2418 |  */
2419 | function isProbablyReaderable(doc, options = {}) {
2420 |   // For backward compatibility reasons 'options' can either be a configuration object or the function used
2421 |   // to determine if a node is visible.
2422 |   if (typeof options == "function") {
2423 |     options = { visibilityChecker: options };
2424 |   }
2425 | 
2426 |   var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: isNodeVisible };
2427 |   options = Object.assign(defaultOptions, options); // caller-supplied keys overwrite the defaults; the caller's object is not mutated
2428 | 
2429 |   var nodes = doc.querySelectorAll("p, pre, article");
2430 | 
2431 |   // Get <div> nodes which have <br> node(s) and append them into the \`nodes\` variable.
2432 |   // Some articles' DOM structures might look like
2433 |   // <div>
2434 |   //   Sentences<br>
2435 |   //   <br>
2436 |   //   Sentences<br>
2437 |   // </div>
2438 |   var brNodes = doc.querySelectorAll("div > br");
2439 |   if (brNodes.length) {
2440 |     var set = new Set(nodes); // a Set, so a div with several br children is added only once
2441 |     [].forEach.call(brNodes, function (node) {
2442 |       set.add(node.parentNode);
2443 |     });
2444 |     nodes = Array.from(set);
2445 |   }
2446 | 
2447 |   var score = 0;
2448 |   // This is a little cheeky, we use the accumulator 'score' to decide what to return from
2449 |   // this callback:
2450 |   return [].some.call(nodes, function (node) { // some() stops at the first node that pushes score past minScore
2451 |     if (!options.visibilityChecker(node)) {
2452 |       return false;
2453 |     }
2454 | 
2455 |     var matchString = node.className + " " + node.id;
2456 |     if (REGEXPS.unlikelyCandidates.test(matchString) &&
2457 |         !REGEXPS.okMaybeItsACandidate.test(matchString)) {
2458 |       return false;
2459 |     }
2460 | 
2461 |     if (node.matches("li p")) { // paragraphs nested inside list items do not count
2462 |       return false;
2463 |     }
2464 | 
2465 |     var textContentLength = node.textContent.trim().length;
2466 |     if (textContentLength < options.minContentLength) {
2467 |       return false;
2468 |     }
2469 | 
2470 |     score += Math.sqrt(textContentLength - options.minContentLength); // only the length beyond the minimum contributes, with diminishing returns
2471 | 
2472 |     if (score > options.minScore) {
2473 |       return true;
2474 |     }
2475 |     return false;
2476 |   });
2477 | }
2478 | 
2479 | if (typeof module === "object") { // export only where a module object exists; otherwise nothing is exported here
2480 |   /* global module */
2481 |   module.exports = isProbablyReaderable;
2482 | }
2483 | `;


--------------------------------------------------------------------------------
/src/third_party/turndown-client/LISENCE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Dom Christie
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/third_party/turndown-client/README.md:
--------------------------------------------------------------------------------
1 | # Turndown
2 | 
3 | - These are the build artifacts from <https://github.com/mixmark-io/turndown>.
4 | - The code remains under the license of the upstream project.
5 | 


--------------------------------------------------------------------------------
/src/third_party/turndown-client/turndown-plugin-gfm.ts:
--------------------------------------------------------------------------------
  1 | export const turndownPluginGfmJsBundle=`var turndownPluginGfm = (function (exports) {
  2 | 'use strict';
  3 | 
  4 | var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/;
  5 | // Plugin: turns a DIV whose class matches highlightRegExp and whose first child is a PRE into a fenced code block.
  6 | function highlightedCodeBlock (turndownService) {
  7 |   turndownService.addRule('highlightedCodeBlock', {
  8 |     filter: function (node) {
  9 |       var firstChild = node.firstChild;
 10 |       return (
 11 |         node.nodeName === 'DIV' &&
 12 |         highlightRegExp.test(node.className) &&
 13 |         firstChild &&
 14 |         firstChild.nodeName === 'PRE'
 15 |       )
 16 |     },
 17 |     replacement: function (content, node, options) {
 18 |       var className = node.className || '';
 19 |       var language = (className.match(highlightRegExp) || [null, ''])[1];
 20 |       // The class suffix captured by highlightRegExp becomes the language tag after the opening fence.
 21 |       return (
 22 |         '\\n\\n' + options.fence + language + '\\n' +
 23 |         node.firstChild.textContent +
 24 |         '\\n' + options.fence + '\\n\\n'
 25 |       )
 26 |     }
 27 |   });
 28 | }
 29 | // Plugin: renders del, s and strike elements as their content wrapped in single tildes.
 30 | function strikethrough (turndownService) {
 31 |   turndownService.addRule('strikethrough', {
 32 |     filter: ['del', 's', 'strike'],
 33 |     replacement: function (content) {
 34 |       return '~' + content + '~'
 35 |     }
 36 |   });
 37 | }
 38 | 
 39 | var indexOf = Array.prototype.indexOf;
 40 | var every = Array.prototype.every;
 41 | var rules = {};
 42 | // th and td cells are rendered through cell(), which adds the pipe delimiters.
 43 | rules.tableCell = {
 44 |   filter: ['th', 'td'],
 45 |   replacement: function (content, node) {
 46 |     return cell(content, node)
 47 |   }
 48 | };
 49 | // Renders a tr; a heading row is followed by a delimiter row built from each cell's align attribute.
 50 | rules.tableRow = {
 51 |   filter: 'tr',
 52 |   replacement: function (content, node) {
 53 |     var borderCells = '';
 54 |     var alignMap = { left: ':--', right: '--:', center: ':-:' };
 55 | 
 56 |     if (isHeadingRow(node)) {
 57 |       for (var i = 0; i < node.childNodes.length; i++) {
 58 |         var border = '---';
 59 |         var align = (
 60 |           node.childNodes[i].getAttribute('align') || ''
 61 |         ).toLowerCase();
 62 |         // Align values missing from alignMap keep the plain '---' border.
 63 |         if (align) border = alignMap[align] || border;
 64 | 
 65 |         borderCells += cell(border, node.childNodes[i]);
 66 |       }
 67 |     }
 68 |     return '\\n' + content + (borderCells ? '\\n' + borderCells : '')
 69 |   }
 70 | };
 71 | 
 72 | rules.table = {
 73 |   // Only convert tables with a heading row.
 74 |   // Tables with no heading row are kept using \`keep\` (see below).
 75 |   filter: function (node) {
 76 |     return node.nodeName === 'TABLE' && isHeadingRow(node.rows[0])
 77 |   },
 78 | 
 79 |   replacement: function (content) {
 80 |     // Ensure there are no blank lines
 81 |     content = content.replace('\\n\\n', '\\n');
 82 |     return '\\n\\n' + content + '\\n\\n'
 83 |   }
 84 | };
 85 | // thead, tbody and tfoot add no markup of their own; their rendered rows pass through unchanged.
 86 | rules.tableSection = {
 87 |   filter: ['thead', 'tbody', 'tfoot'],
 88 |   replacement: function (content) {
 89 |     return content
 90 |   }
 91 | };
 92 | 
 93 | // A tr is a heading row if:
 94 | // - the parent is a THEAD
 95 | // - or if it's the first child of the TABLE or the first TBODY (possibly
 96 | //   following a blank THEAD)
 97 | // - and every cell is a TH
 98 | function isHeadingRow (tr) {
 99 |   var parentNode = tr.parentNode;
100 |   return (
101 |     parentNode.nodeName === 'THEAD' ||
102 |     (
103 |       parentNode.firstChild === tr &&
104 |       (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) &&
105 |       every.call(tr.childNodes, function (n) { return n.nodeName === 'TH' })
106 |     )
107 |   )
108 | }
109 | // True for a TBODY that has no previous sibling or directly follows a THEAD whose text is empty or whitespace only.
110 | function isFirstTbody (element) {
111 |   var previousSibling = element.previousSibling;
112 |   return (
113 |     element.nodeName === 'TBODY' && (
114 |       !previousSibling ||
115 |       (
116 |         previousSibling.nodeName === 'THEAD' &&
117 |         /^\\s*\$/i.test(previousSibling.textContent)
118 |       )
119 |     )
120 |   )
121 | }
122 | // Formats one table cell: the first child of its parent gets a leading pipe, every cell gets a trailing pipe.
123 | function cell (content, node) {
124 |   var index = indexOf.call(node.parentNode.childNodes, node);
125 |   var prefix = ' ';
126 |   if (index === 0) prefix = '| ';
127 |   return prefix + content + ' |'
128 | }
129 | // Plugin: passes tables whose first row is not a heading row to turndownService.keep and registers every table rule.
130 | function tables (turndownService) {
131 |   turndownService.keep(function (node) {
132 |     return node.nodeName === 'TABLE' && !isHeadingRow(node.rows[0])
133 |   });
134 |   for (var key in rules) turndownService.addRule(key, rules[key]);
135 | }
136 | // Plugin: renders a node of type checkbox whose parent is an LI as '[x] ' when checked and '[ ] ' otherwise.
137 | function taskListItems (turndownService) {
138 |   turndownService.addRule('taskListItems', {
139 |     filter: function (node) {
140 |       return node.type === 'checkbox' && node.parentNode.nodeName === 'LI'
141 |     },
142 |     replacement: function (content, node) {
143 |       return (node.checked ? '[x]' : '[ ]') + ' '
144 |     }
145 |   });
146 | }
147 | // Plugin bundle: installs highlightedCodeBlock, strikethrough, tables and taskListItems in one call.
148 | function gfm (turndownService) {
149 |   turndownService.use([
150 |     highlightedCodeBlock,
151 |     strikethrough,
152 |     tables,
153 |     taskListItems
154 |   ]);
155 | }
156 | 
157 | exports.gfm = gfm;
158 | exports.highlightedCodeBlock = highlightedCodeBlock;
159 | exports.strikethrough = strikethrough;
160 | exports.tables = tables;
161 | exports.taskListItems = taskListItems;
162 | 
163 | return exports;
164 | 
165 | }({}));
166 | `;


--------------------------------------------------------------------------------
/src/third_party/turndown-client/turndown.ts:
--------------------------------------------------------------------------------
  1 | // https://unpkg.com/turndown/dist/turndown.js
  2 | export const turndownJsBundle = `var TurndownService = (function () {
  3 |   'use strict';
  4 |   // Copies the own enumerable properties of every argument after the first onto destination and returns destination.
  5 |   function extend (destination) {
  6 |     for (var i = 1; i < arguments.length; i++) {
  7 |       var source = arguments[i];
  8 |       for (var key in source) {
  9 |         if (source.hasOwnProperty(key)) destination[key] = source[key];
 10 |       }
 11 |     }
 12 |     return destination
 13 |   }
 14 |   // Returns character repeated count times (joins count + 1 empty slots with character).
 15 |   function repeat (character, count) {
 16 |     return Array(count + 1).join(character)
 17 |   }
 18 | 
 19 |   function trimLeadingNewlines (string) {
 20 |     return string.replace(/^\\n*/, '')
 21 |   }
 22 | 
 23 |   function trimTrailingNewlines (string) {
 24 |     // avoid match-at-end regexp bottleneck, see #370
 25 |     var indexEnd = string.length;
 26 |     while (indexEnd > 0 && string[indexEnd - 1] === '\\n') indexEnd--;
 27 |     return string.substring(0, indexEnd)
 28 |   }
 29 | 
 30 |   var blockElements = [
 31 |     'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO', 'BLOCKQUOTE', 'BODY', 'CANVAS',
 32 |     'CENTER', 'DD', 'DIR', 'DIV', 'DL', 'DT', 'FIELDSET', 'FIGCAPTION', 'FIGURE',
 33 |     'FOOTER', 'FORM', 'FRAMESET', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER',
 34 |     'HGROUP', 'HR', 'HTML', 'ISINDEX', 'LI', 'MAIN', 'MENU', 'NAV', 'NOFRAMES',
 35 |     'NOSCRIPT', 'OL', 'OUTPUT', 'P', 'PRE', 'SECTION', 'TABLE', 'TBODY', 'TD',
 36 |     'TFOOT', 'TH', 'THEAD', 'TR', 'UL'
 37 |   ];
 38 | 
 39 |   function isBlock (node) {
 40 |     return is(node, blockElements)
 41 |   }
 42 | 
 43 |   var voidElements = [
 44 |     'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT',
 45 |     'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR'
 46 |   ];
 47 | 
 48 |   function isVoid (node) {
 49 |     return is(node, voidElements)
 50 |   }
 51 | 
 52 |   function hasVoid (node) {
 53 |     return has(node, voidElements)
 54 |   }
 55 | 
 56 |   var meaningfulWhenBlankElements = [
 57 |     'A', 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TH', 'TD', 'IFRAME', 'SCRIPT',
 58 |     'AUDIO', 'VIDEO'
 59 |   ];
 60 | 
 61 |   function isMeaningfulWhenBlank (node) {
 62 |     return is(node, meaningfulWhenBlankElements)
 63 |   }
 64 | 
 65 |   function hasMeaningfulWhenBlank (node) {
 66 |     return has(node, meaningfulWhenBlankElements)
 67 |   }
 68 |   // True when node.nodeName is one of tagNames.
 69 |   function is (node, tagNames) {
 70 |     return tagNames.indexOf(node.nodeName) >= 0
 71 |   }
 72 |   // Truthy when getElementsByTagName finds an element for any of tagNames; falsy for nodes without that method.
 73 |   function has (node, tagNames) {
 74 |     return (
 75 |       node.getElementsByTagName &&
 76 |       tagNames.some(function (tagName) {
 77 |         return node.getElementsByTagName(tagName).length
 78 |       })
 79 |     )
 80 |   }
 81 | 
 82 |   var rules = {};
 83 | 
 84 |   rules.paragraph = {
 85 |     filter: 'p',
 86 | 
 87 |     replacement: function (content) {
 88 |       return '\\n\\n' + content + '\\n\\n'
 89 |     }
 90 |   };
 91 | 
 92 |   rules.lineBreak = {
 93 |     filter: 'br',
 94 | 
 95 |     replacement: function (content, node, options) {
 96 |       return options.br + '\\n'
 97 |     }
 98 |   };
 99 | 
100 |   rules.heading = {
101 |     filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
102 | 
103 |     replacement: function (content, node, options) {
104 |       var hLevel = Number(node.nodeName.charAt(1));
105 | 
106 |       if (options.headingStyle === 'setext' && hLevel < 3) {
107 |         var underline = repeat((hLevel === 1 ? '=' : '-'), content.length);
108 |         return (
109 |           '\\n\\n' + content + '\\n' + underline + '\\n\\n'
110 |         )
111 |       } else {
112 |         return '\\n\\n' + repeat('#', hLevel) + ' ' + content + '\\n\\n'
113 |       }
114 |     }
115 |   };
116 | 
117 |   rules.blockquote = {
118 |     filter: 'blockquote',
119 | 
120 |     replacement: function (content) {
121 |       content = content.replace(/^\\n+|\\n+\$/g, '');
122 |       content = content.replace(/^/gm, '> ');
123 |       return '\\n\\n' + content + '\\n\\n'
124 |     }
125 |   };
126 | 
127 |   rules.list = {
128 |     filter: ['ul', 'ol'],
129 | 
130 |     replacement: function (content, node) {
131 |       var parent = node.parentNode;
132 |       if (parent.nodeName === 'LI' && parent.lastElementChild === node) {
133 |         return '\\n' + content
134 |       } else {
135 |         return '\\n\\n' + content + '\\n\\n'
136 |       }
137 |     }
138 |   };
139 |   // Renders an li: continuation lines are indented four spaces and the marker depends on the parent list type.
140 |   rules.listItem = {
141 |     filter: 'li',
142 | 
143 |     replacement: function (content, node, options) {
144 |       content = content
145 |         .replace(/^\\n+/, '') // remove leading newlines
146 |         .replace(/\\n+\$/, '\\n') // replace trailing newlines with just a single one
147 |         .replace(/\\n/gm, '\\n    '); // indent
148 |       var prefix = options.bulletListMarker + '   ';
149 |       var parent = node.parentNode;
150 |       if (parent.nodeName === 'OL') { // ordered items are numbered from the start attribute, or from 1 without it
151 |         var start = parent.getAttribute('start');
152 |         var index = Array.prototype.indexOf.call(parent.children, node);
153 |         prefix = (start ? Number(start) + index : index + 1) + '.  ';
154 |       }
155 |       return (
156 |         prefix + content + (node.nextSibling && !/\\n\$/.test(content) ? '\\n' : '')
157 |       )
158 |     }
159 |   };
160 | 
161 |   rules.indentedCodeBlock = {
162 |     filter: function (node, options) {
163 |       return (
164 |         options.codeBlockStyle === 'indented' &&
165 |         node.nodeName === 'PRE' &&
166 |         node.firstChild &&
167 |         node.firstChild.nodeName === 'CODE'
168 |       )
169 |     },
170 | 
171 |     replacement: function (content, node, options) {
172 |       return (
173 |         '\\n\\n    ' +
174 |         node.firstChild.textContent.replace(/\\n/g, '\\n    ') +
175 |         '\\n\\n'
176 |       )
177 |     }
178 |   };
179 |   // Renders a PRE whose first child is a CODE as a fenced block when codeBlockStyle is 'fenced'.
180 |   rules.fencedCodeBlock = {
181 |     filter: function (node, options) {
182 |       return (
183 |         options.codeBlockStyle === 'fenced' &&
184 |         node.nodeName === 'PRE' &&
185 |         node.firstChild &&
186 |         node.firstChild.nodeName === 'CODE'
187 |       )
188 |     },
189 | 
190 |     replacement: function (content, node, options) {
191 |       var className = node.firstChild.getAttribute('class') || '';
192 |       var language = (className.match(/language-(\\S+)/) || [null, ''])[1];
193 |       var code = node.firstChild.textContent;
194 |       // Grow the fence until it is longer than every run of fence characters that starts a line of the code.
195 |       var fenceChar = options.fence.charAt(0);
196 |       var fenceSize = 3;
197 |       var fenceInCodeRegex = new RegExp('^' + fenceChar + '{3,}', 'gm');
198 | 
199 |       var match;
200 |       while ((match = fenceInCodeRegex.exec(code))) {
201 |         if (match[0].length >= fenceSize) {
202 |           fenceSize = match[0].length + 1;
203 |         }
204 |       }
205 | 
206 |       var fence = repeat(fenceChar, fenceSize);
207 |       // One trailing newline of the code is dropped because the closing fence adds its own line break.
208 |       return (
209 |         '\\n\\n' + fence + language + '\\n' +
210 |         code.replace(/\\n\$/, '') +
211 |         '\\n' + fence + '\\n\\n'
212 |       )
213 |     }
214 |   };
215 | 
216 |   rules.horizontalRule = {
217 |     filter: 'hr',
218 | 
219 |     replacement: function (content, node, options) {
220 |       return '\\n\\n' + options.hr + '\\n\\n'
221 |     }
222 |   };
223 | 
224 |   rules.inlineLink = {
225 |     filter: function (node, options) {
226 |       return (
227 |         options.linkStyle === 'inlined' &&
228 |         node.nodeName === 'A' &&
229 |         node.getAttribute('href')
230 |       )
231 |     },
232 | 
233 |     replacement: function (content, node) {
234 |       var href = node.getAttribute('href');
235 |       if (href) href = href.replace(/([()])/g, '\\\\\$1');
236 |       var title = cleanAttribute(node.getAttribute('title'));
237 |       if (title) title = ' "' + title.replace(/"/g, '\\\\"') + '"';
238 |       return '[' + content + '](' + href + title + ')'
239 |     }
240 |   };
241 | 
242 |   rules.referenceLink = {
243 |     filter: function (node, options) {
244 |       return (
245 |         options.linkStyle === 'referenced' &&
246 |         node.nodeName === 'A' &&
247 |         node.getAttribute('href')
248 |       )
249 |     },
250 | 
251 |     replacement: function (content, node, options) {
252 |       var href = node.getAttribute('href');
253 |       var title = cleanAttribute(node.getAttribute('title'));
254 |       if (title) title = ' "' + title + '"';
255 |       var replacement;
256 |       var reference;
257 | 
258 |       switch (options.linkReferenceStyle) {
259 |         case 'collapsed':
260 |           replacement = '[' + content + '][]';
261 |           reference = '[' + content + ']: ' + href + title;
262 |           break
263 |         case 'shortcut':
264 |           replacement = '[' + content + ']';
265 |           reference = '[' + content + ']: ' + href + title;
266 |           break
267 |         default:
268 |           var id = this.references.length + 1;
269 |           replacement = '[' + content + '][' + id + ']';
270 |           reference = '[' + id + ']: ' + href + title;
271 |       }
272 | 
273 |       this.references.push(reference);
274 |       return replacement
275 |     },
276 | 
277 |     references: [],
278 | 
279 |     append: function (options) {
280 |       var references = '';
281 |       if (this.references.length) {
282 |         references = '\\n\\n' + this.references.join('\\n') + '\\n\\n';
283 |         this.references = []; // Reset references
284 |       }
285 |       return references
286 |     }
287 |   };
288 | 
289 |   rules.emphasis = {
290 |     filter: ['em', 'i'],
291 | 
292 |     replacement: function (content, node, options) {
293 |       if (!content.trim()) return ''
294 |       return options.emDelimiter + content + options.emDelimiter
295 |     }
296 |   };
297 | 
298 |   rules.strong = {
299 |     filter: ['strong', 'b'],
300 | 
301 |     replacement: function (content, node, options) {
302 |       if (!content.trim()) return ''
303 |       return options.strongDelimiter + content + options.strongDelimiter
304 |     }
305 |   };
306 |   // Renders a CODE element as an inline code span unless it is the only child of a PRE.
307 |   rules.code = {
308 |     filter: function (node) {
309 |       var hasSiblings = node.previousSibling || node.nextSibling;
310 |       var isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;
311 | 
312 |       return node.nodeName === 'CODE' && !isCodeBlock
313 |     },
314 | 
315 |     replacement: function (content) {
316 |       if (!content) return ''
317 |       content = content.replace(/\\r?\\n|\\r/g, ' ');
318 |       // Pad with a space when the content touches a delimiter character or is space-wrapped; lengthen the delimiter until no run in the content equals it.
319 |       var extraSpace = /^\`|^ .*?[^ ].* \$|\`\$/.test(content) ? ' ' : '';
320 |       var delimiter = '\`';
321 |       var matches = content.match(/\`+/gm) || [];
322 |       while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '\`';
323 | 
324 |       return delimiter + extraSpace + content + extraSpace + delimiter
325 |     }
326 |   };
327 | 
328 |   rules.image = {
329 |     filter: 'img',
330 | 
331 |     replacement: function (content, node) {
332 |       var alt = cleanAttribute(node.getAttribute('alt'));
333 |       var src = node.getAttribute('src') || '';
334 |       var title = cleanAttribute(node.getAttribute('title'));
335 |       var titlePart = title ? ' "' + title + '"' : '';
336 |       return src ? '![' + alt + ']' + '(' + src + titlePart + ')' : ''
337 |     }
338 |   };
339 |   // Collapses each run of newlines (plus any whitespace following them) to a single newline; a missing value becomes ''.
340 |   function cleanAttribute (attribute) {
341 |     return attribute ? attribute.replace(/(\\n+\\s*)+/g, '\\n') : ''
342 |   }
343 | 
344 |   /**
345 |    * Manages a collection of rules used to convert HTML to Markdown
346 |    */
347 | 
348 |   function Rules (options) {
349 |     this.options = options;
350 |     this._keep = [];
351 |     this._remove = [];
352 | 
353 |     this.blankRule = {
354 |       replacement: options.blankReplacement
355 |     };
356 | 
357 |     this.keepReplacement = options.keepReplacement;
358 | 
359 |     this.defaultRule = {
360 |       replacement: options.defaultReplacement
361 |     };
362 | 
363 |     this.array = [];
364 |     for (var key in options.rules) this.array.push(options.rules[key]);
365 |   }
366 |   // Registry operations. add, keep and remove all unshift, so the most recently registered entry is tested first.
367 |   Rules.prototype = {
368 |     add: function (key, rule) {
369 |       this.array.unshift(rule);
370 |     },
371 | 
372 |     keep: function (filter) {
373 |       this._keep.unshift({
374 |         filter: filter,
375 |         replacement: this.keepReplacement
376 |       });
377 |     },
378 | 
379 |     remove: function (filter) {
380 |       this._remove.unshift({
381 |         filter: filter,
382 |         replacement: function () {
383 |           return ''
384 |         }
385 |       });
386 |     },
387 |     // Picks the rule for a node: blank rule first, then converter rules, keep filters, remove filters, and finally the default rule.
388 |     forNode: function (node) {
389 |       if (node.isBlank) return this.blankRule
390 |       var rule;
391 | 
392 |       if ((rule = findRule(this.array, node, this.options))) return rule
393 |       if ((rule = findRule(this._keep, node, this.options))) return rule
394 |       if ((rule = findRule(this._remove, node, this.options))) return rule
395 | 
396 |       return this.defaultRule
397 |     },
398 |     // Calls fn(rule, index) for each converter rule; keep and remove entries are not visited.
399 |     forEach: function (fn) {
400 |       for (var i = 0; i < this.array.length; i++) fn(this.array[i], i);
401 |     }
402 |   };
403 |   // Returns the first rule in rules whose filter matches node, or undefined when none matches.
404 |   function findRule (rules, node, options) {
405 |     for (var i = 0; i < rules.length; i++) {
406 |       var rule = rules[i];
407 |       if (filterValue(rule, node, options)) return rule
408 |     }
409 |     return void 0
410 |   }
411 |   // Tests rule.filter against node: a tag name, a list of tag names, or a predicate called with (node, options). Any other filter type throws a TypeError.
412 |   function filterValue (rule, node, options) {
413 |     var filter = rule.filter;
414 |     if (typeof filter === 'string') {
415 |       if (filter === node.nodeName.toLowerCase()) return true
416 |     } else if (Array.isArray(filter)) {
417 |       if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true
418 |     } else if (typeof filter === 'function') {
419 |       if (filter.call(rule, node, options)) return true
420 |     } else {
421 |       throw new TypeError('\`filter\` needs to be a string, array, or function')
422 |     }
423 |   }
424 | 
425 |   /**
426 |    * The collapseWhitespace function is adapted from collapse-whitespace
427 |    * by Luc Thevenard.
428 |    *
429 |    * The MIT License (MIT)
430 |    *
431 |    * Copyright (c) 2014 Luc Thevenard <lucthevenard@gmail.com>
432 |    *
433 |    * Permission is hereby granted, free of charge, to any person obtaining a copy
434 |    * of this software and associated documentation files (the "Software"), to deal
435 |    * in the Software without restriction, including without limitation the rights
436 |    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
437 |    * copies of the Software, and to permit persons to whom the Software is
438 |    * furnished to do so, subject to the following conditions:
439 |    *
440 |    * The above copyright notice and this permission notice shall be included in
441 |    * all copies or substantial portions of the Software.
442 |    *
443 |    * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
444 |    * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
445 |    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
446 |    * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
447 |    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
448 |    * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
449 |    * THE SOFTWARE.
450 |    */
451 | 
452 |   /**
453 |    * collapseWhitespace(options) removes extraneous whitespace from the given element.
454 |    *
455 |    * @param {Object} options
456 |    */
457 |   function collapseWhitespace (options) {
458 |     var element = options.element;
459 |     var isBlock = options.isBlock;
460 |     var isVoid = options.isVoid;
461 |     var isPre = options.isPre || function (node) {
462 |       return node.nodeName === 'PRE'
463 |     };
464 | 
465 |     if (!element.firstChild || isPre(element)) return
466 | 
467 |     var prevText = null;
468 |     var keepLeadingWs = false;
469 | 
470 |     var prev = null;
471 |     var node = next(prev, element, isPre);
472 | 
473 |     while (node !== element) {
474 |       if (node.nodeType === 3 || node.nodeType === 4) { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE
475 |         var text = node.data.replace(/[ \\r\\n\\t]+/g, ' ');
476 | 
477 |         if ((!prevText || / \$/.test(prevText.data)) &&
478 |             !keepLeadingWs && text[0] === ' ') {
479 |           text = text.substr(1);
480 |         }
481 | 
482 |         // \`text\` might be empty at this point.
483 |         if (!text) {
484 |           node = remove(node);
485 |           continue
486 |         }
487 | 
488 |         node.data = text;
489 | 
490 |         prevText = node;
491 |       } else if (node.nodeType === 1) { // Node.ELEMENT_NODE
492 |         if (isBlock(node) || node.nodeName === 'BR') {
493 |           if (prevText) {
494 |             prevText.data = prevText.data.replace(/ \$/, '');
495 |           }
496 | 
497 |           prevText = null;
498 |           keepLeadingWs = false;
499 |         } else if (isVoid(node) || isPre(node)) {
500 |           // Avoid trimming space around non-block, non-BR void elements and inline PRE.
501 |           prevText = null;
502 |           keepLeadingWs = true;
503 |         } else if (prevText) {
504 |           // Drop protection if set previously.
505 |           keepLeadingWs = false;
506 |         }
507 |       } else {
508 |         node = remove(node);
509 |         continue
510 |       }
511 | 
512 |       var nextNode = next(prev, node, isPre);
513 |       prev = node;
514 |       node = nextNode;
515 |     }
516 | 
517 |     if (prevText) {
518 |       prevText.data = prevText.data.replace(/ \$/, '');
519 |       if (!prevText.data) {
520 |         remove(prevText);
521 |       }
522 |     }
523 |   }
524 | 
525 |   /**
526 |    * remove(node) removes the given node from the DOM and returns the
527 |    * next node in the sequence.
528 |    *
529 |    * @param {Node} node
530 |    * @return {Node} node
531 |    */
532 |   function remove (node) {
533 |     var next = node.nextSibling || node.parentNode;
534 | 
535 |     node.parentNode.removeChild(node);
536 | 
537 |     return next
538 |   }
539 | 
540 |   /**
541 |    * next(prev, current, isPre) returns the next node in the sequence, given the
542 |    * current and previous nodes.
543 |    *
544 |    * @param {Node} prev
545 |    * @param {Node} current
546 |    * @param {Function} isPre
547 |    * @return {Node}
548 |    */
549 |   function next (prev, current, isPre) {
550 |     if ((prev && prev.parentNode === current) || isPre(current)) {
551 |       return current.nextSibling || current.parentNode
552 |     }
553 | 
554 |     return current.firstChild || current.nextSibling || current.parentNode
555 |   }
556 | 
557 |   /*
558 |    * Set up window for Node.js
559 |    */
560 | 
561 |   var root = (typeof window !== 'undefined' ? window : {});
562 | 
563 |   /*
564 |    * Parsing HTML strings
565 |    */
566 | 
567 |   function canParseHTMLNatively () {
568 |     var Parser = root.DOMParser;
569 |     var canParse = false;
570 | 
571 |     // Adapted from https://gist.github.com/1129031
572 |     // Firefox/Opera/IE throw errors on unsupported types
573 |     try {
574 |       // WebKit returns null on unsupported types
575 |       if (new Parser().parseFromString('', 'text/html')) {
576 |         canParse = true;
577 |       }
578 |     } catch (e) {}
579 | 
580 |     return canParse
581 |   }
582 | 
583 |   function createHTMLParser () {
584 |     var Parser = function () {};
585 | 
586 |     {
587 |       if (shouldUseActiveX()) {
588 |         Parser.prototype.parseFromString = function (string) {
589 |           var doc = new window.ActiveXObject('htmlfile');
590 |           doc.designMode = 'on'; // disable on-page scripts
591 |           doc.open();
592 |           doc.write(string);
593 |           doc.close();
594 |           return doc
595 |         };
596 |       } else {
597 |         Parser.prototype.parseFromString = function (string) {
598 |           var doc = document.implementation.createHTMLDocument('');
599 |           doc.open();
600 |           doc.write(string);
601 |           doc.close();
602 |           return doc
603 |         };
604 |       }
605 |     }
606 |     return Parser
607 |   }
608 | 
609 |   function shouldUseActiveX () {
610 |     var useActiveX = false;
611 |     try {
612 |       document.implementation.createHTMLDocument('').open();
613 |     } catch (e) {
614 |       if (root.ActiveXObject) useActiveX = true;
615 |     }
616 |     return useActiveX
617 |   }
618 | 
619 |   var HTMLParser = canParseHTMLNatively() ? root.DOMParser : createHTMLParser();
620 |   // Builds the root to convert: string input is parsed inside a wrapper element, node input is deep-cloned, and the result has its whitespace collapsed.
621 |   function RootNode (input, options) {
622 |     var root;
623 |     if (typeof input === 'string') {
624 |       var doc = htmlParser().parseFromString(
625 |         // DOM parsers arrange elements in the <head> and <body>.
626 |         // Wrapping in a custom element ensures elements are reliably arranged in
627 |         // a single element.
628 |         '<x-turndown id="turndown-root">' + input + '</x-turndown>',
629 |         'text/html'
630 |       );
631 |       root = doc.getElementById('turndown-root');
632 |     } else {
633 |       root = input.cloneNode(true);
634 |     }
635 |     collapseWhitespace({
636 |       element: root,
637 |       isBlock: isBlock,
638 |       isVoid: isVoid,
639 |       isPre: options.preformattedCode ? isPreOrCode : null
640 |     });
641 | 
642 |     return root
643 |   }
644 |   // Creates the HTMLParser instance on first use and returns the same instance afterwards.
645 |   var _htmlParser;
646 |   function htmlParser () {
647 |     _htmlParser = _htmlParser || new HTMLParser();
648 |     return _htmlParser
649 |   }
650 | 
651 |   function isPreOrCode (node) {
652 |     return node.nodeName === 'PRE' || node.nodeName === 'CODE'
653 |   }
654 |   // Decorates the given DOM node in place with isBlock, isCode, isBlank and flankingWhitespace, then returns it.
655 |   function Node (node, options) {
656 |     node.isBlock = isBlock(node);
657 |     node.isCode = node.nodeName === 'CODE' || node.parentNode.isCode;
658 |     node.isBlank = isBlank(node);
659 |     node.flankingWhitespace = flankingWhitespace(node, options);
660 |     return node
661 |   }
662 |   // A node is blank when its text is whitespace only and neither it nor any descendant is void or meaningful-when-blank.
663 |   function isBlank (node) {
664 |     return (
665 |       !isVoid(node) &&
666 |       !isMeaningfulWhenBlank(node) &&
667 |       /^\\s*\$/i.test(node.textContent) &&
668 |       !hasVoid(node) &&
669 |       !hasMeaningfulWhenBlank(node)
670 |     )
671 |   }
672 |   // Computes the node's leading and trailing edge whitespace; block nodes (and code nodes when preformattedCode is set) get none.
673 |   function flankingWhitespace (node, options) {
674 |     if (node.isBlock || (options.preformattedCode && node.isCode)) {
675 |       return { leading: '', trailing: '' }
676 |     }
677 | 
678 |     var edges = edgeWhitespace(node.textContent);
679 | 
680 |     // abandon leading ASCII WS if left-flanked by ASCII WS
681 |     if (edges.leadingAscii && isFlankedByWhitespace('left', node, options)) {
682 |       edges.leading = edges.leadingNonAscii;
683 |     }
684 | 
685 |     // abandon trailing ASCII WS if right-flanked by ASCII WS
686 |     if (edges.trailingAscii && isFlankedByWhitespace('right', node, options)) {
687 |       edges.trailing = edges.trailingNonAscii;
688 |     }
689 | 
690 |     return { leading: edges.leading, trailing: edges.trailing }
691 |   }
692 |   // Splits the whitespace at both ends of a string into its ASCII part (space, tab, CR, LF) and the remaining whitespace.
693 |   function edgeWhitespace (string) {
694 |     var m = string.match(/^(([ \\t\\r\\n]*)(\\s*))(?:(?=\\S)[\\s\\S]*\\S)?((\\s*?)([ \\t\\r\\n]*))\$/);
695 |     return {
696 |       leading: m[1], // whole string for whitespace-only strings
697 |       leadingAscii: m[2],
698 |       leadingNonAscii: m[3],
699 |       trailing: m[4], // empty for whitespace-only strings
700 |       trailingNonAscii: m[5],
701 |       trailingAscii: m[6]
702 |     }
703 |   }
704 |   // Reports whether the sibling on the given side ends (left) or starts (right) with a space; the result stays undefined when there is no sibling or it is a block element.
705 |   function isFlankedByWhitespace (side, node, options) {
706 |     var sibling;
707 |     var regExp;
708 |     var isFlanked;
709 | 
710 |     if (side === 'left') {
711 |       sibling = node.previousSibling;
712 |       regExp = / \$/;
713 |     } else {
714 |       sibling = node.nextSibling;
715 |       regExp = /^ /;
716 |     }
717 | 
718 |     if (sibling) {
719 |       if (sibling.nodeType === 3) { // text node: test its value directly
720 |         isFlanked = regExp.test(sibling.nodeValue);
721 |       } else if (options.preformattedCode && sibling.nodeName === 'CODE') {
722 |         isFlanked = false;
723 |       } else if (sibling.nodeType === 1 && !isBlock(sibling)) {
724 |         isFlanked = regExp.test(sibling.textContent);
725 |       }
726 |     }
727 |     return isFlanked
728 |   }
729 | 
730 |   var reduce = Array.prototype.reduce;
731 |   var escapes = [
732 |     [/\\\\/g, '\\\\\\\\'],
733 |     [/\\*/g, '\\\\*'],
734 |     [/^-/g, '\\\\-'],
735 |     [/^\\+ /g, '\\\\+ '],
736 |     [/^(=+)/g, '\\\\\$1'],
737 |     [/^(#{1,6}) /g, '\\\\\$1 '],
738 |     [/\`/g, '\\\\\`'],
739 |     [/^~~~/g, '\\\\~~~'],
740 |     [/\\[/g, '\\\\['],
741 |     [/\\]/g, '\\\\]'],
742 |     [/^>/g, '\\\\>'],
743 |     [/_/g, '\\\\_'],
744 |     [/^(\\d+)\\. /g, '\$1\\\\. ']
745 |   ];
746 |   // Constructor, also callable without new: copies the caller's options over the defaults and builds the Rules registry from the result.
747 |   function TurndownService (options) {
748 |     if (!(this instanceof TurndownService)) return new TurndownService(options)
749 | 
750 |     var defaults = {
751 |       rules: rules,
752 |       headingStyle: 'setext',
753 |       hr: '* * *',
754 |       bulletListMarker: '*',
755 |       codeBlockStyle: 'indented',
756 |       fence: '\`\`\`',
757 |       emDelimiter: '_',
758 |       strongDelimiter: '**',
759 |       linkStyle: 'inlined',
760 |       linkReferenceStyle: 'full',
761 |       br: '  ',
762 |       preformattedCode: false,
763 |       blankReplacement: function (content, node) {
764 |         return node.isBlock ? '\\n\\n' : ''
765 |       },
766 |       keepReplacement: function (content, node) {
767 |         return node.isBlock ? '\\n\\n' + node.outerHTML + '\\n\\n' : node.outerHTML
768 |       },
769 |       defaultReplacement: function (content, node) {
770 |         return node.isBlock ? '\\n\\n' + content + '\\n\\n' : content
771 |       }
772 |     };
773 |     this.options = extend({}, defaults, options);
774 |     this.rules = new Rules(this.options);
775 |   }
776 | 
777 |   TurndownService.prototype = {
778 |     /**
779 |      * The entry point for converting a string or DOM node to Markdown
780 |      * @public
781 |      * @param {String|HTMLElement} input The string or DOM node to convert
782 |      * @returns A Markdown representation of the input
783 |      * @type String
784 |      */
785 | 
786 |     turndown: function (input) {
787 |       if (!canConvert(input)) { // anything canConvert refuses is reported as a TypeError
788 |         throw new TypeError(
789 |           input + ' is not a string, or an element/document/fragment node.'
790 |         )
791 |       }
792 | 
793 |       if (input === '') return '' // empty string: nothing to parse, skip building a RootNode
794 | 
795 |       var output = process.call(this, new RootNode(input, this.options));
796 |       return postProcess.call(this, output)
797 |     },
798 | 
799 |     /**
800 |      * Add one or more plugins
801 |      * @public
802 |      * @param {Function|Array} plugin The plugin or array of plugins to add
803 |      * @returns The Turndown instance for chaining
804 |      * @type Object
805 |      */
806 | 
807 |     use: function (plugin) {
808 |       if (Array.isArray(plugin)) {
809 |         for (var i = 0; i < plugin.length; i++) this.use(plugin[i]); // arrays are applied element by element, in order
810 |       } else if (typeof plugin === 'function') {
811 |         plugin(this); // a plugin is called with this instance as its only argument
812 |       } else {
813 |         throw new TypeError('plugin must be a Function or an Array of Functions')
814 |       }
815 |       return this
816 |     },
817 | 
818 |     /**
819 |      * Adds a rule
820 |      * @public
821 |      * @param {String} key The unique key of the rule
822 |      * @param {Object} rule The rule
823 |      * @returns The Turndown instance for chaining
824 |      * @type Object
825 |      */
826 | 
827 |     addRule: function (key, rule) {
828 |       this.rules.add(key, rule);
829 |       return this
830 |     },
831 | 
832 |     /**
833 |      * Keep a node (as HTML) that matches the filter
834 |      * @public
835 |      * @param {String|Array|Function} filter The filter selecting the nodes to keep
836 |      * @returns The Turndown instance for chaining
837 |      * @type Object
838 |      */
839 | 
840 |     keep: function (filter) {
841 |       this.rules.keep(filter);
842 |       return this
843 |     },
844 | 
845 |     /**
846 |      * Remove a node that matches the filter
847 |      * @public
848 |      * @param {String|Array|Function} filter The filter selecting the nodes to remove
849 |      * @returns The Turndown instance for chaining
850 |      * @type Object
851 |      */
852 | 
853 |     remove: function (filter) {
854 |       this.rules.remove(filter);
855 |       return this
856 |     },
857 | 
858 |     /**
859 |      * Escapes Markdown syntax
860 |      * @public
861 |      * @param {String} string The string to escape
862 |      * @returns A string with Markdown syntax escaped
863 |      * @type String
864 |      */
865 | 
866 |     escape: function (string) {
867 |       return escapes.reduce(function (accumulator, escape) { // each entry of escapes is a [pattern, replacement] pair, applied in array order
868 |         return accumulator.replace(escape[0], escape[1])
869 |       }, string)
870 |     }
871 |   };
872 | 
873 |   /**
874 |    * Reduces the child nodes of a DOM node down to one Markdown string
875 |    * @private
876 |    * @param {HTMLElement} parentNode The node whose childNodes are converted
877 |    * @returns The joined Markdown of all child nodes
878 |    * @type String
879 |    */
880 | 
881 |   function process (parentNode) {
882 |     var self = this;
883 |     return reduce.call(parentNode.childNodes, function (output, node) {
884 |       node = new Node(node, self.options);
885 | 
886 |       var replacement = ''; // node types other than 1 and 3 contribute nothing
887 |       if (node.nodeType === 3) { // text node
888 |         replacement = node.isCode ? node.nodeValue : self.escape(node.nodeValue); // text flagged isCode is passed through unescaped
889 |       } else if (node.nodeType === 1) { // element node
890 |         replacement = replacementForNode.call(self, node);
891 |       }
892 | 
893 |       return join(output, replacement)
894 |     }, '')
895 |   }
896 | 
897 |   /**
898 |    * Appends strings as each rule requires and trims the output
899 |    * @private
900 |    * @param {String} output The conversion output
901 |    * @returns A trimmed version of the output
902 |    * @type String
903 |    */
904 | 
905 |   function postProcess (output) {
906 |     var self = this;
907 |     this.rules.forEach(function (rule) {
908 |       if (typeof rule.append === 'function') { // only rules that define append add text here
909 |         output = join(output, rule.append(self.options));
910 |       }
911 |     });
912 | 
913 |     return output.replace(/^[\\t\\r\\n]+/, '').replace(/[\\t\\r\\n\\s]+\$/, '') // leading tabs and line breaks are dropped but leading spaces stay; all trailing whitespace is dropped
914 |   }
915 | 
916 |   /**
917 |    * Converts an element node to its Markdown equivalent
918 |    * @private
919 |    * @param {HTMLElement} node The node to convert
920 |    * @returns A Markdown representation of the node
921 |    * @type String
922 |    */
923 | 
924 |   function replacementForNode (node) {
925 |     var rule = this.rules.forNode(node);
926 |     var content = process.call(this, node); // children are converted first, so the rule receives finished Markdown
927 |     var whitespace = node.flankingWhitespace;
928 |     if (whitespace.leading || whitespace.trailing) content = content.trim(); // flanking whitespace is re-attached outside the replacement below, so strip it from the content
929 |     return (
930 |       whitespace.leading +
931 |       rule.replacement(content, node, this.options) +
932 |       whitespace.trailing
933 |     )
934 |   }
935 | 
936 |   /**
937 |    * Joins replacement to the current output with appropriate number of new lines
938 |    * @private
939 |    * @param {String} output The current conversion output
940 |    * @param {String} replacement The string to append to the output
941 |    * @returns Joined output
942 |    * @type String
943 |    */
944 | 
945 |   function join (output, replacement) {
946 |     var s1 = trimTrailingNewlines(output);
947 |     var s2 = trimLeadingNewlines(replacement);
948 |     var nls = Math.max(output.length - s1.length, replacement.length - s2.length); // the larger of the two amounts the helpers trimmed off
949 |     var separator = '\\n\\n'.substring(0, nls); // never more than two newlines between the parts
950 | 
951 |     return s1 + separator + s2
952 |   }
953 | 
954 |   /**
955 |    * Determines whether an input can be converted
956 |    * @private
957 |    * @param {String|HTMLElement} input The value handed to turndown()
958 |    * @returns A truthy value when input is a string or a node whose nodeType is 1, 9 or 11; a falsy value otherwise
959 |    * @type Boolean, or the input's own falsy nodeType
960 |    */
961 | 
962 |   function canConvert (input) {
963 |     return (
964 |       input != null && ( // rejects both null and undefined
965 |         typeof input === 'string' ||
966 |         (input.nodeType && (
967 |           input.nodeType === 1 || input.nodeType === 9 || input.nodeType === 11 // element, document or document fragment
968 |         ))
969 |       )
970 |     )
971 |   }
972 | 
973 |   return TurndownService;
974 | 
975 | }());
976 | `;
977 | 


--------------------------------------------------------------------------------
/test/index.spec.ts:
--------------------------------------------------------------------------------
 1 | // test/index.spec.ts
 2 | import { env, createExecutionContext, waitOnExecutionContext, SELF } from 'cloudflare:test';
 3 | import { describe, it, expect } from 'vitest';
 4 | import worker from '../src/index';
 5 | // NOTE(review): both cases still assert the scaffold's "Hello World!" body; confirm that is what src/index.ts actually returns.
 6 | // For now, you'll need to do something like this to get a correctly-typed
 7 | // `Request` to pass to `worker.fetch()`.
 8 | const IncomingRequest = Request<unknown, IncomingRequestCfProperties>;
 9 | 
10 | describe('Hello World worker', () => {
11 | 	it('responds with Hello World! (unit style)', async () => {
12 | 		const request = new IncomingRequest('http://example.com');
13 | 		// Create an empty context to pass to `worker.fetch()`.
14 | 		const ctx = createExecutionContext();
15 | 		const response = await worker.fetch(request, env, ctx);
16 | 		// Wait for all `Promise`s passed to `ctx.waitUntil()` to settle before running test assertions
17 | 		await waitOnExecutionContext(ctx);
18 | 		expect(await response.text()).toMatchInlineSnapshot(`"Hello World!"`);
19 | 	});
20 | 
21 | 	it('responds with Hello World! (integration style)', async () => {
22 | 		// Goes through SELF.fetch, i.e. the worker is reached via its fetch binding rather than called directly.
23 | 		const response = await SELF.fetch('https://example.com');
24 | 		expect(await response.text()).toMatchInlineSnapshot(`"Hello World!"`);
25 | 	});
26 | });


--------------------------------------------------------------------------------
/test/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"extends": "../tsconfig.json",
3 | 	"compilerOptions": {
4 | 		"types": ["@cloudflare/workers-types/experimental", "@cloudflare/vitest-pool-workers"]
5 | 	},
6 | 	"include": ["./**/*.ts", "../worker-configuration.d.ts"] /* Env is declared in the wrangler-generated file at the repo root; the previously listed ../src/env.d.ts does not exist. */,
7 | 	"exclude": []
8 | }
9 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
  1 | {
  2 | 	"compilerOptions": {
  3 | 		/* Visit https://aka.ms/tsconfig.json to read more about this file */
  4 | 
  5 | 		/* Projects */
  6 | 		// "incremental": true,                              /* Enable incremental compilation */
  7 | 		// "composite": true,                                /* Enable constraints that allow a TypeScript project to be used with project references. */
  8 | 		// "tsBuildInfoFile": "./",                          /* Specify the folder for .tsbuildinfo incremental compilation files. */
  9 | 		// "disableSourceOfProjectReferenceRedirect": true,  /* Disable preferring source files instead of declaration files when referencing composite projects */
 10 | 		// "disableSolutionSearching": true,                 /* Opt a project out of multi-project reference checking when editing. */
 11 | 		// "disableReferencedProjectLoad": true,             /* Reduce the number of projects loaded automatically by TypeScript. */
 12 | 
 13 | 		/* Language and Environment */
 14 | 		"target": "es2021" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
 15 | 		"lib": ["es2021"] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
 16 | 		"jsx": "react" /* Specify what JSX code is generated. */,
 17 | 		// "experimentalDecorators": true,                   /* Enable experimental support for TC39 stage 2 draft decorators. */
 18 | 		// "emitDecoratorMetadata": true,                    /* Emit design-type metadata for decorated declarations in source files. */
 19 | 		// "jsxFactory": "",                                 /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h' */
 20 | 		// "jsxFragmentFactory": "",                         /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
 21 | 		// "jsxImportSource": "",                            /* Specify module specifier used to import the JSX factory functions when using `jsx: react-jsx*`. */
 22 | 		// "reactNamespace": "",                             /* Specify the object invoked for `createElement`. This only applies when targeting `react` JSX emit. */
 23 | 		// "noLib": true,                                    /* Disable including any library files, including the default lib.d.ts. */
 24 | 		// "useDefineForClassFields": true,                  /* Emit ECMAScript-standard-compliant class fields. */
 25 | 
 26 | 		/* Modules */
 27 | 		"module": "es2022" /* Specify what module code is generated. */,
 28 | 		// "rootDir": "./",                                  /* Specify the root folder within your source files. */
 29 | 		"moduleResolution": "Bundler" /* Specify how TypeScript looks up a file from a given module specifier. */,
 30 | 		// "baseUrl": "./",                                  /* Specify the base directory to resolve non-relative module names. */
 31 | 		// "paths": {},                                      /* Specify a set of entries that re-map imports to additional lookup locations. */
 32 | 		// "rootDirs": [],                                   /* Allow multiple folders to be treated as one when resolving modules. */
 33 | 		// "typeRoots": [],                                  /* Specify multiple folders that act like `./node_modules/@types`. */
 34 | 		"types": [
 35 | 			"@cloudflare/workers-types/2023-07-01"
 36 | 		] /* Specify type package names to be included without being referenced in a source file. */,
 37 | 		// "allowUmdGlobalAccess": true,                     /* Allow accessing UMD globals from modules. */
 38 | 		"resolveJsonModule": true /* Enable importing .json files */,
 39 | 		// "noResolve": true,                                /* Disallow `import`s, `require`s or `<reference>`s from expanding the number of files TypeScript should add to a project. */
 40 | 
 41 | 		/* JavaScript Support */
 42 | 		"allowJs": true /* Allow JavaScript files to be a part of your program. Use the `checkJS` option to get errors from these files. */,
 43 | 		"checkJs": false /* Enable error reporting in type-checked JavaScript files. */,
 44 | 		// "maxNodeModuleJsDepth": 1,                        /* Specify the maximum folder depth used for checking JavaScript files from `node_modules`. Only applicable with `allowJs`. */
 45 | 
 46 | 		/* Emit */
 47 | 		// "declaration": true,                              /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
 48 | 		// "declarationMap": true,                           /* Create sourcemaps for d.ts files. */
 49 | 		// "emitDeclarationOnly": true,                      /* Only output d.ts files and not JavaScript files. */
 50 | 		// "sourceMap": true,                                /* Create source map files for emitted JavaScript files. */
 51 | 		// "outFile": "./",                                  /* Specify a file that bundles all outputs into one JavaScript file. If `declaration` is true, also designates a file that bundles all .d.ts output. */
 52 | 		// "outDir": "./",                                   /* Specify an output folder for all emitted files. */
 53 | 		// "removeComments": true,                           /* Disable emitting comments. */
 54 | 		"noEmit": true /* Disable emitting files from a compilation. */,
 55 | 		// "importHelpers": true,                            /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
 56 | 		// "importsNotUsedAsValues": "remove",               /* Specify emit/checking behavior for imports that are only used for types */
 57 | 		// "downlevelIteration": true,                       /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
 58 | 		// "sourceRoot": "",                                 /* Specify the root path for debuggers to find the reference source code. */
 59 | 		// "mapRoot": "",                                    /* Specify the location where debugger should locate map files instead of generated locations. */
 60 | 		// "inlineSourceMap": true,                          /* Include sourcemap files inside the emitted JavaScript. */
 61 | 		// "inlineSources": true,                            /* Include source code in the sourcemaps inside the emitted JavaScript. */
 62 | 		// "emitBOM": true,                                  /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
 63 | 		// "newLine": "crlf",                                /* Set the newline character for emitting files. */
 64 | 		// "stripInternal": true,                            /* Disable emitting declarations that have `@internal` in their JSDoc comments. */
 65 | 		// "noEmitHelpers": true,                            /* Disable generating custom helper functions like `__extends` in compiled output. */
 66 | 		// "noEmitOnError": true,                            /* Disable emitting files if any type checking errors are reported. */
 67 | 		// "preserveConstEnums": true,                       /* Disable erasing `const enum` declarations in generated code. */
 68 | 		// "declarationDir": "./",                           /* Specify the output directory for generated declaration files. */
 69 | 		// "preserveValueImports": true,                     /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
 70 | 
 71 | 		/* Interop Constraints */
 72 | 		"isolatedModules": true /* Ensure that each file can be safely transpiled without relying on other imports. */,
 73 | 		"allowSyntheticDefaultImports": true /* Allow 'import x from y' when a module doesn't have a default export. */,
 74 | 		// "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */,
 75 | 		// "preserveSymlinks": true,                         /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
 76 | 		"forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
 77 | 
 78 | 		/* Type Checking */
 79 | 		"strict": true /* Enable all strict type-checking options. */,
 80 | 		// "noImplicitAny": true,                            /* Enable error reporting for expressions and declarations with an implied `any` type. */
 81 | 		// "strictNullChecks": true,                         /* When type checking, take into account `null` and `undefined`. */
 82 | 		// "strictFunctionTypes": true,                      /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
 83 | 		// "strictBindCallApply": true,                      /* Check that the arguments for `bind`, `call`, and `apply` methods match the original function. */
 84 | 		// "strictPropertyInitialization": true,             /* Check for class properties that are declared but not set in the constructor. */
 85 | 		// "noImplicitThis": true,                           /* Enable error reporting when `this` is given the type `any`. */
 86 | 		// "useUnknownInCatchVariables": true,               /* Type catch clause variables as 'unknown' instead of 'any'. */
 87 | 		// "alwaysStrict": true,                             /* Ensure 'use strict' is always emitted. */
 88 | 		// "noUnusedLocals": true,                           /* Enable error reporting when local variables aren't read. */
 89 | 		// "noUnusedParameters": true,                       /* Raise an error when a function parameter isn't read */
 90 | 		// "exactOptionalPropertyTypes": true,               /* Interpret optional property types as written, rather than adding 'undefined'. */
 91 | 		// "noImplicitReturns": true,                        /* Enable error reporting for codepaths that do not explicitly return in a function. */
 92 | 		// "noFallthroughCasesInSwitch": true,               /* Enable error reporting for fallthrough cases in switch statements. */
 93 | 		// "noUncheckedIndexedAccess": true,                 /* Include 'undefined' in index signature results */
 94 | 		// "noImplicitOverride": true,                       /* Ensure overriding members in derived classes are marked with an override modifier. */
 95 | 		// "noPropertyAccessFromIndexSignature": true,       /* Enforces using indexed accessors for keys declared using an indexed type */
 96 | 		// "allowUnusedLabels": true,                        /* Disable error reporting for unused labels. */
 97 | 		// "allowUnreachableCode": true,                     /* Disable error reporting for unreachable code. */
 98 | 
 99 | 		/* Completeness */
100 | 		// "skipDefaultLibCheck": true,                      /* Skip type checking .d.ts files that are included with TypeScript. */
101 | 		"skipLibCheck": true /* Skip type checking all .d.ts files. */
102 | 	},
103 | 	"exclude": ["test"]
104 | }
105 | 


--------------------------------------------------------------------------------
/vitest.config.ts:
--------------------------------------------------------------------------------
 1 | import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config';
 2 | // Points the Workers test pool at this project's wrangler.toml.
 3 | export default defineWorkersConfig({
 4 | 	test: {
 5 | 		poolOptions: {
 6 | 			workers: {
 7 | 				wrangler: { configPath: './wrangler.toml' },
 8 | 			},
 9 | 		},
10 | 	},
11 | });
12 | 


--------------------------------------------------------------------------------
/worker-configuration.d.ts:
--------------------------------------------------------------------------------
1 | // Generated by Wrangler on Tue Jun 11 2024 09:07:19 GMT+0900 (日本標準時)
2 | // by running `wrangler types`
3 | 
4 | interface Env {
5 | 	MYBROWSER: Fetcher;
6 | }
7 | 


--------------------------------------------------------------------------------
/wrangler.toml:
--------------------------------------------------------------------------------
  1 | #:schema node_modules/wrangler/config-schema.json
  2 | name = "cloudflare-dom-distiller"
  3 | main = "src/index.ts"
  4 | compatibility_date = "2024-06-05"
  5 | compatibility_flags = ["nodejs_compat"]
  6 | 
  7 | # Automatically place your workloads in an optimal location to minimize latency.
  8 | # If you are running back-end logic in a Worker, running it closer to your back-end infrastructure
  9 | # rather than the end user may result in better performance.
 10 | # Docs: https://developers.cloudflare.com/workers/configuration/smart-placement/#smart-placement
 11 | # [placement]
 12 | # mode = "smart"
 13 | 
 14 | # Variable bindings. These are arbitrary, plaintext strings (similar to environment variables)
 15 | # Docs:
 16 | # - https://developers.cloudflare.com/workers/wrangler/configuration/#environment-variables
 17 | # Note: Use secrets to store sensitive data.
 18 | # - https://developers.cloudflare.com/workers/configuration/secrets/
 19 | # [vars]
 20 | # MY_VARIABLE = "production_value"
 21 | 
 22 | # Bind the Workers AI model catalog. Run machine learning models, powered by serverless GPUs, on Cloudflare’s global network
 23 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#workers-ai
 24 | # [ai]
 25 | # binding = "AI"
 26 | 
 27 | # Bind an Analytics Engine dataset. Use Analytics Engine to write analytics within your Pages Function.
 28 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#analytics-engine-datasets
 29 | # [[analytics_engine_datasets]]
 30 | # binding = "MY_DATASET"
 31 | 
 32 | # Bind a headless browser instance running on Cloudflare's global network.
 33 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#browser-rendering
 34 | [browser]
 35 | binding = "MYBROWSER"
 36 | 
 37 | # Bind a D1 database. D1 is Cloudflare’s native serverless SQL database.
 38 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#d1-databases
 39 | # [[d1_databases]]
 40 | # binding = "MY_DB"
 41 | # database_name = "my-database"
 42 | # database_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 43 | 
 44 | # Bind a dispatch namespace. Use Workers for Platforms to deploy serverless functions programmatically on behalf of your customers.
 45 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#dispatch-namespace-bindings-workers-for-platforms
 46 | # [[dispatch_namespaces]]
 47 | # binding = "MY_DISPATCHER"
 48 | # namespace = "my-namespace"
 49 | 
 50 | # Bind a Durable Object. Durable objects are a scale-to-zero compute primitive based on the actor model.
 51 | # Durable Objects can live for as long as needed. Use these when you need a long-running "server", such as in realtime apps.
 52 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#durable-objects
 53 | # [[durable_objects.bindings]]
 54 | # name = "MY_DURABLE_OBJECT"
 55 | # class_name = "MyDurableObject"
 56 | 
 57 | # Durable Object migrations.
 58 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#migrations
 59 | # [[migrations]]
 60 | # tag = "v1"
 61 | # new_classes = ["MyDurableObject"]
 62 | 
 63 | # Bind a Hyperdrive configuration. Use to accelerate access to your existing databases from Cloudflare Workers.
 64 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#hyperdrive
 65 | # [[hyperdrive]]
 66 | # binding = "MY_HYPERDRIVE"
 67 | # id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 68 | 
 69 | # Bind a KV Namespace. Use KV as persistent storage for small key-value pairs.
 70 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#kv-namespaces
 71 | # [[kv_namespaces]]
 72 | # binding = "MY_KV_NAMESPACE"
 73 | # id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 74 | 
 75 | # Bind an mTLS certificate. Use to present a client certificate when communicating with another service.
 76 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#mtls-certificates
 77 | # [[mtls_certificates]]
 78 | # binding = "MY_CERTIFICATE"
 79 | # certificate_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 80 | 
 81 | # Bind a Queue producer. Use this binding to schedule an arbitrary task that may be processed later by a Queue consumer.
 82 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#queues
 83 | # [[queues.producers]]
 84 | # binding = "MY_QUEUE"
 85 | # queue = "my-queue"
 86 | 
 87 | # Bind a Queue consumer. Queue Consumers can retrieve tasks scheduled by Producers to act on them.
 88 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#queues
 89 | # [[queues.consumers]]
 90 | # queue = "my-queue"
 91 | 
 92 | # Bind an R2 Bucket. Use R2 to store arbitrarily large blobs of data, such as files.
 93 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#r2-buckets
 94 | # [[r2_buckets]]
 95 | # binding = "MY_BUCKET"
 96 | # bucket_name = "my-bucket"
 97 | 
 98 | # Bind another Worker service. Use this binding to call another Worker without network overhead.
 99 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#service-bindings
100 | # [[services]]
101 | # binding = "MY_SERVICE"
102 | # service = "my-service"
103 | 
104 | # Bind a Vectorize index. Use to store and query vector embeddings for semantic search, classification and other vector search use-cases.
105 | # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#vectorize-indexes
106 | # [[vectorize]]
107 | # binding = "MY_INDEX"
108 | # index_name = "my-index"
109 | 


--------------------------------------------------------------------------------