224 | * instead of [https://google.com](https://google.com)
225 | *
226 | * @default true
227 | */
228 | useInlineLinks?: boolean
229 | }
230 | ```
231 |
232 | ## Custom Translators
233 |
234 | Custom translators are an advanced option that allows certain elements to be handled in a specific way.
235 |
236 | These can be modified via the `NodeHtmlMarkdown#translators` property, or added during creation.
237 |
238 | __For detail on how to use them see__:
239 |
240 | - [translator.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/translator.ts) - Documentation for `TranslatorConfig`
241 | - [config.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/config.ts) - Translators in `defaultTranslators`
242 |
243 | The `NodeHtmlMarkdown#codeBlockTranslators` property is a collection of translators which handle elements within a `<pre><code>` block.
244 |
245 | ## Further improvements
246 |
247 | Being a performance-centric library, we're always interested in further improvements.
248 | There are several probable routes by which we could gain substantial performance increases over the current model.
249 |
250 | Such methods include:
251 |
252 | - Writing a custom parser
253 | - Integrating an async worker-thread based model for multi-threading
254 | - Fully replacing any remaining regex
255 |
256 | These would be fun to implement; however, for the time being, the present library is fast enough for my purposes. That
257 | said, I welcome discussion and any PR toward the effort of further improving performance, and I may ultimately do more
258 | work in that capacity in the future!
259 |
260 | ## Help Wanted!
261 |
262 | Looking to contribute? Check out our [help wanted] list for a good place to start!
263 |
264 |
265 | [help wanted]: https://github.com/crosstype/node-html-markdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22
266 |
--------------------------------------------------------------------------------
/benchmark/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013 Andreas Madsen
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark Tool
2 |
3 | Simple benchmark for different html to markdown compilers using real-life data.
4 |
5 | Based on: https://github.com/AndreasMadsen/htmlparser-benchmark
6 |
7 | ## Usage
8 |
9 | ```shell
10 | yarn run benchmark
11 | ```
12 |
--------------------------------------------------------------------------------
/benchmark/_run.js:
--------------------------------------------------------------------------------
1 | const Benchmark = require('./index.js');
2 | const ProgressBar = require('progress');
3 |
4 |
5 | /* ****************************************************************************************************************** *
6 | * Handlers
7 | * ****************************************************************************************************************** */
8 |
// Any uncaught exception in the runner is logged and reported as exit code 1
process.on('uncaughtException', (error) => {
  console.error(error);
  process.exit(1);
});
13 |
/**
 * Runs one benchmark when the parent process sends a wrapper descriptor.
 *
 * `item.parser` is a module path that is `require`d and handed to `Benchmark`.
 * Once the benchmark emits `result`, the aggregated numbers are sent back over
 * IPC and the process exits with code 0.
 */
process.on('message', (item) => {
  const progressBar = new ProgressBar('[:bar] :current / :total', {
    total: Benchmark.TOTAL,
    complete: '=',
    incomplete: ' ',
    width: 50,
    clear: true
  });

  const bench = new Benchmark(require(item.parser));

  // One tick per successfully processed file
  bench.on('progress', () => {
    progressBar.tick();
  });

  bench.once('result', (stat) => {
    const mean = stat.mean();
    const sd = stat.sd();

    process.send({
      mean,
      sd,
      totalFiles: Benchmark.TOTAL,
      avgFileSize: Benchmark.AVG_FILE_SIZE,
      avgBytesPerMs: Benchmark.AVG_FILE_SIZE / mean
    });
    process.exit(0);
  });
});
40 |
--------------------------------------------------------------------------------
/benchmark/execute.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const path = require('path');
3 | const async = require('async');
4 | const { fork } = require('child_process');
5 |
6 |
7 | /* ****************************************************************************************************************** */
8 | // region: Config / Const
9 | /* ****************************************************************************************************************** */
10 |
// `quick` as the first CLI argument selects quick mode (forwarded to the runner via QUICK_MODE)
const quickMode = process.argv[2] === 'quick';

// Every file in ./wrapper is one benchmark candidate, processed in name order
const wrapperDir = path.join(__dirname, 'wrapper');
const wrappers = fs
  .readdirSync(wrapperDir)
  .sort((left, right) => left.localeCompare(right))
  .map((filename) => {
    const name = path.basename(filename, '.js').replace('_reuse', ' (reused instance)');
    return { name, parser: path.join(wrapperDir, filename) };
  });

// Longest display name; used to align the result columns
const MAX_WIDTH = wrappers.reduce((widest, { name }) => Math.max(widest, name.length), -Infinity);

const SEPARATOR = `\n${'-'.repeat(MAX_WIDTH + 41)}\n`;
25 |
26 | // endregion
27 |
28 |
29 | /* ****************************************************************************************************************** */
30 | // region: Helpers
31 | /* ****************************************************************************************************************** */
32 |
/**
 * Right-pad a wrapper name with spaces up to MAX_WIDTH characters.
 * Names that are already MAX_WIDTH or longer are returned unchanged.
 */
function formatName(name) {
  const padding = MAX_WIDTH - name.length;
  return padding > 0 ? name + ' '.repeat(padding) : name;
}
39 |
/**
 * Format a byte count as a short human-readable string (e.g. `1.5 kB`).
 *
 * Uses 1024-based units from B up to TB; anything larger stays in TB.
 * Values are rounded to at most two decimals with trailing zeros dropped.
 *
 * @param {number} size - Size in bytes
 * @returns {string} Formatted size, or `'N/A'` for negative / non-finite input
 */
function humanFileSize(size) {
  const units = ['B', 'kB', 'MB', 'GB', 'TB'];
  const bytes = Number(size);

  if (!Number.isFinite(bytes) || bytes < 0) return 'N/A';

  // Repeated division (instead of a log ratio) keeps the unit index inside the
  // table for 0, sub-byte and >= 1024^5 inputs and avoids float rounding at
  // exact powers of 1024.
  let value = bytes;
  let unitIndex = 0;
  while (value >= 1024 && unitIndex < units.length - 1) {
    value /= 1024;
    unitIndex++;
  }

  return `${Number(value.toFixed(2))} ${units[unitIndex]}`;
}
44 |
/**
 * Render a duration given in seconds as text.
 *
 * - under one second: milliseconds (`500ms`)
 * - under one minute: seconds with two decimals (`1.50sec`)
 * - otherwise: `Xhr, Ymin, Zsec`, omitting hour/minute parts that are zero
 *
 * Returns `'N/A'` when the duration is not a finite number.
 */
function humanTime(seconds) {
  const hours = Math.floor(seconds / 3600);
  const afterHours = seconds - hours * 3600;
  const minutes = Math.floor(afterHours / 60);
  const remainder = afterHours - minutes * 60;

  if (![ hours, minutes, remainder ].every((part) => isFinite(part))) return 'N/A';

  if (hours || minutes) {
    const hourPart = hours ? hours + 'hr, ' : '';
    const minutePart = minutes ? minutes + 'min, ' : '';
    return `${hourPart}${minutePart}${Math.round(remainder)}sec`;
  }

  if (seconds < 1) return `${Math.round((remainder % 1) * 1000)}ms`;

  return `${remainder.toFixed(2)}sec`;
}
61 |
62 | // endregion
63 |
64 |
65 | /* ****************************************************************************************************************** */
66 | // region: Implementation
67 | /* ****************************************************************************************************************** */
68 |
/**
 * Entry point: benchmarks every wrapper in its own forked `_run.js` process,
 * one after another, then prints per-wrapper results, estimated processing
 * times and a speed comparison against the fastest wrapper.
 */
(function run() {
  if (!quickMode) console.log('NOTE: Large mode is generally less reliable in most environments!');
  const stats = [];

  console.log(SEPARATOR);

  async.eachSeries(
    wrappers,
    function (item, done) {
      const runner = fork(path.join(__dirname, '_run.js'), void 0, { env: { QUICK_MODE: quickMode, LOG_PERF: true }});
      runner.send(item);

      // The child reports its aggregated numbers through a single IPC message
      runner.on('message', function (stat) {
        const name = formatName(item.name);
        const mean = stat.mean.toPrecision(6);
        const sd = stat.sd.toPrecision(6);
        const avgBytesPerSec = (stat.avgBytesPerMs * 1000);

        stats.push({ name, ...stat });
        console.log(`${name}: ${mean} ms/file ± ${sd} (${humanFileSize(avgBytesPerSec)}/s)`);
      });

      // Move on to the next wrapper once the child is gone, whether it succeeded or not
      runner.on('close', function (n) {
        if (n) console.log('%s failed (exit code %d)', item.name, n);
        done();
      });
    },
    function () {
      console.log(SEPARATOR);

      // Every runner may have failed; without this guard the summary below
      // would throw on stats[0] / sortedStats[0].
      if (stats.length === 0) {
        console.log('No benchmark results were collected.');
        console.log(SEPARATOR);
        return;
      }

      console.log(
        `Total Files: ${stats[0].totalFiles}\n`+
        `Avg. file size: ${humanFileSize(stats[0].avgFileSize)}`
      );

      /* Get speed estimates */
      console.log(SEPARATOR);
      console.log(`Estimated processing times (fastest to slowest):`);
      const sortedStats = [ ...stats ].sort((a,b) => b.avgBytesPerMs - a.avgBytesPerMs);
      for (const { name, avgBytesPerMs } of sortedStats) {
        console.log(`\n [${name.trim()}]`);
        for (const kbSize of [ 100, 1024, 51200, 1048576, 52428800 ]) {
          const byteSize = kbSize * 1024;
          const secToComplete = ((byteSize / avgBytesPerMs) / 1000);
          const tag = humanFileSize(byteSize);
          // Clamp so an unexpectedly long tag cannot make repeat() throw
          const spacing = Math.max(0, 8 - tag.length);
          console.log(` ${tag}:${' '.repeat(spacing)}${humanTime(secToComplete)}`);
        }
      }

      /* Get comparisons */
      console.log(SEPARATOR);
      console.log(`Speed comparison - ${sortedStats[0].name.trim()} is: \n`);
      const fastestMean = sortedStats[0].mean;
      sortedStats.slice(1).forEach(({ name, mean }) =>
        console.log(` ${((mean / fastestMean)).toFixed(2)} times as fast as ${name.trim()}`)
      );

      console.log(SEPARATOR);
    }
  );
})();
129 |
130 | // endregion
131 |
--------------------------------------------------------------------------------
/benchmark/files/5f8b89390d3fc01c6a80728ba2aee597fea1dbfc8399d61015956db71e5336c7.html:
--------------------------------------------------------------------------------
1 | FDA OKs radiation-based prostate cancer drug
12 | FDA OKs radiation-based prostate cancer drug
13 |
14 | The U.S. Food and Drug Administration has approved a new injectable drug that uses radiation to treat advanced prostate cancer that has spread to the bones.
15 | Posted! A link has been posted to your Facebook feed.
Sent! A link has been sent to your friend's email address.
FDA OKs radiation-based prostate cancer drug Story Highlights Bayer Pharmaceuticals makes Xofigo Drug approved to treat advanced prostate cancer Side effects include nausea and diarrhea WASHINGTON (AP) — The U.S. Food and Drug Administration has approved a new injectable drug that uses radiation to treat advanced prostate cancer that has spread to the bones.
The FDA said Wednesday it approved the drug, Xofigo from Bayer Pharmaceuticals, for men whose cancer has grown into bone tumors even after receiving medication or surgery to lower testosterone. The hormone spurs growth of prostate tumors.
Regulators approved Xofigo based on a study of 809 men with advanced prostate cancer who received the drug or placebo. Patients taking Xofigo typically lived 14 months compared to 11.2 months for those taking placebo.
Xofigo's side effects include nausea and diarrhea.
Copyright 2013 The Associated Press. All rights reserved. This material may not be published, broadcast, rewritten or redistributed.
16 | USA NOW
17 | Things to know about notorious Whitey Bulger | USA NOW video Jun 04, 2013
18 | {
19 |
20 | "js_modules": [],
21 |
22 | "assetid": "2162723",
23 | "aws": "news/national",
24 | "aws_id": "news_national",
25 | "blogname": "",
26 |
27 | "byline":"",
28 |
29 | "contenttype": "story pages ",
30 | "seotitle": "Prostate-cancer-drug",
31 | "seotitletag": "FDA OKs radiation-based prostate cancer drug",
32 | "ssts": "news/nation",
33 |
34 | "taxonomykeywords":"U.S. Food and Drug Administration,Health and Wellness",
35 |
36 | "templatename": "stories/default",
37 |
38 | "topic":"health-and-wellness",
39 |
40 |
41 | "videoincluded":"no",
42 |
43 |
44 | "basePageType":"story"
45 |
46 | }
47 |
50 |
--------------------------------------------------------------------------------
/benchmark/files/8bd6d9bcba689408767f770d69f12b59c3f092e73cffcc9332261fbab4aa16e1.html:
--------------------------------------------------------------------------------
1 | Tech stocks: Google, Microsoft to report earnings
Share This Story! Let friends in your social network know what you are reading about
Tech stocks: Google, Microsoft to report earnings It's time for a pair of tech heavyweights to report quarterly earnings after the markets close Thursday. Let's look at the technology stocks to watch. Google shares up slightly. With its stock price slowly
Post to Facebook Posted! A link has been posted to your Facebook feed.
Sent! A link has been sent to your friend's email address.
Tech stocks: Google, Microsoft to report earnings A Google employee rides to work at Google in Mountain View, Calif. (Photo: Justin Sullivan, Getty Images)
SHARE CONNECT EMAIL MORE
It's time for a pair of tech heavyweights to report quarterly earnings after the markets close Thursday. Let's look at the technology stocks to watch.
Google shares up slightly. With its stock price slowly marching toward the $1,000 mark, the tech titan reports second-quarter earnings after the bell.
Analysts expect Google to report an earnings per share of $10.78 with revenue of just over $14 billion.
Shares of Google have surged in the past 12 months, adding more than $300 in value since hitting a 52-week low of $580.76 on this day a year ago.
Microsoft to report earnings. Shares of the Redmond, Wash., company are barely up in pre-market trading as the company reports quarterly earnings after unveiling a massive restructuring.
Last week, Microsoft CEO Steve Ballmer announced a major reorganization of the company he says will help them become more efficient.
The company has also made news related to its product line. Last week, the company slashed prices on its Surface RT tablets by $150. In May, the company revealed the Xbox One, its video game console that will succeed the Xbox 360. However, reception to the console has been mixed, primarily due to restrictions to software that have since been removed.
Follow Brett Molina on Twitter: @bam923 .
117 | USA NOW
118 | What do Charles Manson and the Boston bomber have in common? | USA NOW video Jul 17, 2013
119 | {
120 |
121 | "js_modules": [{"name": "expandable-photo"}],
122 |
123 | "assetid": "2550599",
124 | "aws": "tech",
125 | "aws_id": "tech",
126 | "blogname": "",
127 |
128 | "byline":"Brett Molina",
129 |
130 | "contenttype": "story pages ",
131 | "seotitle": "Tech-stocks-microsoft-google",
132 | "seotitletag": "Tech stocks: Google, Microsoft to report earnings",
133 | "ssts": "tech",
134 |
135 | "taxonomykeywords":"Steve Ballmer",
136 |
137 | "templatename": "stories/default",
138 |
139 | "topic":"steve-ballmer",
140 |
141 |
142 | "videoincluded":"no",
143 |
144 |
145 | "basePageType":"story"
146 |
147 | }
148 |
151 |
--------------------------------------------------------------------------------
/benchmark/index.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const path = require('path');
3 | const util = require('util');
4 | const events = require('events');
5 | const async = require('async');
6 | const summary = require('summary');
7 |
8 |
9 | /* ****************************************************************************************************************** */
10 | // region: Load files
11 | /* ****************************************************************************************************************** */
12 |
// Collect the benchmark input files; quick mode (QUICK_MODE=true) keeps only the first 25 entries
const filesDir = path.resolve(__dirname, 'files');
const fileNames = fs.readdirSync(filesDir);
const selectedNames = process.env.QUICK_MODE === 'true' ? fileNames.slice(0, 25) : fileNames;

const FILES = selectedNames.map((fileName) => {
  const filePath = path.resolve(filesDir, fileName);
  return {
    key: path.basename(fileName, '.html'),
    file: filePath,
    fileSize: fs.statSync(filePath).size
  };
});
26 |
27 | // endregion
28 |
29 |
30 | /* ****************************************************************************************************************** */
31 | // region: Benchmark
32 | /* ****************************************************************************************************************** */
33 |
/**
 * Benchmark runner for a single parser function.
 *
 * Construction immediately starts processing FILES one at a time; listeners
 * for `progress`, `result` and `error` are attached by the caller afterwards.
 * May be called with or without `new`.
 *
 * @param {function(string, function)} parser - Called as `parser(html, callback)` for every file
 */
function Benchmark(parser) {
  if (!(this instanceof Benchmark)) return new Benchmark(parser);

  // Initialise emitter state explicitly rather than relying on lazy setup
  events.EventEmitter.call(this);

  this._parser = parser;
  async.mapSeries(FILES, this._file.bind(this), this._done.bind(this));
}
40 |
// Number of files every benchmark run processes
Benchmark.TOTAL = FILES.length;

// Mean size of those files in bytes, rounded to the nearest integer
Benchmark.AVG_FILE_SIZE = (() => {
  let totalBytes = 0;
  for (const { fileSize } of FILES) totalBytes += fileSize;
  return Math.round(totalBytes / FILES.length);
})();
46 |
47 | // Parse a file
48 | Benchmark.prototype._file = function (item, done) {
49 | const self = this;
50 |
51 | fs.readFile(item.file, 'utf8', function (err, html) {
52 | if (err) return done(err);
53 |
54 | const tic = process.hrtime();
55 | self._parser(html, function (err) {
56 | const toc = process.hrtime(tic);
57 |
58 | if (err) {
59 | done(err, toc);
60 | } else {
61 | self.emit('progress', item.key);
62 | done(null, toc);
63 | }
64 | });
65 | });
66 | };
67 |
68 | // Benchmark for this parser is done
// All files are processed: emit `error`, or `result` with a summary of the timings in milliseconds
Benchmark.prototype._done = function (err, times) {
  if (err) {
    return this.emit('error', err);
  }

  // hrtime tuples are [seconds, nanoseconds]
  const toMilliseconds = ([ secs, nanos ]) => secs * 1e3 + nanos / 1e6;

  this.emit('result', summary(times.map(toMilliseconds)));
};
78 |
79 | util.inherits(Benchmark, events.EventEmitter);
80 |
81 | // endregion
82 |
83 |
84 | /* ****************************************************************************************************************** *
85 | * Exports
86 | * ****************************************************************************************************************** */
87 |
88 | module.exports = Benchmark;
89 |
--------------------------------------------------------------------------------
/benchmark/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "benchmark-tool",
3 | "private": true,
4 | "description": "Simple benchmark tool for JS html to markdown compilers",
5 | "main": "./index.js",
6 | "scripts": {
7 | "benchmark": "node execute.js",
8 | "benchmark:quick": "node execute.js quick"
9 | },
10 | "bin": {
11 | "htmltomarkdown-benchmark": "./execute.js"
12 | },
13 | "dependencies": {
14 | "async": "^3.2.3",
15 | "node-html-markdown": "link:../",
16 | "summary": "^2.1.0",
17 | "turndown": "^7.1.1"
18 | },
19 | "devDependencies": {
20 | "progress": "^2.0.3"
21 | },
22 | "license": "MIT",
23 | "engines": {
24 | "node": "0.10 || 0.11"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/benchmark/wrapper/node-html-markdown.js:
--------------------------------------------------------------------------------
1 | const { NodeHtmlMarkdown } = require('node-html-markdown');
2 |
3 | module.exports = function (html, callback) {
4 | NodeHtmlMarkdown.translate(html);
5 | callback(null);
6 | };
7 |
--------------------------------------------------------------------------------
/benchmark/wrapper/node-html-markdown_reuse.js:
--------------------------------------------------------------------------------
1 | const { NodeHtmlMarkdown } = require('node-html-markdown');
2 | const nhm = new NodeHtmlMarkdown();
3 |
4 | module.exports = function (html, callback) {
5 | nhm.translate(html);
6 | callback(null);
7 | };
8 |
--------------------------------------------------------------------------------
/benchmark/wrapper/turndown.js:
--------------------------------------------------------------------------------
1 | const TurndownService = require('turndown');
2 |
3 | module.exports = function (html, callback) {
4 | (new TurndownService()).turndown(html);
5 | callback(null);
6 | };
7 |
--------------------------------------------------------------------------------
/benchmark/wrapper/turndown_reuse.js:
--------------------------------------------------------------------------------
1 | const TurndownService = require('turndown');
2 | const td = new TurndownService();
3 |
4 | module.exports = function (html, callback) {
5 | td.turndown(html);
6 | callback(null);
7 | };
8 |
--------------------------------------------------------------------------------
/benchmark/yarn.lock:
--------------------------------------------------------------------------------
1 | # THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
2 | # yarn lockfile v1
3 |
4 |
5 | async@^3.2.3:
6 | version "3.2.3"
7 | resolved "https://registry.yarnpkg.com/async/-/async-3.2.3.tgz#ac53dafd3f4720ee9e8a160628f18ea91df196c9"
8 | integrity sha512-spZRyzKL5l5BZQrr/6m/SqFdBN0q3OCI0f9rjfBzCMBIP4p75P620rR3gTmaksNOhmzgdxcaxdNfMy6anrbM0g==
9 |
10 | boolbase@^1.0.0:
11 | version "1.0.0"
12 | resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
13 | integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24=
14 |
15 | css-select@^4.2.1:
16 | version "4.3.0"
17 | resolved "https://registry.yarnpkg.com/css-select/-/css-select-4.3.0.tgz#db7129b2846662fd8628cfc496abb2b59e41529b"
18 | integrity sha512-wPpOYtnsVontu2mODhA19JrqWxNsfdatRKd64kmpRbQgh1KtItko5sTnEpPdpSaJszTOhEMlF/RPz28qj4HqhQ==
19 | dependencies:
20 | boolbase "^1.0.0"
21 | css-what "^6.0.1"
22 | domhandler "^4.3.1"
23 | domutils "^2.8.0"
24 | nth-check "^2.0.1"
25 |
26 | css-what@^6.0.1:
27 | version "6.1.0"
28 | resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
29 | integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
30 |
31 | dom-serializer@^1.0.1:
32 | version "1.3.2"
33 | resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.3.2.tgz#6206437d32ceefaec7161803230c7a20bc1b4d91"
34 | integrity sha512-5c54Bk5Dw4qAxNOI1pFEizPSjVsx5+bpJKmL2kPn8JhBUq2q09tTCa3mjijun2NfK78NMouDYNMBkOrPZiS+ig==
35 | dependencies:
36 | domelementtype "^2.0.1"
37 | domhandler "^4.2.0"
38 | entities "^2.0.0"
39 |
40 | domelementtype@^2.0.1, domelementtype@^2.2.0:
41 | version "2.2.0"
42 | resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57"
43 | integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A==
44 |
45 | domhandler@^4.2.0:
46 | version "4.2.0"
47 | resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.2.0.tgz#f9768a5f034be60a89a27c2e4d0f74eba0d8b059"
48 | integrity sha512-zk7sgt970kzPks2Bf+dwT/PLzghLnsivb9CcxkvR8Mzr66Olr0Ofd8neSbglHJHaHa2MadfoSdNlKYAaafmWfA==
49 | dependencies:
50 | domelementtype "^2.2.0"
51 |
52 | domhandler@^4.3.1:
53 | version "4.3.1"
54 | resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.3.1.tgz#8d792033416f59d68bc03a5aa7b018c1ca89279c"
55 | integrity sha512-GrwoxYN+uWlzO8uhUXRl0P+kHE4GtVPfYzVLcUxPL7KNdHKj66vvlhiweIHqYYXWlw+T8iLMp42Lm67ghw4WMQ==
56 | dependencies:
57 | domelementtype "^2.2.0"
58 |
59 | domino@^2.1.6:
60 | version "2.1.6"
61 | resolved "https://registry.yarnpkg.com/domino/-/domino-2.1.6.tgz#fe4ace4310526e5e7b9d12c7de01b7f485a57ffe"
62 | integrity sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ==
63 |
64 | domutils@^2.8.0:
65 | version "2.8.0"
66 | resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.8.0.tgz#4437def5db6e2d1f5d6ee859bd95ca7d02048135"
67 | integrity sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==
68 | dependencies:
69 | dom-serializer "^1.0.1"
70 | domelementtype "^2.2.0"
71 | domhandler "^4.2.0"
72 |
73 | entities@^2.0.0:
74 | version "2.2.0"
75 | resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
76 | integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
77 |
78 | he@1.2.0:
79 | version "1.2.0"
80 | resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f"
81 | integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==
82 |
83 | "node-html-markdown@link:..":
84 | version "0.0.0"
85 | uid ""
86 |
87 | node-html-parser@^5.3.3:
88 | version "5.3.3"
89 | resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-5.3.3.tgz#2845704f3a7331a610e0e551bf5fa02b266341b6"
90 | integrity sha512-ncg1033CaX9UexbyA7e1N0aAoAYRDiV8jkTvzEnfd1GDvzFdrsXLzR4p4ik8mwLgnaKP/jyUFWDy9q3jvRT2Jw==
91 | dependencies:
92 | css-select "^4.2.1"
93 | he "1.2.0"
94 |
95 | nth-check@^2.0.1:
96 | version "2.0.1"
97 | resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.0.1.tgz#2efe162f5c3da06a28959fbd3db75dbeea9f0fc2"
98 | integrity sha512-it1vE95zF6dTT9lBsYbxvqh0Soy4SPowchj0UBGj/V6cTPnXXtQOPUbhZ6CmGzAD/rW22LQK6E96pcdJXk4A4w==
99 | dependencies:
100 | boolbase "^1.0.0"
101 |
102 | progress@^2.0.3:
103 | version "2.0.3"
104 | resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8"
105 | integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==
106 |
107 | summary@^2.1.0:
108 | version "2.1.0"
109 | resolved "https://registry.yarnpkg.com/summary/-/summary-2.1.0.tgz#be8a49a0aa34eb6ceea56042cae88f8add4b0885"
110 | integrity sha512-nMIjMrd5Z2nuB2RZCKJfFMjgS3fygbeyGk9PxPPaJR1RIcyN9yn4A63Isovzm3ZtQuEkLBVgMdPup8UeLH7aQw==
111 |
112 | turndown@^7.1.1:
113 | version "7.1.1"
114 | resolved "https://registry.yarnpkg.com/turndown/-/turndown-7.1.1.tgz#96992f2d9b40a1a03d3ea61ad31b5a5c751ef77f"
115 | integrity sha512-BEkXaWH7Wh7e9bd2QumhfAXk5g34+6QUmmWx+0q6ThaVOLuLUqsnkq35HQ5SBHSaxjSfSM7US5o4lhJNH7B9MA==
116 | dependencies:
117 | domino "^2.1.6"
118 |
--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | testEnvironment: "node",
3 | preset: 'ts-jest',
4 | testRegex: '.*(test|spec)\\.tsx?$',
5 | moduleFileExtensions: [ 'ts', 'tsx', 'js', 'jsx', 'json', 'node' ],
6 | transform: {
7 | '^.+\\.tsx?$': [
8 | 'ts-jest',
9 | {
10 | tsconfig: '/test/tsconfig.json'
11 | }
12 | ]
13 | },
14 | modulePaths: [ "" ],
15 | testTimeout: 10000,
16 | roots: [ '' ],
17 | collectCoverageFrom: [ "src/**/*.ts" ]
18 | }
19 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "node-html-markdown",
3 | "description": "Fast HTML to markdown cross-compiler, compatible with both node and the browser",
4 | "version": "1.3.0",
5 | "main": "dist/index.js",
6 | "types": "dist/index.d.ts",
7 | "scripts": {
8 | "compile": "tsc",
9 | "build": "yarn run clean && yarn run compile",
10 | "clean": "npx -y rimraf coverage dist **/*.tsbuildinfo",
11 | "------------- ": "-------------",
12 | "benchmark": "cd benchmark && yarn run benchmark quick",
13 | "benchmark:large": "cd benchmark && yarn run benchmark",
14 | "test": "jest",
15 | "test:coverage": "jest --collect-coverage",
16 | "------------- ": "-------------",
17 | "prepare": "ts-patch patch tsc --silent && cd benchmark && yarn install"
18 | },
19 | "files": [
20 | "README.md",
21 | "CHANGELOG.md",
22 | "dist"
23 | ],
24 | "keywords": [
25 | "html",
26 | "markdown",
27 | "converter",
28 | "md",
29 | "html5",
30 | "node-html-parser",
31 | "fast-html-parser",
32 | "turndown"
33 | ],
34 | "author": {
35 | "name": "Ron S.",
36 | "url": "http://twitter.com/ron"
37 | },
38 | "repository": {
39 | "type": "git",
40 | "url": "git+ssh://git@github.com/crosstype/node-html-markdown.git"
41 | },
42 | "bugs": {
43 | "url": "https://github.com/crosstype/node-html-markdown/issues"
44 | },
45 | "homepage": "https://github.com/crosstype/node-html-markdown#readme",
46 | "license": "MIT",
47 | "engines": {
48 | "node": ">=10.0.0"
49 | },
50 | "dependencies": {
51 | "node-html-parser": "^6.1.1"
52 | },
53 | "devDependencies": {
54 | "@types/jest": "~28.1.1",
55 | "@types/node": "^18.11.5",
56 | "jest": "^29.2.2",
57 | "standard-version": "^9.5.0",
58 | "ts-jest": "^29.0.3",
59 | "ts-node": "^10.9.1",
60 | "ts-patch": "^2.0.2",
61 | "typescript": "^4.8.4",
62 | "rimraf": "^3.0.2"
63 | },
64 | "standard-version": {
65 | "types": [
66 | {
67 | "type": "feat",
68 | "section": "Features"
69 | },
70 | {
71 | "type": "fix",
72 | "section": "Fixes"
73 | },
74 | {
75 | "type": "chore",
76 | "hidden": true
77 | },
78 | {
79 | "type": "docs",
80 | "hidden": true
81 | },
82 | {
83 | "type": "style",
84 | "hidden": true
85 | },
86 | {
87 | "type": "refactor",
88 | "hidden": true
89 | },
90 | {
91 | "type": "perf",
92 | "hidden": true
93 | },
94 | {
95 | "type": "test",
96 | "hidden": true
97 | }
98 | ]
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/src/config.ts:
--------------------------------------------------------------------------------
1 | import { isWhiteSpaceOnly, splitSpecial, surround, tagSurround, trimNewLines } from './utilities';
2 | import { PostProcessResult, TranslatorConfigObject } from './translator';
3 | import { NodeHtmlMarkdownOptions } from './options';
4 | import { Options as NodeHtmlParserOptions } from 'node-html-parser'
5 |
6 |
7 | /* ****************************************************************************************************************** */
8 | // region: Elements
9 | /* ****************************************************************************************************************** */
10 |
/** Default list of block element tag names (upper-case) */
export const defaultBlockElements = [
  'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO',
  'BLOCKQUOTE', 'BODY',
  'CANVAS', 'CENTER',
  'DD', 'DIR', 'DIV', 'DL', 'DT',
  'FIELDSET', 'FIGCAPTION', 'FIGURE', 'FOOTER', 'FORM', 'FRAMESET',
  'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER', 'HGROUP', 'HR', 'HTML',
  'ISINDEX',
  'LI',
  'MAIN', 'MENU',
  'NAV', 'NOFRAMES', 'NOSCRIPT',
  'OL', 'OUTPUT',
  'P', 'PRE',
  'SECTION',
  'TABLE', 'TBODY', 'TD', 'TFOOT', 'TH', 'THEAD', 'TR',
  'UL'
];

/** Default list of ignored element tag names (upper-case) */
export const defaultIgnoreElements = [
  'AREA',
  'BASE',
  'COL', 'COMMAND',
  'EMBED',
  'HEAD',
  'INPUT',
  'KEYGEN',
  'LINK',
  'META',
  'PARAM',
  'SCRIPT', 'SOURCE', 'STYLE',
  'TRACK',
  'WBR'
];

/** Tag names (upper-case) of elements listed as contentless */
export const contentlessElements = [
  'BR',
  'HR',
  'IMG'
];
24 |
25 | // endregion
26 |
27 |
28 | /* ****************************************************************************************************************** */
29 | // region: Options
30 | /* ****************************************************************************************************************** */
31 |
/**
 * Frozen default option values.
 *
 * NOTE(review): the type argument of `Readonly` was missing (a bare `Readonly`
 * does not compile). `Partial<NodeHtmlMarkdownOptions>` is used so the
 * annotation holds regardless of which option fields are optional — confirm
 * against the options interface.
 */
// noinspection RegExpUnnecessaryNonCapturingGroup
export const defaultOptions: Readonly<Partial<NodeHtmlMarkdownOptions>> = Object.freeze({
  preferNativeParser: false,
  codeFence: '```',
  bulletMarker: '*',
  indent: ' ',
  codeBlockStyle: <'indented' | 'fenced'>'fenced',
  emDelimiter: '_',
  strongDelimiter: '**',
  strikeDelimiter: '~~',
  maxConsecutiveNewlines: 3,
  /**
   * Character:   Affects:                    Example:
   *
   * \            Escaping                    \-
   * `            Code                        `` code ``, ```lang\n code block \n```
   * *            Bullet & Separators         * item, ***
   * _            Bold, Italics, Separator    _italic_, __bold__, ^___
   * ~            Strikethrough, Code         ~~strike~~, ~~~lang\n code block \n~~~
   * [            Url                         [caption](url)
   * ]            Url                         [caption](url)
   */
  globalEscape: [ /[\\`*_~\[\]]/gm, '\\$&' ] as const,
  /**
   * Note: The following compiled pattern was selected after perf testing various alternatives.
   * Please be mindful of performance if updating/changing it.
   *
   * Sequence:        Affects:                      Example:
   *
   * +(space)         Bullets                       + item
   * =                Heading                       heading\n====
   * #{1,6}(space)    Heading                       ## Heading
   * >                Blockquote                    > quote
   * -                Bullet, Header, Separator     - item, heading\n---, ---
   * \d+\.(space)     Numbered list item            1. Item
   */
  lineStartEscape: [
    /^(\s*?)((?:\+\s)|(?:[=>-])|(?:#{1,6}\s))|(?:(\d+)(\.\s))/gm,
    '$1$3\\$2$4'
  ] as const,

  useInlineLinks: true
});
75 |
76 | // endregion
77 |
78 |
79 | /* ****************************************************************************************************************** */
80 | // region: Translators
81 | /* ****************************************************************************************************************** */
82 |
/**
 * Default translators for regular (non-table, non-code-block, non-link) content.
 *
 * Keys are comma-separated tag names. A value is either a static `TranslatorConfig` or a factory that
 * receives the translator context of the node being visited and returns a config.
 */
export const defaultTranslators: TranslatorConfigObject = {
  /* Pre-formatted text */
  'pre': { noEscape: true, preserveWhitespace: true },

  /* Line break (two trailing spaces + newline form a Markdown hard line break) */
  'br': { content: `  \n`, recurse: false },

  /* Horizontal Rule */
  'hr': { content: '---', recurse: false },

  /* Headings (the digit in the tag name is the number of leading `#`) */
  'h1,h2,h3,h4,h5,h6': ({ node }) => ({
    prefix: '#'.repeat(+node.tagName.charAt(1)) + ' '
  }),

  /* Bold / Strong */
  'strong,b': {
    spaceIfRepeatingChar: true,
    postprocess: ({ content, options: { strongDelimiter } }) =>
      isWhiteSpaceOnly(content)
        ? PostProcessResult.RemoveNode
        : tagSurround(content, strongDelimiter)
  },

  /* Strikethrough */
  'del,s,strike': {
    spaceIfRepeatingChar: true,
    postprocess: ({ content, options: { strikeDelimiter } }) =>
      isWhiteSpaceOnly(content)
        ? PostProcessResult.RemoveNode
        : tagSurround(content, strikeDelimiter)
  },

  /* Italic / Emphasis */
  'em,i': {
    spaceIfRepeatingChar: true,
    postprocess: ({ content, options: { emDelimiter } }) =>
      isWhiteSpaceOnly(content)
        ? PostProcessResult.RemoveNode
        : tagSurround(content, emDelimiter)
  },

  /* Lists (ordered & unordered) - a list that has a `listKind` gets a single surrounding newline */
  'ol,ul': ({ listKind }) => ({
    surroundingNewlines: listKind ? 1 : 2,
  }),

  /* List Item */
  'li': ({ options: { bulletMarker }, indentLevel, listKind, listItemNumber }) => {
    const indentationLevel = +(indentLevel || 0);
    // Three spaces per level puts a nested item at or beyond the content column of both `* ` and `1. ` parents
    const indentation = '   '.repeat(indentationLevel);
    const marker = ((listKind === 'OL') && (listItemNumber !== undefined))
      ? `${listItemNumber}. `
      : `${bulletMarker} `;

    return {
      prefix: indentation + marker,
      surroundingNewlines: 1,
      postprocess: ({ content }) =>
        isWhiteSpaceOnly(content)
          ? PostProcessResult.RemoveNode
          : content
            .trim()
            // Collapse runs of newlines into one hard break and indent the continuation line
            .replace(/([^\r\n])(?:\r?\n)+/g, `$1  \n${indentation}`)
            // Normalize trailing whitespace on a line to exactly two spaces (hard break)
            .replace(/(\S+?)[^\S\r\n]+$/gm, '$1  ')
    }
  },

  /* Block Quote (adds one `>` level to every line) */
  'blockquote': {
    postprocess: ({ content }) => trimNewLines(content).replace(/^(>*)[^\S\r\n]?/gm, `>$1 `)
  },

  /* Code (block / inline) */
  'code': ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
    // A code block is a <code> which is the only child of a PRE / WRAPPED-PRE parent
    const isCodeBlock = [ 'PRE', 'WRAPPED-PRE' ].includes(parent?.tagName!) && parent!.childNodes.length < 2;

    /* Handle code (non-block) */
    if (!isCodeBlock)
      return {
        spaceIfRepeatingChar: true,
        noEscape: true,
        postprocess: ({ content }) => {
          // Find longest occurring sequence of running backticks and add one more (so content is escaped)
          const delimiter = '`' + (content.match(/`+/g)?.sort((a, b) => b.length - a.length)?.[0] || '');
          const padding = delimiter.length > 1 ? ' ' : '';

          return surround(surround(content, padding), delimiter)
        }
      }

    /* Handle code block */
    if (codeBlockStyle === 'fenced') {
      const language = node.getAttribute('class')?.match(/language-(\S+)/)?.[1] || '';
      return {
        noEscape: true,
        prefix: codeFence + language + '\n',
        postfix: '\n' + codeFence,
        childTranslators: visitor.instance.codeBlockTranslators
      }
    } else {
      return {
        noEscape: true,
        // Markdown indented code blocks require four leading spaces on every line
        postprocess: ({ content }) => content.replace(/^/gm, '    '),
        childTranslators: visitor.instance.codeBlockTranslators
      }
    }
  },

  /* Table */
  'table': ({ visitor }) => ({
    surroundingNewlines: 2,
    childTranslators: visitor.instance.tableTranslators,
    postprocess: ({ content, nodeMetadata, node }) => {
      // Split and trim leading + trailing pipes
      const rawRows = splitSpecial(content).map(({ text }) => text.replace(/^(?:\|\s+)?(.+)\s*\|\s*$/, '$1'));

      /* Get Row Data */
      const rows: string[][] = [];
      let colWidth: number[] = [];
      for (const row of rawRows) {
        if (!row) continue;

        /* Track columns (widest cell per column) */
        const cols = row.split(' |').map((c, i) => {
          c = c.trim();
          if (colWidth.length < i + 1 || colWidth[i] < c.length) colWidth[i] = c.length;

          return c;
        });

        rows.push(cols);
      }

      if (rows.length < 1) return PostProcessResult.RemoveNode;

      /* Compose Table */
      const maxCols = colWidth.length;

      let res = '';
      const caption = nodeMetadata.get(node)!.tableMeta!.caption;
      if (caption) res += caption + '\n';

      rows.forEach((cols, rowNumber) => {
        res += '| ';

        /* Add Columns */
        for (let i = 0; i < maxCols; i++) {
          let c = (cols[i] ?? '');
          c += ' '.repeat(Math.max(0, (colWidth[i] - c.length))); // Pad to max length

          res += c + ' |' + (i < maxCols - 1 ? ' ' : '');
        }

        res += '\n';

        // Add separator row
        if (rowNumber === 0) res += '|' + colWidth.map(w => ' ' + '-'.repeat(w) + ' |').join('') + '\n'
      });

      return res;
    }
  }),

  /* Link */
  'a': ({ node, options, visitor }) => {
    const href = node.getAttribute('href');
    if (!href) return {};

    // Encodes symbols that can cause problems in markdown
    const encodedSymbols: { [symbol: string]: string } = { '(': '%28', ')': '%29', '_': '%5F', '*': '%2A' };
    let encodedHref = '';
    for (const chr of href) encodedHref += encodedSymbols[chr] ?? chr;

    const title = node.getAttribute('title');

    // Inline link, when possible
    // See: https://github.com/crosstype/node-html-markdown/issues/17
    if (node.textContent === href && options.useInlineLinks) return { content: `<${encodedHref}>` };

    return {
      postprocess: ({ content }) => content.replace(/(?:\r?\n)+/g, ' '),
      childTranslators: visitor.instance.aTagTranslators,
      prefix: '[',
      postfix: ']' + (!options.useLinkReferenceDefinitions
        ? `(${encodedHref}${title ? ` "${title}"` : ''})`
        : `[${visitor.addOrGetUrlDefinition(encodedHref)}]`)
    }
  },

  /* Image */
  'img': ({ node, options }) => {
    const src = node.getAttribute('src') || '';
    // Skip images without a source, and data: URIs unless explicitly kept
    if (!src || (!options.keepDataImages && /^data:/i.test(src))) return { ignore: true };

    const alt = node.getAttribute('alt') || '';
    const title = node.getAttribute('title') || '';

    return {
      content: `![${alt}](${src}${title ? ` "${title}"` : ''})`,
      recurse: false
    }
  },
}
299 |
/**
 * Translators applied to the direct content of a `table` element.
 */
export const tableTranslatorConfig: TranslatorConfigObject = {
  /* Table Caption - stored (bold) in the node's table metadata; the caption node itself is removed */
  'caption': ({ visitor }) => ({
    surroundingNewlines: false,
    childTranslators: visitor.instance.tableCellTranslators,
    postprocess: ({ content, nodeMetadata, node }) => {
      const caption = content.replace(/(?:\r?\n)+/g, ' ').trim();
      if (caption) nodeMetadata.get(node)!.tableMeta!.caption = '__' + caption + '__'

      return PostProcessResult.RemoveNode;
    },
  }),

  /* Table row - dropped unless its content ends with a cell terminator (" |") */
  'tr': ({ visitor }) => ({
    surroundingNewlines: false,
    childTranslators: visitor.instance.tableRowTranslators,
    postfix: '\n',
    prefix: '| ',
    postprocess: ({ content }) => !/ \|\s*$/.test(content) ? PostProcessResult.RemoveNode : content
  }),

  /* Table cell, (header cell) */
  'th,td': ({ visitor }) => ({
    surroundingNewlines: false,
    childTranslators: visitor.instance.tableCellTranslators,
    prefix: ' ',
    postfix: ' |',
    postprocess: ({ content }) =>
      trimNewLines(content)
        // Escape EVERY pipe: a string pattern would only replace the first one, and any remaining
        // " |" inside the cell would later be mistaken for a column boundary
        .replace(/\|/g, '\\|')
        .replace(/(?:\r?\n)+/g, ' ')
        .trim()
  }),
}
335 |
/** Translators applied inside a table row: cells reuse the table-level `th,td` translator. */
export const tableRowTranslatorConfig: TranslatorConfigObject = { 'th,td': tableTranslatorConfig['th,td'] };
339 |
/** Translators applied inside a table cell or caption: links, inline emphasis and images only. */
export const tableCellTranslatorConfig: TranslatorConfigObject = {
  a: defaultTranslators['a'],
  'strong,b': defaultTranslators['strong,b'],
  'del,s,strike': defaultTranslators['del,s,strike'],
  'em,i': defaultTranslators['em,i'],
  img: defaultTranslators['img'],
};
347 |
/** Default translators for elements found inside a code block. */
export const defaultCodeBlockTranslators: TranslatorConfigObject = {
  br: { content: `\n`, recurse: false },
  hr: { content: '---', recurse: false },
  'h1,h2,h3,h4,h5,h6': { prefix: '[', postfix: ']' },
  'ol,ul': defaultTranslators['ol,ul'],
  li: defaultTranslators['li'],
  tr: { surroundingNewlines: true },
  img: { recurse: false },
};
357 |
/** Translators applied to the children of a link (`a`) element. */
export const aTagTranslatorConfig: TranslatorConfigObject = {
  br: { content: '\n', recurse: false },
  hr: { content: '\n', recurse: false },
  pre: defaultTranslators['pre'],
  'strong,b': defaultTranslators['strong,b'],
  'del,s,strike': defaultTranslators['del,s,strike'],
  'em,i': defaultTranslators['em,i'],
  img: defaultTranslators['img'],
};
367 |
368 | // endregion
369 |
370 |
371 | /* ****************************************************************************************************************** */
372 | // region: General
373 | /* ****************************************************************************************************************** */
374 |
/**
 * Options object typed as `NodeHtmlParserOptions`.
 * NOTE(review): presumably handed to node-html-parser's `parse` — the call site is outside this view.
 *
 * Note: Do not change - values are tuned for performance
 */
export const nodeHtmlParserConfig: NodeHtmlParserOptions = {
  lowerCaseTagName: false,
  comment: false,
  fixNestedATags: true,
  blockTextElements: {
    script: false,
    noscript: false,
    style: false
  }
};
388 |
389 | // endregion
390 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | export { NodeMetadata, NodeMetadataMap } from './visitor'
2 | export { NodeHtmlMarkdown, FileCollection } from './main'
3 | export { NodeHtmlMarkdownOptions } from './options'
4 | export {
5 | TranslatorConfig, TranslatorConfigFactory, TranslatorCollection, PostProcessResult, TranslatorConfigObject
6 | } from './translator'
7 |
--------------------------------------------------------------------------------
/src/main.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdownOptions } from './options';
2 | import { TranslatorCollection, TranslatorConfigObject } from './translator';
3 | import {
4 | aTagTranslatorConfig, defaultBlockElements, defaultCodeBlockTranslators, defaultIgnoreElements, defaultOptions,
5 | defaultTranslators, tableCellTranslatorConfig, tableRowTranslatorConfig, tableTranslatorConfig
6 | } from './config';
7 | import { parseHTML } from './utilities';
8 | import { getMarkdownForHtmlNodes } from './visitor';
9 |
10 |
11 | /* ****************************************************************************************************************** */
12 | // region: Types
13 | /* ****************************************************************************************************************** */
14 |
/** Map of file name to HTML source text (input) or Markdown text (output) */
export type FileCollection = { [fileName: string]: string }

/** User-supplied options: every field is optional and falls back to `defaultOptions` */
type Options = Partial<NodeHtmlMarkdownOptions>
17 |
18 | // endregion
19 |
20 |
21 | /* ****************************************************************************************************************** */
22 | // region: NodeHtmlMarkdown (class)
23 | /* ****************************************************************************************************************** */
24 |
/**
 * HTML to Markdown translator.
 *
 * An instance owns one translator collection per context (regular content, link children, code blocks,
 * tables, table rows, table cells). All of them can be modified after construction.
 */
export class NodeHtmlMarkdown {
  public translators = new TranslatorCollection();
  public aTagTranslators = new TranslatorCollection();
  public codeBlockTranslators = new TranslatorCollection();
  public tableTranslators = new TranslatorCollection();
  public tableRowTranslators = new TranslatorCollection();
  public tableCellTranslators = new TranslatorCollection();
  public readonly options: NodeHtmlMarkdownOptions

  /**
   * @param options - Overrides for `defaultOptions`
   * @param customTranslators - Added to / merged over `defaultTranslators`
   * @param customCodeBlockTranslators - Added to / merged over `defaultCodeBlockTranslators`
   */
  constructor(options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject) {
    /* Setup Options */
    this.options = { ...defaultOptions, ...options };
    const ignoredElements = [ ...(this.options.ignore ?? []), ...defaultIgnoreElements ];
    const blockElements = [ ...(this.options.blockElements ?? []), ...defaultBlockElements ];

    /* Setup Translator Bases */
    for (const el of ignoredElements) {
      this.translators.set(el, { ignore: true, recurse: false });
      this.codeBlockTranslators.set(el, { ignore: true, recurse: false });
    }

    for (const el of blockElements) {
      this.translators.set(el, { surroundingNewlines: 2 });
      this.codeBlockTranslators.set(el, { surroundingNewlines: 2 });
    }

    /* Add and merge bases with default and custom translator configs */
    const layers: [ TranslatorCollection, TranslatorConfigObject ][] = [
      [ this.translators, { ...defaultTranslators, ...customTranslators } ],
      [ this.codeBlockTranslators, { ...defaultCodeBlockTranslators, ...customCodeBlockTranslators } ],
      [ this.aTagTranslators, aTagTranslatorConfig ],
      [ this.tableTranslators, tableTranslatorConfig ],
      [ this.tableRowTranslators, tableRowTranslatorConfig ],
      [ this.tableCellTranslators, tableCellTranslatorConfig ]
    ];
    for (const [ collection, configs ] of layers)
      for (const [ elems, cfg ] of Object.entries(configs))
        collection.set(elems, cfg, true);

    // TODO - Workaround for upstream issue (may not be fixed) - https://github.com/taoqf/node-html-parser/issues/78
    // Build a NEW array: the options object is only shallow-copied above, so pushing would mutate the
    // caller's own `textReplace` array (and add another entry for every instance created from it)
    this.options.textReplace = [ ...(this.options.textReplace ?? []), [ /^<!DOCTYPE.*>/gmi, '' ] ];
  }

  /* ********************************************************* */
  // region: Static Methods
  /* ********************************************************* */

  /**
   * Translate HTML source text to markdown
   */
  static translate(html: string, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): string
  /**
   * Translate collection of HTML source text to markdown
   */
  static translate(files: FileCollection, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): FileCollection
  static translate(htmlOrFiles: string | FileCollection, opt?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject):
    string | FileCollection
  {
    // One-shot convenience: a throw-away instance is created for the call
    return new NodeHtmlMarkdown(opt, customTranslators, customCodeBlockTranslators).translateWorker(htmlOrFiles);
  }

  // endregion

  /* ********************************************************* */
  // region: Methods
  /* ********************************************************* */

  /**
   * Translate HTML source text to markdown
   */
  translate(html: string): string
  /**
   * Translate collection of HTML source text to markdown
   */
  translate(files: FileCollection): FileCollection
  translate(htmlOrFiles: string | FileCollection): string | FileCollection {
    return this.translateWorker(htmlOrFiles);
  }

  // endregion

  /* ********************************************************* */
  // region: Internal Methods
  /* ********************************************************* */

  /**
   * Shared implementation of both `translate` flavours.
   * A single HTML string is handled as a one-entry collection stored under the key 'default';
   * that key is never passed on as a file name.
   */
  private translateWorker(htmlOrFiles: string | FileCollection) {
    const inputIsCollection = typeof htmlOrFiles !== 'string';
    const inputFiles: FileCollection = !inputIsCollection ? { 'default': htmlOrFiles } : htmlOrFiles;
    const outputFiles: FileCollection = {};

    for (const [ fileName, html ] of Object.entries(inputFiles)) {
      const parsedHtml = parseHTML(html, this.options);
      outputFiles[fileName] = getMarkdownForHtmlNodes(this, parsedHtml, fileName !== 'default' ? fileName : void 0);
    }

    return inputIsCollection ? outputFiles : outputFiles['default'];
  }

  // endregion

}
133 |
134 | // endregion
135 |
--------------------------------------------------------------------------------
/src/nodes.ts:
--------------------------------------------------------------------------------
1 | import * as NHParser from 'node-html-parser';
2 | import { CommentNode, NodeType } from 'node-html-parser';
3 |
4 |
5 | /* ****************************************************************************************************************** */
6 | // region: Types
7 | /* ****************************************************************************************************************** */
8 |
9 | export { NodeType, CommentNode }
10 |
11 | /* ********************************************************* *
12 | * Merged Nodes - Unions of node-html-parser and common DOM
13 | * ********************************************************* */
14 |
/** Extra flag carried by every merged node type. NOTE(review): where `preserve` is set/read is outside this view. */
type NodeBase = { preserve?: boolean }

/** Any node: node-html-parser `Node` or native DOM `Node` */
export type HtmlNode = (NHParser.Node | Node) & NodeBase
/** Element node: node-html-parser `HTMLElement` or native DOM `HTMLElement` */
export type ElementNode = (NHParser.HTMLElement | HTMLElement) & NodeBase
/** Text node (node-html-parser `TextNode`) */
export type TextNode = (NHParser.TextNode) & NodeBase
20 |
21 | // endregion
22 |
23 |
24 | /* ****************************************************************************************************************** */
25 | // region: TypeGuards
26 | /* ****************************************************************************************************************** */
27 |
/** True when `node.nodeType` is `NodeType.TEXT_NODE` */
export function isTextNode(node: HtmlNode): node is TextNode {
  return node.nodeType === NodeType.TEXT_NODE;
}

/** True when `node.nodeType` is `NodeType.COMMENT_NODE` */
export function isCommentNode(node: HtmlNode): node is CommentNode {
  return node.nodeType === NodeType.COMMENT_NODE;
}

/** True when `node.nodeType` is `NodeType.ELEMENT_NODE` */
export function isElementNode(node: HtmlNode): node is ElementNode {
  return node.nodeType === NodeType.ELEMENT_NODE;
}
31 |
32 | // endregion
33 |
--------------------------------------------------------------------------------
/src/options.ts:
--------------------------------------------------------------------------------
1 | /* ****************************************************************************************************************** */
2 | // region: Types
3 | /* ****************************************************************************************************************** */
4 |
/**
 * Options accepted by `NodeHtmlMarkdown`.
 */
export interface NodeHtmlMarkdownOptions {
  /**
   * Use native window DOMParser when available
   * @default false
   */
  preferNativeParser: boolean,

  /**
   * Code block fence
   * @default ```
   */
  codeFence: string,

  /**
   * Bullet marker
   * @default *
   */
  bulletMarker: string,

  /**
   * Style for code block
   * @default fenced
   */
  codeBlockStyle: 'indented' | 'fenced',

  /**
   * Emphasis delimiter
   * @default _
   */
  emDelimiter: string,

  /**
   * Strong delimiter
   * @default **
   */
  strongDelimiter: string,

  /**
   * Strikethrough delimiter
   * @default ~~
   */
  strikeDelimiter: string,

  /**
   * Supplied elements will be ignored (inner text is ignored and children are not parsed)
   */
  readonly ignore?: string[],

  /**
   * Supplied elements will be treated as blocks (surrounded with blank lines)
   */
  readonly blockElements?: string[],

  /**
   * Max consecutive new lines allowed
   * @default 3
   */
  maxConsecutiveNewlines: number,

  /**
   * Line Start Escape pattern
   * (Note: Setting this will override the default escape settings, you might want to use textReplace option instead)
   */
  lineStartEscape: readonly [ pattern: RegExp, replacement: string ]

  /**
   * Global escape pattern
   * (Note: Setting this will override the default escape settings, you might want to use textReplace option instead)
   */
  globalEscape: readonly [ pattern: RegExp, replacement: string ]

  /**
   * User-defined text replacement pattern (Replaces matching text retrieved from nodes)
   */
  textReplace?: (readonly [ pattern: RegExp, replacement: string ])[]

  /**
   * Keep images with data: URI (Note: These can be up to 1MB each)
   * @example
   * <img src="data:image/gif;base64,R0lGODlhAQABAAAAACw=">
   * @default false
   */
  keepDataImages?: boolean

  /**
   * Place URLS at the bottom and format links using link reference definitions
   *
   * @example
   * Click <a href="/url">here</a>. Or <a href="/url2">here</a>. Or <a href="/url">this link</a>.
   *
   * Becomes:
   * Click [here][1]. Or [here][2]. Or [this link][1].
   *
   * [1]: /url
   * [2]: /url2
   */
  useLinkReferenceDefinitions?: boolean

  /**
   * Wrap URL text in < > instead of []() syntax.
   *
   * @example
   * The input <a href="https://google.com">https://google.com</a>
   * becomes <https://google.com>
   * instead of [https://google.com](https://google.com)
   *
   * @default true
   */
  useInlineLinks?: boolean
}
115 |
116 | // endregion
117 |
--------------------------------------------------------------------------------
/src/translator.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdownOptions } from './options';
2 | import { NodeMetadata, NodeMetadataMap, Visitor } from './visitor';
3 | import { ElementNode } from './nodes';
4 |
5 |
6 | /* ****************************************************************************************************************** */
7 | // region: Types
8 | /* ****************************************************************************************************************** */
9 |
/**
 * Function that builds a `TranslatorConfig` for the node being visited.
 * `base` holds the static config it was layered on top of (see `TranslatorCollection#set` with `preserveBase`).
 */
export type TranslatorConfigFactory = {
  (ctx: TranslatorContext): TranslatorConfig
  base?: TranslatorConfig
}

/** Translator definitions keyed by comma-separated tag names */
export type TranslatorConfigObject = { [tags: string]: TranslatorConfig | TranslatorConfigFactory }

/**
 * Context handed to translator factories and `postprocess` callbacks.
 * The node's metadata fields are merged in when present (see `createTranslatorContext`).
 */
export type TranslatorContext = Partial<NodeMetadata> & {
  node: ElementNode
  options: NodeHtmlMarkdownOptions
  parent?: ElementNode
  nodeMetadata: NodeMetadataMap
  visitor: Visitor
  base?: TranslatorConfig
}
25 |
export interface TranslatorConfig {
  /**
   * Precedes content, follows surroundingNewlines
   */
  prefix?: string

  /**
   * Follows content, precedes surroundingNewlines
   */
  postfix?: string

  /**
   * Set fixed output content
   */
  content?: string

  /**
   * Post-process content after inner nodes have been rendered.
   * Returning undefined will cause the content to not be updated
   */
  postprocess?: (ctx: TranslatorContext & { content: string }) => string | PostProcessResult

  /**
   * If false, no child elements will be scanned
   * @default true
   */
  recurse?: boolean

  /**
   * Adds newline before and after (true, false, or number of newlines to add per side)
   * @default false
   */
  surroundingNewlines?: boolean | number

  /**
   * Ignore node entirely
   */
  ignore?: boolean

  /**
   * Do not escape content
   */
  noEscape?: boolean

  /**
   * If first character matches end of the last written data, add a space
   * @example
   * // old text: **abc**
   * // new text: **def**
   * // becomes: **abc** **def**
   */
  spaceIfRepeatingChar?: boolean

  /**
   * Ensure translator is always visited, even if element is empty
   * Note: For speed, trees are optimized beforehand to only visit elements which have child nodes or text content.
   * In some cases, however, you may want to create or alter a translator to be triggered even if the element is empty.
   * (If using a TranslatorConfigFactory, this value is always treated as true)
   */
  preserveIfEmpty?: boolean

  /**
   * Keep whitespace as it is
   */
  preserveWhitespace?: boolean

  /**
   * Custom translator collection to use for child HTML nodes
   */
  childTranslators?: TranslatorCollection
}
97 |
/**
 * Special (non-string) results a `postprocess` callback may return.
 */
export enum PostProcessResult {
  // NOTE(review): presumably leaves the rendered content untouched - the handling code is outside this view
  NoChange,
  // Returned by the default translators for whitespace-only content and for tables/rows with nothing to render
  RemoveNode
}
102 |
103 | // endregion
104 |
105 |
106 | /* ****************************************************************************************************************** */
107 | // region: TranslatorCollection
108 | /* ****************************************************************************************************************** */
109 |
/**
 * Collection of translators, stored directly as properties keyed by upper-case tag name.
 */
export class TranslatorCollection {
  /**
   * @internal
   */
  [tagName: string]: any

  /** Number of registered tag names */
  get size() { return Object.keys(this).length }

  /**
   * Add / update translator config for one or more element tags
   *
   * @param keys - Comma-separated tag names (case-insensitive; surrounding whitespace and empty entries are ignored)
   * @param config - Static config or factory to register
   * @param preserveBase - When set and a static config is already registered for the tag: a static `config`
   *   is merged over it, a factory `config` is wrapped and gets the existing config attached as `base`
   */
  set(keys: string, config: TranslatorConfig | TranslatorConfigFactory, /* @internal */ preserveBase?: boolean) {
    for (const key of keys.split(',')) {
      // Whitespace can never be part of a tag name, so 'h1, h2' must register H2 (not ' H2')
      const el = key.trim().toUpperCase();
      if (!el) continue;

      let res = config;
      if (preserveBase) {
        const base = this[el];
        if (isTranslatorConfig(base)) {
          if (isTranslatorConfig(config)) res = { ...base, ...config };
          else {
            // Capture the narrowed factory in a const so the wrapper below calls a correctly typed function
            const factory: TranslatorConfigFactory = config;
            res = Object.assign((...args: [ TranslatorContext ]) => factory(...args), { base });
          }
        }
      }

      this[el] = res;
    }
  }

  /**
   * Get translator config for element tag
   */
  get(key: string): TranslatorConfig | TranslatorConfigFactory {
    return this[key.toUpperCase()] as any;
  }

  /**
   * Returns array of entries
   */
  entries(): [ elementName: string, config: TranslatorConfig | TranslatorConfigFactory ][] {
    return Object.entries(this);
  }

  /**
   * Remove translator config for one or more element tags
   *
   * @param keys - Comma-separated tag names (case-insensitive; surrounding whitespace is ignored)
   */
  remove(keys: string): void {
    for (const key of keys.split(',')) delete this[key.trim().toUpperCase()];
  }
}
159 |
160 | // endregion
161 |
162 |
163 | /* ****************************************************************************************************************** */
164 | // region: Utilities
165 | /* ****************************************************************************************************************** */
166 |
/**
 * Only use to narrow union of types where only TranslatorConfig has JS type 'object'
 *
 * `null` is rejected explicitly, because `typeof null` is also 'object'.
 */
export const isTranslatorConfig = (v: any): v is TranslatorConfig => v !== null && typeof v === 'object';
171 |
/**
 * Build the context object passed to translator factories and `postprocess` callbacks.
 *
 * @param visitor - Active visitor; supplies the instance options and the node metadata map
 * @param node - Element being translated
 * @param metadata - Optional metadata of the node; its fields are copied in last, so they take precedence
 * @param base - Optional base config made available to the translator
 */
export function createTranslatorContext(
  visitor: Visitor,
  node: ElementNode,
  metadata?: NodeMetadata,
  base?: TranslatorConfig
): TranslatorContext
{
  const { instance: { options }, nodeMetadata } = visitor;
  const parent = node.parentNode;

  return { node, options, parent, nodeMetadata, visitor, base, ...metadata };
}
190 |
191 | // endregion
192 |
--------------------------------------------------------------------------------
/src/utilities.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdownOptions } from './options';
2 | import { ElementNode, HtmlNode } from './nodes';
3 | import { nodeHtmlParserConfig } from './config';
4 |
5 |
6 | /* ****************************************************************************************************************** */
7 | // region: String Utils
8 | /* ****************************************************************************************************************** */
9 |
/** Strip all `\n` characters from the very start and the very end of the string */
export const trimNewLines = (s: string) => {
  return s.replace(/^\n+|\n+$/g, '');
};

/** Put `surroundStr` directly before and after `source` */
export const surround = (source: string, surroundStr: string) => {
  return surroundStr + source + surroundStr;
};

/** True when the string contains no non-whitespace character (also true for the empty string) */
export const isWhiteSpaceOnly = (s: string) => {
  const hasVisibleChar = /\S/.test(s);
  return !hasVisibleChar;
};
13 |
/**
 * Break a string into lines, recording for each line the exact terminator that ended it
 * ('\r', '\n', '\r\n', or '' for a final line without terminator).
 * A string that ends with a terminator does not produce an extra empty line; '' yields no lines.
 */
export function splitSpecial(s: string) {
  const lines: { text: string, newLineChar: '\r' | '\n' | '\r\n' | '' }[] = [];
  const total = s.length;
  let lineStart = 0;
  let pos = 0;

  while (pos < total) {
    const ch = s[pos];

    let terminator: typeof lines[number]['newLineChar'] = '';
    if (ch === '\n') terminator = '\n';
    else if (ch === '\r') terminator = (s[pos + 1] === '\n') ? '\r\n' : '\r';

    if (terminator) {
      lines.push({ text: s.slice(lineStart, pos), newLineChar: terminator });
      pos += terminator.length;
      lineStart = pos;
      continue;
    }

    // Last character without terminator: flush the remaining text as the final line
    if (pos === total - 1) lines.push({ text: s.slice(lineStart), newLineChar: '' });

    ++pos;
  }

  return lines;
}
45 |
/**
 * Surround tag content with delimiter, line by line, moving leading/trailing space to outside the tag.
 *
 * - Un-escaped occurrences of the delimiter already present in the content are removed first
 * - Lines consisting of whitespace only are passed through without delimiters
 * - At most one whitespace character is kept on each side of the surrounded text
 *
 * @param content - Rendered inner content of the tag
 * @param surroundStr - Delimiter to wrap with (e.g. `**`, `_`, `~~`)
 * @returns The wrapped content, with the original line terminators preserved
 */
export function tagSurround(content: string, surroundStr: string) {
  // If un-escaped surroundStr already occurs, remove all instances
  // See: https://github.com/crosstype/node-html-markdown/issues/18
  if (content.indexOf(surroundStr) >= 0) {
    // Escape regex metacharacters only (a blanket `\` prefix would turn e.g. 'b' into a word boundary)
    const escapedDelimiter = surroundStr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // `^` alternative: a delimiter at the very start of a line has no preceding character to capture
    content = content.replace(new RegExp(`(^|[^\\\\])${escapedDelimiter}`, 'gm'), '$1');
  }

  let res = '';

  for (const { text, newLineChar } of splitSpecial(content)) {
    const startPos = text.search(/\S/);

    // If whole line is whitespace, don't surround it
    if (startPos < 0) {
      res += text + newLineChar;
      continue;
    }

    // A non-whitespace character exists, so this scan always stops at or after startPos
    let endPos = text.length - 1;
    while (!/\S/.test(text[endPos])) --endPos;

    const leadingSpace = startPos > 0 ? text[startPos - 1] : '';
    const trailingSpace = endPos < (text.length - 1) ? text[endPos + 1] : '';
    const slicedText = text.slice(startPos, endPos + 1);

    res += leadingSpace + surroundStr + slicedText + surroundStr + trailingSpace + newLineChar;
  }

  return res;
}
100 |
/**
 * Count the whitespace characters (and how many of them are `\r` / `\n`) at the end of a string.
 * Only the last 10 characters are inspected.
 */
export const getTrailingWhitespaceInfo = (s: string): { whitespace: number, newLines: number } => {
  const lowestIndex = Math.max(s.length - 10, 0);
  let whitespace = 0;
  let newLines = 0;

  let idx = s.length;
  while (--idx >= lowestIndex) {
    const ch = s.charAt(idx);
    if (!/\s/.test(ch)) break;

    whitespace++;
    if (ch === '\r' || ch === '\n') newLines++;
  }

  return { whitespace, newLines };
}
112 |
/**
 * Produce a string only when `v` is truthy
 * @param v - Var to check for truthiness
 * @param value - Value to return if `v` is truthy (when omitted, `v` itself is converted to a string)
 * @returns `value` (or `String(v)` if no `value` provided) when `v` is truthy, otherwise an empty string
 */
export const truthyStr = (v: any, value?: string): string => {
  if (!v) return '';
  return value === undefined ? String(v) : value;
};
119 |
120 | // endregion
121 |
122 |
123 | /* ****************************************************************************************************************** */
124 | // region: Parser
125 | /* ****************************************************************************************************************** */
126 |
/**
 * Try to parse `html` using the host environment's native DOM
 *
 * @param html - Markup to parse
 * @returns Root element of the parsed document, or `undefined` if no working `window.DOMParser` is found
 * @throws The document-creation error if a document cannot be created and no `ActiveXObject` fallback exists
 */
function tryParseWithNativeDom(html: string): ElementNode | undefined {
  /* Feature-detect: any failure here (including `window` not being declared) means native parsing is unavailable */
  try {
    if (!(window?.DOMParser && (new window.DOMParser()).parseFromString('', 'text/html'))) return void 0;
  }
  catch {
    return void 0;
  }

  /* Get a document */
  let doc: Document;
  try {
    doc = document.implementation.createHTMLDocument('').open()
  }
  catch (e) {
    const { ActiveXObject } = (window as any);
    if (!ActiveXObject) throw e;

    // Legacy fallback. This branch used to return the freshly opened (still empty) document straight away, so `html`
    // was never written and a document was handed back instead of its root element. It now only supplies the
    // document and shares the write / close / return steps below.
    doc = new ActiveXObject('htmlfile');
    doc.designMode = 'on'; // disable on-page scripts
    doc.open();
  }

  // Prepare document, ensuring we have a wrapper node
  // NOTE(review): as written, the literals around `html` add no wrapper markup — the wrapper element's tags appear to
  // be missing from this copy of the file; confirm against upstream
  doc.write('' + html + ' ');
  doc.close();

  return doc.documentElement as ElementNode;
}
156 |
/**
 * Resolve the `parse` function of the `node-html-parser` package
 * @returns The package's `parse` export, or `undefined` if requiring the package throws
 */
const getNodeHtmlParser = () => {
  let parser: typeof import('node-html-parser').parse | undefined = undefined;
  try {
    parser = require('node-html-parser').parse;
  }
  catch {
    // Best effort: a package that cannot be loaded is reported to the caller as `undefined`
  }
  return parser;
}
165 |
/**
 * Parse an HTML string to a root element
 *
 * If `options.preferNativeParser` is set, the native DOM parser is tried first. `node-html-parser` is used otherwise,
 * and as the fallback whenever the native attempt throws or produces nothing.
 *
 * @param html - Markup to parse
 * @param options - Instance options (only `preferNativeParser` is read here)
 * @returns Root element of the parsed tree
 * @throws The native parser's error if it fails and `node-html-parser` cannot be loaded
 * @throws Error if nothing was parsed natively and `node-html-parser` cannot be loaded
 */
export function parseHTML(html: string, options: NodeHtmlMarkdownOptions): ElementNode {
  let nodeHtmlParse: ReturnType<typeof getNodeHtmlParser> = undefined;
  let el: ElementNode | undefined;

  perfStart('parse');
  try {
    /* If specified, try to parse with native engine, fallback to node-html-parser */
    if (options.preferNativeParser) {
      try {
        el = tryParseWithNativeDom(html);
      }
      catch (e) {
        nodeHtmlParse = getNodeHtmlParser();
        if (nodeHtmlParse) console.warn('Native DOM parser encountered an error during parse', e);
        else throw e;
      }
    }

    if (!el) {
      // The native attempt can come back empty without throwing (no usable DOMParser, e.g. under Node), so the
      // fallback parser has to be resolved here as well — not only in the error path above
      if (!nodeHtmlParse) nodeHtmlParse = getNodeHtmlParser();
      if (!nodeHtmlParse)
        throw new Error('Unable to parse HTML: no native DOM parser result and node-html-parser could not be loaded');

      el = nodeHtmlParse(html, nodeHtmlParserConfig);
    }

    return el;
  }
  finally {
    // Always stop the timer so a failed parse doesn't leave the 'parse' label running
    perfStop('parse');
  }
}
191 |
192 | // endregion
193 |
194 |
195 | /* ****************************************************************************************************************** */
196 | // region: General
197 | /* ****************************************************************************************************************** */
198 |
/**
 * Get the children of a node as a plain array
 *
 * Accepts any `childNodes` collection that is array-like or iterable (an array or a DOM `NodeList`).
 *
 * @param node - Node whose children are wanted
 * @returns A new array holding the node's children (empty if the node has no `childNodes` collection)
 */
export function getChildNodes<T extends HtmlNode | Node>(node: T): T[]
export function getChildNodes(node: HtmlNode | Node): (Node | HtmlNode)[] {
  const { childNodes } = node;

  // The former guard, `(v != null) || (typeof v[Symbol.iterator] === 'function')`, was true for every non-null value
  // and dereferenced null / undefined otherwise. A missing collection now yields an empty list instead of a TypeError.
  if (childNodes == null) return [];

  // Because that guard never failed, callers have always received a fresh copy — keep doing so
  return Array.from(childNodes as ArrayLike<Node | HtmlNode>);
}
212 |
/**
 * Start a console timer if the `LOG_PERF` environment variable is set
 *
 * Does nothing when there is no `process` global to read the variable from, rather than throwing a ReferenceError.
 *
 * @param label - Timer label passed to `console.time`
 */
export function perfStart(label: string) {
  if (typeof process !== 'undefined' && process.env?.LOG_PERF) console.time(label);
}
216 |
/**
 * Stop (and print) a console timer if the `LOG_PERF` environment variable is set
 *
 * Does nothing when there is no `process` global to read the variable from, rather than throwing a ReferenceError.
 *
 * @param label - Timer label passed to `console.timeEnd`
 */
export function perfStop(label: string) {
  if (typeof process !== 'undefined' && process.env?.LOG_PERF) console.timeEnd(label);
}
220 |
221 | // endregion
222 |
--------------------------------------------------------------------------------
/src/visitor.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdown } from './main';
2 | import { ElementNode, HtmlNode, isElementNode, isTextNode } from './nodes';
3 | import { getChildNodes, getTrailingWhitespaceInfo, perfStart, perfStop, trimNewLines } from './utilities';
4 | import {
5 | createTranslatorContext, isTranslatorConfig, PostProcessResult, TranslatorConfig, TranslatorConfigFactory,
6 | TranslatorConfigObject, TranslatorContext
7 | } from './translator';
8 | import { NodeHtmlMarkdownOptions } from './options';
9 | import { contentlessElements } from './config';
10 |
11 |
12 | /* ****************************************************************************************************************** */
13 | // region: Types
14 | /* ****************************************************************************************************************** */
15 |
/**
 * State the visitor passes down the tree while translating
 */
export interface NodeMetadata {
  /** Nesting depth of the enclosing list(s); the outermost `UL` / `OL` is level 0 */
  indentLevel?: number
  /** Kind of the nearest enclosing list element */
  listKind?: 'OL' | 'UL'
  /** Item counter: reset to 0 when a list is entered, incremented for each `LI` whose list kind is `OL` */
  listItemNumber?: number
  /** When set, text is emitted without the escape / custom replacement rules being applied */
  noEscape?: boolean
  /** When set, whitespace in text nodes is kept instead of being collapsed (turned on beneath `PRE`) */
  preserveWhitespace?: boolean
  /** Translators looked up for descendant elements in place of the instance's translators */
  translators?: TranslatorConfigObject
  /** Table context, recorded when a `TABLE` element is entered */
  tableMeta?: {
    node: ElementNode,
    caption?: string
  }
}
28 |
/**
 * Metadata recorded per visited element
 * (type arguments follow how the visitor fills the map: it stores a `NodeMetadata` under each element node)
 */
export type NodeMetadataMap = Map<ElementNode, NodeMetadata>
30 |
/** Output accumulated by a visitor */
type VisitorResult = {
  /** Markdown text produced so far */
  text: string
  /** Trailing-whitespace counts for `text`, recomputed after every append */
  trailingNewlineStats: {
    /** Number of trailing whitespace characters */
    whitespace: number
    /** Number of trailing newline characters */
    newLines: number
  }
}
38 |
39 | // endregion
40 |
41 |
42 | /* ****************************************************************************************************************** */
43 | // region: Visitor
44 | /* ****************************************************************************************************************** */
45 |
/**
 * Walks an HTML node tree and builds the markdown output in `result`. All work happens in the constructor, which
 * first flags the nodes worth keeping and then visits the tree.
 *
 * Properties & methods marked public are designated as such due to the fact that we may add middleware / transformer
 * support in the future
 */
export class Visitor {
  /** Output accumulated so far, together with trailing-whitespace stats for the text */
  public result: VisitorResult
  /** Metadata recorded for each visited element */
  public nodeMetadata: NodeMetadataMap = new Map();
  /** URLs registered through `addOrGetUrlDefinition` (a URL's reference number is its index + 1) */
  public urlDefinitions: string[] = [];
  private options: NodeHtmlMarkdownOptions;

  /**
   * @param instance - Owner providing the options and translators
   * @param rootNode - Root of the tree to translate (processed immediately)
   * @param fileName - Optional file name (stored on the visitor; not read within this class)
   */
  constructor(
    public instance: NodeHtmlMarkdown,
    public rootNode: HtmlNode,
    public fileName?: string,
  )
  {
    this.result = {
      text: '',
      trailingNewlineStats: {
        whitespace: 0,
        newLines: 0
      }
    };
    this.options = instance.options;

    this.optimizeTree(rootNode);
    this.visitNode(rootNode);
  }

  /* ********************************************************* */
  // region: Methods
  /* ********************************************************* */

  /**
   * Register a URL (if not already registered) and get its reference number
   * @returns 1-based reference number of the URL
   */
  public addOrGetUrlDefinition(url: string): number {
    let id = this.urlDefinitions.indexOf(url);
    if (id < 0) id = this.urlDefinitions.push(url) - 1;
    return id + 1;
  }

  /**
   * Append text to the output and refresh the trailing-whitespace stats
   * @param s - Text to append
   * @param startPos - If provided, the output is first truncated to this length (replacing everything after it)
   * @param spaceIfRepeatingChar - If set, a space is inserted when the output currently ends with the first
   *                               character of `s`
   */
  public appendResult(s: string, startPos?: number, spaceIfRepeatingChar?: boolean) {
    if (!s && startPos === undefined) return;
    const { result } = this;

    if (startPos !== undefined) result.text = result.text.substring(0, startPos);
    result.text += (spaceIfRepeatingChar && result.text.slice(-1) === s[0] ? ' ' : '') + s;

    result.trailingNewlineStats = getTrailingWhitespaceInfo(result.text);
  }

  /**
   * Ensure the output ends with at least `count` newline characters, appending only the ones that are missing
   */
  public appendNewlines(count: number) {
    const { newLines } = this.result.trailingNewlineStats;
    this.appendResult('\n'.repeat(Math.max(0, (+count - newLines))));
  }

  // endregion

  /* ********************************************************* */
  // region: Internal Methods
  /* ********************************************************* */

  /**
   * Optimize tree, flagging nodes that have usable content
   *
   * Sets `preserve` on every node: true for text nodes, contentless elements, childless elements whose translator
   * is a factory function or has `preserveIfEmpty`, and any node with at least one preserved child.
   */
  private optimizeTree(node: HtmlNode) {
    perfStart('Optimize tree');
    const { translators } = this.instance;
    (function visit(node: HtmlNode): boolean {
      let res = false
      if (isTextNode(node) || (isElementNode(node) && contentlessElements.includes(node.tagName))) {
        res = true;
      }
      else {
        const childNodes = getChildNodes(node);
        if (!childNodes.length) {
          const translator = translators[(node as ElementNode).tagName];
          if (translator?.preserveIfEmpty || typeof translator === 'function') res = true;
        }
        else {
          // Every child must be visited (each needs its own flag), so `visit` is called before the `||`
          for (const child of childNodes) res = visit(child) || res;
        }
      }
      return node.preserve = res;
    })(node);
    perfStop('Optimize tree');
  }

  /**
   * Apply escaping and custom replacement rules
   * @param text - Raw text of a text node
   * @param metadata - Metadata in effect for the node (controls whitespace collapsing and escaping)
   */
  private processText(text: string, metadata: NodeMetadata | undefined) {
    let res = text;
    if (!metadata?.preserveWhitespace) res = res.replace(/\s+/g, ' ');
    if (metadata?.noEscape) return res;

    const { lineStartEscape, globalEscape, textReplace } = this.options;
    res = res
      .replace(globalEscape[0], globalEscape[1])
      .replace(lineStartEscape[0], lineStartEscape[1])

    /* If specified, apply custom replacement patterns */
    if (textReplace)
      for (const [ pattern, r ] of textReplace) res = res.replace(pattern, r);

    return res;
  }

  /**
   * Translate a node (and, depending on its translator, its descendants), appending the output to `result`
   * @param node - Node to translate; skipped unless flagged `preserve`
   * @param textOnly - If set, only text nodes are processed
   * @param metadata - Metadata inherited from the parent
   */
  public visitNode(node: HtmlNode, textOnly?: boolean, metadata?: NodeMetadata): void {
    const { result } = this;

    if (!node.preserve) return;

    /* Handle text node */
    if (isTextNode(node)) {
      // Nodes exposing `wholeText` get it mirrored onto the `text` / `trimmedText` fields read below
      const textNode = node as any;
      if (textNode.wholeText) {
        textNode.text ??= textNode.wholeText;
        textNode.trimmedText ??= trimNewLines(textNode.wholeText);
      }

      if (node.isWhitespace && !metadata?.preserveWhitespace) {
        // Whitespace-only text becomes a single space, unless the output is empty or already ends in whitespace
        if (result.text.length && !(result.trailingNewlineStats.whitespace > 0)) this.appendResult(' ');
        return;
      }

      this.appendResult(this.processText(metadata?.preserveWhitespace ? node.text : node.trimmedText, metadata));
      return;
    }

    if (textOnly || !isElementNode(node)) return;

    /* Handle element node */
    const translatorCfgOrFactory: TranslatorConfig | TranslatorConfigFactory | undefined =
      metadata?.translators ? metadata.translators[node.tagName] : this.instance.translators[node.tagName];

    /* Update metadata with list detail */
    switch (node.tagName) {
      case 'UL':
      case 'OL':
        metadata = {
          ...metadata,
          listItemNumber: 0,
          listKind: (node.tagName as 'OL' | 'UL'),
          indentLevel: (metadata?.indentLevel ?? -1) + 1
        };
        break;
      case 'LI':
        // Mutates the metadata object passed down from the list (shared by sibling items), so the count carries
        // from one LI to the next
        if (metadata?.listKind === 'OL') metadata.listItemNumber = (metadata.listItemNumber ?? 0) + 1;
        break;
      case 'PRE':
        metadata = {
          ...metadata,
          preserveWhitespace: true
        }
        break;
      case 'TABLE':
        metadata = {
          ...metadata,
          tableMeta: {
            node: node
          }
        }
        break;
    }
    if (metadata) this.nodeMetadata.set(node, metadata);

    // If no translator for element, visit children
    if (!translatorCfgOrFactory) {
      for (const child of getChildNodes(node)) this.visitNode(child, textOnly, metadata);
      return;
    }

    /* Get Translator Config */
    let cfg: TranslatorConfig;
    let ctx: TranslatorContext | undefined;
    if (!isTranslatorConfig(translatorCfgOrFactory)) {
      ctx = createTranslatorContext(this, node, metadata, translatorCfgOrFactory.base);
      cfg = { ...translatorCfgOrFactory.base, ...translatorCfgOrFactory(ctx) };
    } else cfg = translatorCfgOrFactory;

    // Skip and don't check children if ignore flag set
    if (cfg.ignore) return;

    /* Update metadata if needed */
    if (cfg.noEscape && !metadata?.noEscape) {
      metadata = { ...metadata, noEscape: cfg.noEscape };
      this.nodeMetadata.set(node, metadata);
    }

    if (cfg.childTranslators && (cfg.childTranslators !== metadata?.translators)) {
      metadata = { ...metadata, translators: cfg.childTranslators }
      this.nodeMetadata.set(node, metadata);
    }

    // Output length before anything is written for this node (used to drop the node entirely)
    const startPosOuter = result.text.length;

    /* Write opening */
    if (cfg.surroundingNewlines) this.appendNewlines(+cfg.surroundingNewlines);
    if (cfg.prefix) this.appendResult(cfg.prefix);

    /* Write inner content */
    if (typeof cfg.content === 'string') this.appendResult(cfg.content, void 0, cfg.spaceIfRepeatingChar);
    else {
      // Output length before the children are written (used to extract / replace their content)
      const startPos = result.text.length;

      // Process child nodes
      for (const child of getChildNodes(node)) this.visitNode(child, (cfg.recurse === false), metadata);

      /* Apply translator post-processing */
      if (cfg.postprocess) {
        const postRes = cfg.postprocess({
          ...(ctx || createTranslatorContext(this, node, metadata)),
          content: result.text.slice(startPos)
        });

        // If remove flag sent, remove / omit everything for this node (prefix, newlines, content, postfix)
        if (postRes === PostProcessResult.RemoveNode) {
          // A removed list item must not consume a number
          if (node.tagName === 'LI' && metadata?.listItemNumber) --metadata.listItemNumber;
          return this.appendResult('', startPosOuter);
        }

        if (typeof postRes === 'string') this.appendResult(postRes, startPos, cfg.spaceIfRepeatingChar);
      }
    }

    /* Write closing */
    if (cfg.postfix) this.appendResult(cfg.postfix);
    if (cfg.surroundingNewlines) this.appendNewlines(+cfg.surroundingNewlines);
  }

  // endregion
}
273 |
274 | // endregion
275 |
276 |
277 | /* ****************************************************************************************************************** */
278 | // region: Utilities
279 | /* ****************************************************************************************************************** */
280 |
/**
 * Translate a node tree to markdown and apply the document-level post-processing
 *
 * @param instance - Instance providing options and translators
 * @param rootNode - Root of the tree to translate
 * @param fileName - Optional file name handed to the visitor
 * @returns Markdown text, passed through `trimNewLines`
 */
export function getMarkdownForHtmlNodes(instance: NodeHtmlMarkdown, rootNode: HtmlNode, fileName?: string): string {
  perfStart('walk');
  const walker = new Visitor(instance, rootNode, fileName);
  let markdown = walker.result.text;
  perfStop('walk');

  /* Post-processing */
  // Add link references, if set
  if (instance.options.useLinkReferenceDefinitions) {
    const lastChar = markdown.slice(-1);
    if (/[^\r\n]/.test(lastChar)) markdown += '\n';

    let refNumber = 0;
    for (const url of walker.urlDefinitions) markdown += `\n[${++refNumber}]: ${url}`;
  }

  // Fixup repeating newlines
  const { maxConsecutiveNewlines } = instance.options;
  if (maxConsecutiveNewlines) {
    const excessNewlines = new RegExp(String.raw`(?:\r?\n\s*)+((?:\r?\n\s*){${maxConsecutiveNewlines}})`, 'g');
    markdown = markdown.replace(excessNewlines, '$1');
  }

  return trimNewLines(markdown);
}
305 |
306 | // endregion
307 |
--------------------------------------------------------------------------------
/test/default-tags-codeblock.test.ts:
--------------------------------------------------------------------------------
1 | // noinspection HtmlUnknownTarget
2 |
3 | import { NodeHtmlMarkdown } from '../src';
4 |
5 |
6 | /* ****************************************************************************************************************** *
7 | * Tests
8 | * ****************************************************************************************************************** */
9 |
10 | // Note: Newline handling for block elements within code blocks is not very clean. This can be fixed later.
11 | describe(`Default Tags`, () => {
12 | let instance: NodeHtmlMarkdown;
13 | const translateAsBlock = (html: string) => instance.translate(`${html}
`);
14 | const getExpected = (s: string) => '```\n' + s + '\n```';
15 | beforeAll(() => {
16 | instance = new NodeHtmlMarkdown();
17 | });
18 |
19 | test(`Line Break (br)`, () => {
20 | const res = translateAsBlock(`a b`);
21 | expect(res).toBe(getExpected(`a\nb`));
22 | });
23 |
24 | test(`Horizontal Rule (hr)`, () => {
25 | const res = translateAsBlock(`a b`);
26 | expect(res).toBe(getExpected(`a\n\n---\n\nb`));
27 | });
28 |
29 | test(`Non-processed Elements (b, strong, del, s, strike, em, i, pre, code, blockquote, a)`, () => {
30 | const tags = [ 'b', 'strong', 'del', 's', 'strike', 'em', 'i', 'code', 'a', 'pre', 'blockquote' ];
31 | const html = tags.map(t => `<${t}>${t}${t}>`).join(' ');
32 | const exp = 'b strong del s strike em i code a \n\npre\n\n blockquote\n\n';
33 |
34 | const res = translateAsBlock(html);
35 | expect(res).toBe(getExpected(exp));
36 | });
37 |
38 | test(`Image (img)`, () => {
39 | const res = translateAsBlock(`a b`);
40 | expect(res).toBe(getExpected(`ab`));
41 | });
42 |
43 | test(`Headings (h1, h2, h3, h4, h5, h6)`, () => {
44 | let nodes: string[] = [];
45 | for (let i = 1; i < 8; i++) nodes.push(`a `);
46 | const res = translateAsBlock(nodes.join(''));
47 | expect(res).toBe(getExpected('\n[a]\n'.repeat(6) + '\na'));
48 | });
49 |
50 | // Note: Newline handling here for block elements is unusual
51 | describe(`Lists (ol + li, ul + li)`, () => {
52 | test(`Multi-level Ordered List`, () => {
53 | const res = translateAsBlock(`
54 |
55 | ab
56 |
57 | b
58 | c d
59 |
60 |
61 |
62 | `);
63 | expect(res).toBe(getExpected(` \n \n1. a \nb\n \n \n2. b \n \n 1. c \n d \n \n * e \n f\n \n `));
64 | });
65 |
66 | test(`Multi-level Unordered List`, () => {
67 | const res = translateAsBlock(`
68 |
69 | ab
70 |
71 | b
72 |
73 | e f
74 |
75 |
76 | `);
77 | expect(res).toBe(getExpected(` \n \n* a \nb\n \n \n* b \n \n * c \n d \n \n 1. e \n f\n \n `));
78 | });
79 | });
80 |
81 | test(`Table`, () => {
82 | const res = translateAsBlock('ab c');
83 | expect(res).toBe(getExpected(`a\nb\nc\n\nX\n\n`));
84 | })
85 | });
86 |
--------------------------------------------------------------------------------
/test/default-tags.test.ts:
--------------------------------------------------------------------------------
1 | // noinspection HtmlUnknownTarget
2 |
3 | import { NodeHtmlMarkdown } from '../src';
4 |
5 |
6 | /* ****************************************************************************************************************** *
7 | * Tests
8 | * ****************************************************************************************************************** */
9 |
10 | describe(`Default Tags`, () => {
11 | let instance: NodeHtmlMarkdown;
12 | const translate = (html: string) => instance.translate(html);
13 | beforeAll(() => {
14 | instance = new NodeHtmlMarkdown();
15 | });
16 |
17 | test(`Line Break (br)`, () => {
18 | const res = translate(`a b`);
19 | expect(res).toBe(`a \nb`);
20 | });
21 |
22 | test(`Horizontal Rule (hr)`, () => {
23 | const res = translate(`a b`);
24 | expect(res).toBe(`a\n\n---\n\nb`);
25 | });
26 |
27 | test(`Bold (b, strong)`, () => {
28 | const res = translate(`ab c d ab c d `);
29 | const exp = `**a~~b~~** \n \n**c** \n**d**`;
30 | expect(res).toBe(exp + ' ' + exp);
31 | });
32 |
33 | test(`Strikethrough (del, s, strike)`, () => {
34 | const res = translate(`ab c dab c d ab c d `);
35 | const exp = `~~a_b_~~ \n \n~~c~~ \n~~d~~`;
36 | expect(res).toBe(exp + ' ' + exp + ' ' + exp);
37 | });
38 |
39 | test(`Italic / Emphasis (em, i)`, () => {
40 | const res = translate(`a b c d a b c d `);
41 | const exp = `_a ~~b~~_ \n \n_c_ \n_d_`;
42 | expect(res).toBe(exp + ' ' + exp);
43 | });
44 |
45 | test(`Link (a)`, () => {
46 | const url = 'http://www.github.com/crosstype';
47 | const specialUrl = 'http://www.github.com/crosstype/**/_test(123)';
48 | const encodedSpecialUrl = 'http://www.github.com/crosstype/%2A%2A/%5Ftest%28123%29';
49 | const res = translate(`
50 | a bc
51 | ab
52 | ${url}
53 |
54 | a nested b
55 | b
56 | `);
57 | expect(res).toBe(`[a b**c**](${url}) a**b** <${url}> [a](${url})[nested](2)b **_[b](${encodedSpecialUrl} "a")_** `);
58 | });
59 |
60 | test(`Image (img)`, () => {
61 | const url = `http://www.github.com/crosstype/`
62 | const res = translate(`
63 |
64 |
65 |
66 |
67 |
68 | `);
69 | expect(res).toBe(`` + ` ` + `  `);
70 | });
71 |
72 | test(`Pre-formatted Text (pre)`, () => {
73 | const str = `* test \t\n1. test\n\\Test`;
74 | const res = translate(`${str}# hello `);
75 | expect(res).toBe(str + ' \n**# hello**');
76 | });
77 |
78 | test(`Block Quote (blockquote)`, () => {
79 | const res = translate(`a b cdef `);
80 | expect(res).toBe(`> a \n> b \n> c\n> \n>> def`);
81 | });
82 |
83 | test(`Headings (h1, h2, h3, h4, h5, h6)`, () => {
84 | const res = translate(
85 | `ab ab ab ab ab ab `
86 | );
87 | expect(res).toBe(Array.from(Array(6), (v, i) => `#`.repeat(i + 1) + ` a**b**\n\n`).join('').trim());
88 | });
89 |
90 | test(`Code (code)`, () => {
91 | const res = translate('```` a \n\nb\n* c
d
');
92 | expect(res).toBe('````` ```` a b * c ````` `d`');
93 | });
94 |
95 | describe(`Code-block (pre + code)`, () => {
96 | const str = `* test \n\n1. test\n\\Test`;
97 |
98 | test(`Fenced`, () => {
99 | const res = translate(`${str}
${str}
`);
100 | expect(res).toBe('```fortran\n' + str + '\n```\n\n```\n' + str + '\n```');
101 | });
102 |
test(`Indented`, () => {
  // Remember the configured style so later tests on the shared instance are not affected
  const originalCodeBlockStyle = instance.options.codeBlockStyle;
  instance.options.codeBlockStyle = 'indented';

  const res = translate(`${str}
${str}
`);
  const exp = str.replace(/^/gm, ' ');
  expect(res).toBe(exp + '\n\n' + exp);

  // Restore the option that was changed above. The saved value used to be written to `codeFence`, which left
  // `codeBlockStyle` on 'indented' and replaced the fence string with a style name.
  instance.options.codeBlockStyle = originalCodeBlockStyle;
});
113 | });
114 |
115 | describe(`Lists (ol + li, ul + li)`, () => {
116 | test(`Multi-level Ordered List`, () => {
117 | const res = translate(`
118 |
119 | ab
120 |
121 | b
122 | c d
123 |
124 |
125 |
126 | `);
127 | expect(res).toBe(`1. a \n \n~~b~~\n2. b \n 1. c \n d \n * e \n f`);
128 | });
129 |
130 | test(`Multi-level Unordered List`, () => {
131 | const res = translate(`
132 |
133 | ab
134 |
135 | b
136 |
137 | e f
138 |
139 |
140 | `);
141 | expect(res).toBe(`* a \n \n~~b~~\n* b \n * c \n d \n 1. e \n f`);
142 | });
143 |
144 | test(`List item with block content`, () => {
145 | const res = translate(`a`);
146 | expect(res).toBe(`*  \na`);
147 | });
148 | });
149 | });
150 |
--------------------------------------------------------------------------------
/test/options.test.ts:
--------------------------------------------------------------------------------
1 | // noinspection RegExpUnnecessaryNonCapturingGroup,HtmlUnknownTarget
2 |
3 | import { NodeHtmlMarkdown } from '../src';
4 |
5 |
6 | /* ****************************************************************************************************************** *
7 | * Options Tests
8 | * ****************************************************************************************************************** */
9 |
10 | describe(`Options`, () => {
11 | let instance: NodeHtmlMarkdown;
12 | const translate = (html: string) => instance.translate(html);
13 | beforeAll(() => {
14 | instance = new NodeHtmlMarkdown();
15 | });
16 |
17 | test(`codeFence`, () => {
18 | const originalCodeFence = instance.options.codeFence;
19 | const str = `* test \n\n1. test\n\\Test`;
20 | const html = `${str}
`;
21 |
22 | const resDefaultFence = translate(html);
23 | expect(resDefaultFence).toBe('```fortran\n' + str + '\n```');
24 |
25 | instance.options.codeFence = `+++++`;
26 | const resFencePlus = translate(html);
27 | expect(resFencePlus).toBe('+++++fortran\n' + str + '\n+++++');
28 |
29 | instance.options.codeFence = `?`;
30 | const resFence1Char = translate(html);
31 | expect(resFence1Char).toBe('?fortran\n' + str + '\n?');
32 |
33 | instance.options.codeFence = originalCodeFence;
34 | });
35 |
36 | test(`bulletMarker`, () => {
37 | const originalBulletMarker = instance.options.bulletMarker;
38 | const html = ``;
39 |
40 | const resDefaultMarker = translate(html);
41 | expect(resDefaultMarker).toBe(`* item1
42 | * item2`);
43 |
44 | instance.options.bulletMarker = '-';
45 | const resDashMarker = translate(html);
46 | expect(resDashMarker).toBe(`- item1
47 | - item2`);
48 |
49 | instance.options.bulletMarker = '<->';
50 | const resWideMarker = translate(html);
51 | expect(resWideMarker).toBe(`<-> item1
52 | <-> item2`);
53 | instance.options.bulletMarker = originalBulletMarker;
54 | });
55 |
test(`codeBlockStyle`, () => {
  // Remember the configured style so later tests on the shared instance are not affected
  const originalCodeBlockStyle = instance.options.codeBlockStyle;
  const html = `line1\nline2
`;

  instance.options.codeBlockStyle = 'fenced';
  const resFenced = translate(html);
  expect(resFenced).toBe('```\nline1\nline2\n```');

  instance.options.codeBlockStyle = 'indented';
  const resIndented = translate(html);
  expect(resIndented).toBe('line1\nline2'.replace(/^/gm, ' '));

  // Restore the option that was changed above. The saved value used to be written to `codeFence`, which left
  // `codeBlockStyle` on 'indented' and replaced the fence string with a style name.
  instance.options.codeBlockStyle = originalCodeBlockStyle;
});
70 |
71 | test(`emDelimiter`, () => {
72 | const originalEmDelimiter = instance.options.emDelimiter;
73 | const html = `some text more text `;
74 |
75 | const resDefaultEmDelimiter = translate(html);
76 | expect(resDefaultEmDelimiter).toBe(`_some text_ _more text_`);
77 |
78 | instance.options.emDelimiter = '|';
79 | const resShortEmDelimiter = translate(`some text more text `);
80 | expect(resShortEmDelimiter).toBe(`|some text| |more text|`);
81 |
82 | instance.options.emDelimiter = '+++';
83 | const resWideEmDelimiter = translate(`some text more text `);
84 | expect(resWideEmDelimiter).toBe(`+++some text+++ +++more text+++`);
85 | instance.options.emDelimiter = originalEmDelimiter;
86 | });
87 |
88 | test(`strongDelimiter`, () => {
89 | const originalStrongDelimiter = instance.options.strongDelimiter;
90 | const html = `some text more text `;
91 |
92 | const resDefaultStrongDelimiter = translate(html);
93 | expect(resDefaultStrongDelimiter).toBe(`**some text** **more text**`);
94 |
95 | instance.options.strongDelimiter = '|';
96 | const resShortStrongDelimiter = translate(html);
97 | expect(resShortStrongDelimiter).toBe(`|some text| |more text|`);
98 |
99 | instance.options.strongDelimiter = '+++';
100 | const resWideStrongDelimiter = translate(html);
101 | expect(resWideStrongDelimiter).toBe(`+++some text+++ +++more text+++`);
102 | instance.options.strongDelimiter = originalStrongDelimiter;
103 | });
104 |
105 |
106 | test(`strikeDelimiter`, () => {
107 | const originalStrikeDelimiter = instance.options.strikeDelimiter;
108 | const html = `some text more text one more text`;
109 |
110 | const resDefaultStrikeDelimiter = translate(html);
111 | expect(resDefaultStrikeDelimiter).toBe(`~~some text~~ ~~more text~~ ~~one more text~~`);
112 |
113 | instance.options.strikeDelimiter = '~';
114 | const resShortStrikeDelimiter = translate(html);
115 | expect(resShortStrikeDelimiter).toBe(`~some text~ ~more text~ ~one more text~`);
116 |
117 | instance.options.strikeDelimiter = '+++';
118 | const resWideStrikeDelimiter = translate(html);
119 | expect(resWideStrikeDelimiter).toBe(`+++some text+++ +++more text+++ +++one more text+++`);
120 | instance.options.strikeDelimiter = originalStrikeDelimiter;
121 | });
122 |
123 | test(`ignore`, () => {
124 | const strongEmHTML = `some text more text `;
125 |
126 | const instanceIgnore = new NodeHtmlMarkdown({
127 | ignore: ['STRONG']
128 | });
129 | const resNoStrong = instanceIgnore.translate(strongEmHTML);
130 | expect(resNoStrong).toBe(`_more text_`);
131 |
132 | const instanceIgnoreEm = new NodeHtmlMarkdown({
133 | ignore: ['EM']
134 | });
135 | const resNoEm = instanceIgnoreEm.translate(strongEmHTML);
136 | expect(resNoEm).toBe(`**some text**`);
137 |
138 | const instanceIgnoreBoth = new NodeHtmlMarkdown({
139 | ignore: ['EM', 'STRONG']
140 | });
141 | const resNoEmStrong = instanceIgnoreBoth.translate(strongEmHTML);
142 | expect(resNoEmStrong).toBe(``);
143 |
144 | const instanceIgnoreMiss = new NodeHtmlMarkdown({
145 | ignore: ['UL', 'H1']
146 | });
147 | const resWithAll = instanceIgnoreMiss.translate(strongEmHTML);
148 | expect(resWithAll).toBe(`**some text**_more text_`);
149 | });
150 |
151 | test(`blockElements`, () => {
152 | const html = `x yyy x text `;
153 | const instanceStrongBlock = new NodeHtmlMarkdown({
154 | blockElements: ['STRONG']
155 | });
156 | const resStrongBlock = instanceStrongBlock.translate(html);
157 | expect(resStrongBlock).toBe(`_x_
158 |
159 | **yyy**
160 |
161 | _x_text`);
162 |
163 | const instanceEmBlock = new NodeHtmlMarkdown({
164 | blockElements: ['EM']
165 | });
166 | const resEmBlock = instanceEmBlock.translate(html);
167 | expect(resEmBlock).toBe(`_x_
168 |
169 | **yyy**
170 |
171 | _x_
172 |
173 | text`);
174 | });
175 |
176 | test(`maxConsecutiveNewlines`, () => {
177 | const originalMaxConsecutiveNewlines = instance.options.maxConsecutiveNewlines;
178 | const html = `text ${' '.repeat(10)}something `;
179 |
180 | const resDefaultMaxNewLines = translate(html);
181 | expect(resDefaultMaxNewLines).toBe(`**text**${' \n'.repeat(3)}_something_`);
182 |
183 | instance.options.maxConsecutiveNewlines = 5;
184 | const res5MaxNewLines = translate(html);
185 | expect(res5MaxNewLines).toBe(`**text**${' \n'.repeat(5)}_something_`);
186 |
187 | instance.options.maxConsecutiveNewlines = 10;
188 | const res10MaxNewLines = translate(html);
189 | expect(res10MaxNewLines).toBe(`**text**${' \n'.repeat(10)}_something_`);
190 |
191 | instance.options.maxConsecutiveNewlines = originalMaxConsecutiveNewlines;
192 | });
193 |
194 | test(`lineStartEscape`, () => {
195 | const originalLineStartEscape = instance.options.lineStartEscape;
196 |
197 | const resEscapedPlus = translate(`text + text + more text
`);
198 | expect(resEscapedPlus).toBe("text \n\\+ text \n\\+ more text");
199 |
200 | const resEscapedQuote = translate(`text > text > more text
`);
201 | expect(resEscapedQuote).toBe("text \n\\> text \n\\> more text");
202 |
203 | // No escape for +
204 | instance.options.lineStartEscape = [/^(\s*?)((?:[=>-])|(?:#{1,6}\s))|(?:(\d+)(\.\s))/gm, '$1$3\\$2$4'];
205 |
206 | const resNotEscapedPlus = translate(`text + text + more text
`);
207 | expect(resNotEscapedPlus).toBe("text \n+ text \n+ more text");
208 |
209 | // No escape also for >
210 | instance.options.lineStartEscape = [/^(\s*?)((?:#{1,6}\s))|(?:(\d+)(\.\s))/gm, '$1$3\\$2$4'];
211 |
212 | const resNotEscapedQuote = translate(`text > text > more text
`);
213 | expect(resNotEscapedQuote).toBe("text \n> text \n> more text");
214 |
215 | instance.options.lineStartEscape = originalLineStartEscape;
216 | });
217 |
218 | test(`globalEscape`, () => {
219 | const originalGlobalEscape = instance.options.globalEscape;
220 |
221 | const resEscapedStar = translate(`text**text `);
222 | expect(resEscapedStar).toBe("**text\\*\\*text**");
223 |
224 | // No escape for star
225 | instance.options.globalEscape = [ /[_~\[\]]/gm, '\\$&' ];
226 |
227 | const resNotEscapedStar = translate(`text**text `);
228 | expect(resNotEscapedStar).toBe("_text**text_");
229 |
230 | const resEscapedBrackets = translate(`title [more words] `);
231 | expect(resEscapedBrackets).toBe("# title \\[more words\\]");
232 |
233 | // No escape also for brackets
234 | instance.options.globalEscape = [ /[_~]/gm, '\\$&' ];
235 | const resNotEscapedBrackets = translate(`title [more words] `);
236 | expect(resNotEscapedBrackets).toBe("# title [more words]");
237 |
238 | instance.options.globalEscape = originalGlobalEscape;
239 | });
240 |
241 | test(`textReplace`, () => {
242 | const originalReplace = instance.options.textReplace;
243 |
244 | instance.options.textReplace = [[/abc/g, "xyz"]];
245 | const replaced = translate('hello abc ');
246 | expect(replaced).toBe(`# hello xyz`);
247 |
248 | instance.options.textReplace = [[/hello/g, "X"]];
249 | const replaced2 = translate('hello abc ');
250 | expect(replaced2).toBe(`# X abc`);
251 |
252 | instance.options.textReplace = originalReplace;
253 | });
254 |
255 | test(`keepDataImages`, () => {
256 | const originalKeepDataImages = instance.options.keepDataImages;
257 |
258 | instance.options.keepDataImages = true;
259 | const resKeep = translate(`
260 | `);
261 | expect(resKeep).toBe(` `);
262 |
263 | instance.options.keepDataImages = false;
264 | const resNoKeep = translate(`
265 | `);
266 | expect(resNoKeep).toBe(` `);
267 |
268 | instance.options.keepDataImages = originalKeepDataImages;
269 | });
270 |
271 | test(`useLinkReferenceDefinitions`, () => {
272 | const originalUseLinkReferenceDefinitions = instance.options.useLinkReferenceDefinitions;
273 | // NOTE(review): the anchor markup inside `html` appears stripped in this copy — confirm against upstream
274 | const url = 'http://www.github.com/crosstype';
275 | const html = `Hello:
276 | a bc
277 | ab
278 | link2
279 | repeat link
280 | ${url} Goodbye!
281 | `;
282 | // Disabled: every link keeps its URL inline
283 | instance.options.useLinkReferenceDefinitions = false;
284 | let res = translate(html);
285 | expect(res).toBe(
286 | `Hello: [a b**c**](${url}) a**b** [link2](${url}/other) [repeat link](${url}) <${url}> Goodbye!`
287 | );
288 | // Enabled: links become numbered references ([1] is reused for the repeated URL), definitions are appended
289 | instance.options.useLinkReferenceDefinitions = true;
290 | res = translate(html);
291 | expect(res).toBe(
292 | `Hello: [a b**c**][1] a**b** [link2][2] [repeat link][1] <${url}> Goodbye!\n\n[1]: ${url}\n[2]: ${url}/other`
293 | );
294 | // Put the original option back on the instance
295 | instance.options.useLinkReferenceDefinitions = originalUseLinkReferenceDefinitions;
296 | });
297 |
298 | test(`useInlineLinks`, () => {
299 | const originalUseInlineLinks = instance.options.useInlineLinks;
300 | // NOTE(review): the anchor markup inside `html` appears stripped in this copy — confirm against upstream
301 | const url = 'http://www.github.com/crosstype';
302 | const html = `Hello:
303 | ${url}
304 | ab
305 | link2
306 | repeat link Goodbye!
307 | `;
308 | // Disabled: a link whose text equals its URL is still written as [text](url)
309 | instance.options.useInlineLinks = false;
310 | let res = translate(html);
311 | expect(res).toBe(`Hello: [${url}](${url}) a**b** [link2](${url}/other) [repeat link](${url}) Goodbye!`);
312 | // Enabled: that same link collapses to the <url> form
313 | instance.options.useInlineLinks = true;
314 | res = translate(html);
315 | expect(res).toBe(
316 | `Hello: <${url}> a**b** [link2](${url}/other) [repeat link](${url}) Goodbye!`
317 | );
318 | // Restore the option this test changed (previously wrote into useLinkReferenceDefinitions by mistake)
319 | instance.options.useInlineLinks = originalUseInlineLinks;
320 | });
321 | });
322 |
--------------------------------------------------------------------------------
/test/special-cases.test.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdown } from '../src';
2 |
3 |
4 | /* ****************************************************************************************************************** *
5 | * Config
6 | * ****************************************************************************************************************** */
7 |
8 | const textFormatTags = [ 'strong', 'b', 'del', 's', 'strike', 'em', 'i' ] as const;
9 | const getDelims = (instance: NodeHtmlMarkdown) => Object.fromEntries(textFormatTags.map(t => [ // one [tag, delimiter] pair per entry of textFormatTags
10 | t,
11 | (() => { // immediately-invoked so the switch can be used in expression position
12 | switch (t) {
13 | case 'strong':
14 | case 'b':
15 | return instance.options.strongDelimiter;
16 | case 'del':
17 | case 's':
18 | case 'strike':
19 | return instance.options.strikeDelimiter;
20 | case 'em':
21 | case 'i':
22 | return instance.options.emDelimiter;
23 | }
24 | })()
25 | ]));
26 |
27 |
28 | /* ****************************************************************************************************************** *
29 | * Tests
30 | * ****************************************************************************************************************** */
31 |
32 | describe(`Special Cases`, () => {
33 | let instance: NodeHtmlMarkdown;
34 | let delims: ReturnType;
35 | const translate = (html: string) => instance.translate(html);
36 | beforeAll(() => {
37 | instance = new NodeHtmlMarkdown();
38 | delims = getDelims(instance);
39 | });
40 |
41 | test(`Removes uncaught Doctype`, () => {
42 | const res = translate(`abc`);
43 | expect(res).toBe(`abc`);
44 | });
45 |
46 | describe(`Whitespace handled for leading / trailing whitespace in tags`, () => {
47 | test.each(textFormatTags)(`%s`, tag => {
48 | const delim = delims[tag];
49 |
50 | expect(translate(`<${tag}> Label: ${tag}>Value
`)).toBe(` ${delim}Label:${delim} Value`);
51 | expect(translate(`<${tag}> Label: ${tag}>Value
`)).toBe(` ${delim}Label:${delim} Value`);
52 | });
53 | });
54 |
55 | // See: https://github.com/crosstype/node-html-markdown/issues/18
56 | describe(`Removes nested text formatting tags`, () => {
57 | test.each(textFormatTags)(`%s`, tag => {
58 | const delim = delims[tag];
59 |
60 | expect(translate(`<${tag}>My <${tag}>bold${tag}> text${tag}>`)).toBe(
61 | `${delim}My bold text${delim}`
62 | );
63 | });
64 | });
65 |
66 | // See: https://github.com/crosstype/node-html-markdown/issues/16
67 | // See: https://github.com/crosstype/node-html-markdown/issues/21
68 | test(`Handles whitespace with single space`, () => {
69 | const res = translate(`test test2 \ntest3 \r\n\r\n\t\t\ttest4 \ttest5\r\n\n\n\t\ttest6 `);
70 | expect(res).toBe(`test test2 test3 test4 test5 test6`);
71 | });
72 |
73 | // See: https://github.com/crosstype/node-html-markdown/issues/19
74 | test(`Childless nodes visited if preserveIfEmpty set`, () => {
75 | const html = `Hello World `;
76 |
77 | let res = NodeHtmlMarkdown.translate(html, void 0, { iframe: { content:'[iframe]' } });
78 | expect(res).toBe(`HelloWorld`);
79 |
80 | res = NodeHtmlMarkdown.translate(html, void 0, { iframe: { content:'[iframe]', preserveIfEmpty: true } });
81 | expect(res).toBe(`Hello[iframe]World`);
82 | });
83 |
84 | // See: https://github.com/crosstype/node-html-markdown/issues/20
85 | // See: https://github.com/crosstype/node-html-markdown/issues/22
86 | test(`Code blocks preserve whitespace & decode entities`, () => {
87 | const html =
88 | ` \n` +
89 | `function getURL(s: string): string {\n ` +
90 | ` return \`https://myurl.com/\${s}\`; \n` +
91 | `} ` +
92 | `
`;
93 | const expected =
94 | '```\n' +
95 | `// > Get URL Path\n` +
96 | `function getURL(s: string): string {\n` +
97 | ` return \`https://myurl.com/\${s}\`;\n` +
98 | `}\n` +
99 | '```';
100 |
101 | const res = translate(html);
102 | expect(res).toBe(expected);
103 | });
104 | });
105 |
--------------------------------------------------------------------------------
/test/table.test.ts:
--------------------------------------------------------------------------------
1 | import { NodeHtmlMarkdown } from '../src';
2 |
3 |
4 | /* ****************************************************************************************************************** *
5 | * Tests
6 | * ****************************************************************************************************************** */
7 |
8 | describe(`Table`, () => {
9 | let instance: NodeHtmlMarkdown;
10 | const translate = (html: string) => instance.translate(html);
11 | beforeAll(() => {
12 | instance = new NodeHtmlMarkdown();
13 | });
14 |
15 | test(`Single row, Single column table`, () => {
16 | const expected = `| col1 |\n| ---- |`;
17 |
18 | expect(translate(` `)).toBe(expected);
19 | expect(translate(``)).toBe(expected);
20 | expect(translate(``)).toBe(expected);
21 | });
22 |
23 | test(`Single row table`, () => {
24 | const expected = `| col1 | col2 |\n| ---- | ---- |`;
25 |
26 | expect(translate(``)).toBe(expected);
27 | expect(translate(``)).toBe(expected);
28 | expect(translate(``)).toBe(expected);
29 | });
30 |
31 | test(`Table with caption`, () => {
32 | const expected =
33 | `__Hello__\n` +
34 | `| col1 | col2 |\n` +
35 | `| ---- | ---- |`;
36 |
37 | expect(translate(``)).toBe(expected);
38 | expect(translate(``)).toBe(expected);
39 | });
40 |
41 | describe(`Special Cases`, () => {
42 | test(`"|" is escaped`, () => {
43 | expect(translate(``)).toBe(`| A\\|B |\n| ---- |`);
44 | });
45 |
46 | test(`Pads cells`, () => {
47 | const html = `
48 | abc def ghi
49 | abc1 def123 ghi1234567
50 | a def1234 c
51 |
`;
52 | const expected =
53 | `| abc | def | ghi |\n` +
54 | `| ---- | ------- | ---------- |\n` +
55 | `| abc1 | def123 | ghi1234567 |\n` +
56 | `| a | def1234 | c |`;
57 |
58 | expect(translate(html)).toBe(expected);
59 | });
60 |
61 | test(`Nested tables are not supported`, () => {
62 | const html = ``;
63 | expect(translate(html)).toBe(`| nested | abc |\n| ------ | --- |`);
64 | });
65 |
66 | test(`Supports inline tags + mismatched rows`, () => {
67 | const html = `
68 |
69 |
70 |
71 | COL1
72 | C
73 | O
74 | L2
75 |
76 |
77 |
78 |
79 | b
80 | i
81 | a
82 |
83 |
84 |
85 |
86 |
87 | h1
88 |
89 |
90 |
91 | `;
92 |
93 | const expected =
94 | `| COL1 | C O L2 | | |\n` +
95 | `| ----- | ------ | --------- | --------- |\n` +
96 | `| **b** | _i_ | [a](link) |  |\n` +
97 | `| list | | h1 | |`;
98 |
99 | expect(translate(html)).toBe(expected);
100 | });
101 | });
102 | });
103 |
--------------------------------------------------------------------------------
/test/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../tsconfig.base.json",
3 |
4 | "compilerOptions": {
5 | "types": [ "jest", "node" ],
6 | "noEmit": true,
7 | "incremental": false,
8 | "target": "ES2018"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/transformer.js:
--------------------------------------------------------------------------------
1 | // Remove perf timing statements from build
2 | module.exports = function (program, cfg, { ts }) {
3 | return (ctx) => {
4 | return function visit(node) {
5 | if (process.env.CI || !cfg.removePerf) return node;
6 | // Remove the functions
7 | if (ts.isFunctionDeclaration(node)
8 | && [ 'perfStart', 'perfStop' ].includes(node.name.escapedText))
9 | return undefined
10 |
11 | // Remove the invokes
12 | if (
13 | ts.isExpressionStatement(node) && ts.isCallExpression(node.expression) &&
14 | [ 'perfStart', 'perfStop' ].includes(node.expression.expression.text)
15 | )
16 | return undefined;
17 |
18 | return ts.visitEachChild(node, visit, ctx);
19 | }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/tsconfig.base.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "noErrorTruncation": false,
4 | "incremental": true,
5 |
6 | "lib": [ "esnext", "dom" ],
7 | "target": "ES2017",
8 | "module": "CommonJS",
9 | "moduleResolution": "node",
10 |
11 | "strict": true,
12 | "declaration": true,
13 | "preserveConstEnums": true,
14 | "removeComments": false,
15 | "sourceMap": true,
16 | "allowSyntheticDefaultImports": true,
17 | "esModuleInterop": true,
18 | "experimentalDecorators": true,
19 | "emitDecoratorMetadata": true,
20 | "preserveSymlinks": true,
21 | "stripInternal": true
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "./tsconfig.base",
3 | "include": [ "src" ],
4 |
5 | "compilerOptions": {
6 | "rootDir": "src",
7 | "outDir": "dist",
8 | "sourceMap": true,
9 | "plugins": [
10 | {
11 | "transform": "./transformer.js",
12 | "removePerf": true
13 | }
14 | ]
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
Join the Nation's Conversation
To find out more about Facebook commenting please read the Conversation Guidelines and FAQs