├── .editorconfig
├── .gitignore
├── CHANGELOG.md
├── README.md
├── UNLICENSE
├── cmd.js
├── index.js
├── package-lock.json
├── package.json
├── test.html
└── test.js
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | charset = utf-8
5 | end_of_line = lf
6 | indent_size = 4
7 | indent_style = space
8 | insert_final_newline = true
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .idea/
3 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | 2.0.0
2 | -----
3 |
4 | * New options to prevent clobbering Angular files, thanks to @joeyparrish:
5 | * allow-attributes-without-values
6 | * lower-case-tags
7 | * lower-case-attribute-names
8 | * I'm not sure when the CLI broke but it works again.
9 | * The replace-nbsp option has been renamed to decode-entities.
10 | * Script and style tags are no longer removed with the new preserve-tags option.
11 | Fixes #12, #13 and #19.
12 |
13 | 1.5.0
14 | -----
15 |
16 | Regular expressions are now supported in the remove-attributes,
17 | remove-empty-tags and remove-tags options. Thanks, @smnbbrv!
18 |
19 | 1.4.3
20 | -----
21 |
22 | Extra spaces are now removed from attribute values.
23 |
24 | 1.4.2
25 | -----
26 |
27 | Hanging indent is now applied to wrapped lines.
28 |
29 | Multiline comments are now squashed into a single line, just like text. This
30 | makes wrapping them easier and simplifies how conditional comments are handled.
31 |
32 | 1.4.1
33 | -----
34 |
35 | Maximum call stack error when trying to wrap lines without spaces has been
36 | fixed.
37 |
38 | Support for conditional comments has been added.
39 |
40 | Trying to preserve CSS and JavaScript formatting is a pain, so style and
41 | script tags are no longer supported in this release. They will simply be
42 | removed from the output.
43 |
44 | 1.4.0
45 | -----
46 |
47 | The license has been switched from ISC to [Unlicense](http://unlicense.org).
48 |
49 | 1.3.8
50 | -----
51 |
52 | The htmlparser2 and minimist dependencies have been updated.
53 |
54 | 1.3.7
55 | -----
56 |
57 | Up until now, this thing really only supported cleaning fragments of HTML. If
58 | you tried to feed it an entire HTML page (with doctype declaration, style
59 | tags, script tags, etc.) it would blow up.
60 |
61 | Thanks in part to @RonanDrouglazet, this embarrassing oversight has been
62 | addressed. However, I have no intention of turning this into a CSS or
63 | JavaScript cleaner/formatter. Anything found within a style or script tag will
64 | be output as is.
65 |
66 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HTML cleaner and beautifier
2 |
3 | 
4 | 
5 | 
6 | 
7 |
8 | ## Usage
9 |
10 | ### In a script
11 |
12 | ```javascript
13 | const cleaner = require('clean-html');
14 | const fs = require('fs');
15 |
16 | fs.readFile('foo.html', 'utf8', (err, input) => {
17 | cleaner.clean(input, output => console.log(output));
18 | });
19 | ```
20 |
21 | Options can be provided like so:
22 |
23 | ```
24 | const options = {
25 | 'break-around-comments': false,
26 | 'decode-entities': true,
27 | 'remove-tags': ['b', 'i', 'center', 'font'],
28 | 'wrap': 80
29 | };
30 |
31 | cleaner.clean(input, options, output => {...});
32 | ```
33 |
34 | ### From the command line
35 |
36 | If installed globally, just run `clean-html`. Otherwise, run `npx clean-html`.
37 |
38 | Input can be piped from stdin:
39 |
40 | ```
41 | $ echo '<p>Hello, World!</p>' | clean-html
42 | $ cat foo.html | clean-html
43 | ```
44 |
45 | Or you can provide a filename as the first argument:
46 |
47 | ```
48 | $ clean-html foo.html
49 | ```
50 |
51 | Output can be redirected to another file:
52 |
53 | ```
54 | $ clean-html foo.html > bar.html
55 | ```
56 |
57 | Or you can edit the file in place:
58 |
59 | ```
60 | $ clean-html foo.html --in-place
61 | ```
62 |
63 | Other options can be provided like so:
64 |
65 | ```
66 | $ clean-html foo.html \
67 | --break-around-comments \
68 | --decode-entities false \
69 | --remove-tags b,i,center,font \
70 | --wrap 80
71 | ```
72 |
73 | > Array type option values should be separated by commas. Boolean type options are disabled if
74 | > followed by `false` and enabled if followed by `true` or nothing.
75 |
76 | ## Options
77 |
78 | ### allow-attributes-without-values
79 |
80 | Allows attributes to be output without values. For example, `checked` instead of `checked=""`.
81 |
82 | Please set to `true` for Angular components or for elements that rely on boolean attributes such as `checked`.
83 |
84 | Type: Boolean
85 | Default: `false`
86 |
87 | ### break-around-comments
88 |
89 | Adds line breaks before and after comments.
90 |
91 | Type: Boolean
92 | Default: `true`
93 |
94 | ### break-around-tags
95 |
96 | Tags that should have line breaks added before and after.
97 |
98 | Type: Array of strings
99 | Default: `['body', 'blockquote', 'br', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr',
100 | 'link', 'meta', 'p', 'table', 'title', 'td', 'tr']`
101 |
102 | ### decode-entities
103 |
104 | Replaces HTML entities with their decoded equivalents. e.g., if `true` then `&nbsp;` will be
105 | replaced by a space character.
106 |
107 | Type: Boolean
108 | Default: `false`
109 |
110 | ### indent
111 |
112 | The string to use for indentation. e.g., a tab character or one or more spaces.
113 |
114 | Type: String
115 | Default: `'  '` (two spaces)
116 |
117 | ### lower-case-tags
118 |
119 | Converts all tag names to lower case.
120 |
121 | Please set to `false` for Angular components.
122 |
123 | Type: Boolean
124 | Default: `true`
125 |
126 | ### lower-case-attribute-names
127 |
128 | Converts all attribute names to lower case.
129 |
130 | Please set to `false` for Angular components.
131 |
132 | Type: Boolean
133 | Default: `true`
134 |
135 | ### preserve-tags
136 |
137 | Tags that should be left alone. i.e., content inside these tags will not be formatted or indented.
138 |
139 | Type: Array of strings
140 | Default: `['math', 'script', 'style', 'svg']`
141 |
142 | ### remove-attributes
143 |
144 | Attributes to remove from markup.
145 |
146 | Type: Array of strings or regular expressions
147 | Default: `['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'color', 'height', 'target',
148 | 'valign', 'width']`
149 |
150 | ### remove-comments
151 |
152 | Removes comments.
153 |
154 | Type: Boolean
155 | Default: `false`
156 |
157 | ### remove-empty-tags
158 |
159 | Tags to remove from markup if empty.
160 |
161 | Type: Array of strings or regular expressions
162 | Default: `[]`
163 |
164 | ### remove-tags
165 |
166 | Tags to always remove from markup. Nested content is preserved.
167 |
168 | Type: Array of strings or regular expressions
169 | Default: `['center', 'font']`
170 |
171 | ### wrap
172 |
173 | The column number where lines should wrap. Set to 0 to disable line wrapping.
174 |
175 | Type: Integer
176 | Default: `120`
177 |
178 | ## Adding values to option lists
179 |
180 | These options exist for your convenience.
181 |
182 | ### add-break-around-tags
183 |
184 | Additional tags to include in `break-around-tags`.
185 |
186 | Type: Array of strings
187 | Default: `null`
188 |
189 | ### add-remove-attributes
190 |
191 | Additional attributes to include in `remove-attributes`.
192 |
193 | Type: Array of strings
194 | Default: `null`
195 |
196 | ### add-remove-tags
197 |
198 | Additional tags to include in `remove-tags`.
199 |
200 | Type: Array of strings
201 | Default: `null`
202 |
--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to
25 |
--------------------------------------------------------------------------------
/cmd.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | const fs = require('node:fs');
4 |
5 | const parseArgs = require('minimist');
6 |
7 | const cleaner = require('./index.js');
8 |
// Everything after `node cmd.js` is handed to the argument parser; positional
// arguments are read from `argv._`.
const argv = parseArgs(process.argv.slice(2));
const [filename] = argv._;

// Accessors so that each flag name is spelled only once below.
const boolOpt = name => getOptAsBool(argv[name]);
const listOpt = name => getOptAsArray(argv[name]);

const inPlace = boolOpt('in-place');

// Flags that were not passed come back as undefined from the getOptAs*
// helpers, so they are forwarded to the cleaner as undefined.
const options = {
    'allow-attributes-without-values': boolOpt('allow-attributes-without-values'),
    'break-around-comments': boolOpt('break-around-comments'),
    'break-around-tags': listOpt('break-around-tags'),
    'decode-entities': boolOpt('decode-entities'),
    'indent': argv['indent'],
    'lower-case-tags': boolOpt('lower-case-tags'),
    'lower-case-attribute-names': boolOpt('lower-case-attribute-names'),
    'preserve-tags': listOpt('preserve-tags'),
    'remove-attributes': listOpt('remove-attributes'),
    'remove-comments': boolOpt('remove-comments'),
    'remove-empty-tags': listOpt('remove-empty-tags'),
    'remove-tags': listOpt('remove-tags'),
    'wrap': getOptAsInt(argv['wrap']),
    'add-break-around-tags': listOpt('add-break-around-tags'),
    'add-remove-attributes': listOpt('add-remove-attributes'),
    'add-remove-tags': listOpt('add-remove-tags')
};
31 |
/**
 * Normalizes a list-type command line option into an array of strings.
 *
 * Accepts a single value or an array of values (e.g. when the flag is
 * repeated). Each value is split on commas, so `b,i` and two separate `b` and
 * `i` values both produce `['b', 'i']`.
 *
 * @param {string|number|boolean|Array<string|number|boolean>|undefined} opt
 *     Raw value from the argument parser.
 * @returns {string[]|undefined} The flattened list, or undefined when the
 *     option was not given or carried no value.
 */
function getOptAsArray(opt) {
    if (opt === undefined) {
        return undefined;
    }

    // A boolean means the flag was passed without a value; it names nothing,
    // so it is dropped instead of crashing on `.split` or producing a bogus
    // entry called "true".
    const values = (Array.isArray(opt) ? opt : [opt])
        .filter(value => typeof value !== 'boolean');

    if (values.length === 0) {
        return undefined;
    }

    // String() covers numeric-looking values that arrive as numbers.
    return values.flatMap(value => String(value).split(','));
}
45 |
/**
 * Interprets a boolean-type command line option.
 *
 * @param {*} opt Raw value from the argument parser.
 * @returns {boolean|undefined} undefined when the option was not given;
 *     true for `true` or the string `'true'`; false for anything else.
 */
function getOptAsBool(opt) {
    if (opt === undefined) {
        return undefined;
    }

    return [true, 'true'].includes(opt);
}
53 |
/**
 * Interprets an integer-type command line option.
 *
 * @param {*} opt Raw value from the argument parser.
 * @returns {number|undefined} The parsed base-10 integer, or undefined when
 *     the option was not given or does not start with a decimal number.
 */
function getOptAsInt(opt) {
    if (opt === undefined) {
        return undefined;
    }

    // Explicit radix: a column count is always decimal, so "0x10" must not be
    // read as hexadecimal 16.
    const val = Number.parseInt(opt, 10);

    return Number.isNaN(val) ? undefined : val;
}
63 |
/**
 * Reads a file (or file descriptor) as UTF-8 text and passes the contents on.
 *
 * @param {string|number} filename Path or file descriptor to read from.
 * @param {function(string): void} callback Receives the file contents.
 * @throws Rethrows any read error from inside the completion handler.
 */
function read(filename, callback) {
    const onRead = (err, data) => {
        if (err) {
            throw err;
        }

        callback(data);
    };

    return fs.readFile(filename, 'utf8', onRead);
}
73 |
/**
 * Writes the given markup, followed by a newline, to a file or descriptor.
 *
 * @param {string} html Markup to write.
 * @param {string|number} filename Path or file descriptor to write to.
 * @throws Rethrows any write error from inside the completion handler.
 */
function write(html, filename) {
    const onWritten = err => {
        if (err) {
            throw err;
        }
    };

    return fs.writeFile(filename, html + '\n', onWritten);
}
81 |
// Read from the named file, or from stdin when no filename was given, clean
// the markup, then write it back to the file (only with --in-place) or to
// stdout.
read(filename || process.stdin.fd, data => {
    cleaner.clean(data, options, html => {
        const target = filename && inPlace ? filename : process.stdout.fd;

        write(html, target);
    });
});
91 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | const htmlparser = require('htmlparser2');
2 |
// Elements that are rendered as a lone opening tag: they never get a closing
// tag, are never treated as empty, and never increase the indent level.
const voidElements = [
    'area', 'base', 'basefont', 'br', 'col',
    'command', 'embed', 'frame', 'hr', 'img',
    'input', 'isindex', 'keygen', 'link', 'meta',
    'param', 'source', 'track', 'wbr'
];
24 |
// Effective settings used by every rendering helper below; replaced wholesale
// each time setup() runs.
let options = {};

/**
 * Builds the effective option set from user-supplied options plus defaults.
 *
 * Boolean options are strict: opt-in flags are enabled only by a literal
 * `true`, opt-out flags are disabled only by a literal `false`. List options
 * fall back to their defaults when missing, and the `add-*` options append to
 * whatever list is in effect without modifying the caller's arrays.
 *
 * @param {Object} opt User options keyed by their dashed names.
 */
function setup(opt) {
    const onlyIfTrue = name => opt[name] === true;
    const unlessFalse = name => opt[name] !== false;

    options = {
        'allow-attributes-without-values': onlyIfTrue('allow-attributes-without-values'),
        'break-around-comments': unlessFalse('break-around-comments'),
        'break-around-tags': opt['break-around-tags'] || [
            'blockquote',
            'body',
            'br',
            'div',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'head',
            'hr',
            'link',
            'meta',
            'p',
            'table',
            'td',
            'title',
            'tr'
        ],
        'decode-entities': onlyIfTrue('decode-entities'),
        // Two spaces, as documented in the README.
        'indent': opt['indent'] || '  ',
        'lower-case-tags': unlessFalse('lower-case-tags'),
        'lower-case-attribute-names': unlessFalse('lower-case-attribute-names'),
        'preserve-tags': opt['preserve-tags'] || [
            'math',
            'script',
            'style',
            'svg'
        ],
        'remove-attributes': opt['remove-attributes'] || [
            'align',
            'bgcolor',
            'border',
            'cellpadding',
            'cellspacing',
            'color',
            'height',
            'target',
            'valign',
            'width'
        ],
        'remove-comments': onlyIfTrue('remove-comments'),
        'remove-empty-tags': opt['remove-empty-tags'] || [],
        'remove-tags': opt['remove-tags'] || [
            'center',
            'font'
        ],
        // 0 is a valid value (wrapping disabled), hence the >= comparison
        // rather than a truthiness check.
        'wrap': opt['wrap'] >= 0 ? opt['wrap'] : 120
    };

    // concat() returns a new array, so neither the defaults nor arrays owned
    // by the caller are modified.
    const extendable = [
        ['add-break-around-tags', 'break-around-tags'],
        ['add-remove-attributes', 'remove-attributes'],
        ['add-remove-tags', 'remove-tags']
    ];

    for (const [extraName, targetName] of extendable) {
        if (opt[extraName]) {
            options[targetName] = options[targetName].concat(opt[extraName]);
        }
    }
}
95 |
/**
 * Decides whether a node should be set off by line breaks before and after.
 *
 * @param {Object} node Parsed DOM node.
 * @returns {boolean} false for removed nodes and text; the
 *     `break-around-comments` setting for comments; otherwise true when the
 *     tag is listed in `break-around-tags` or needs line breaks inside it.
 */
function breakAround(node) {
    if (shouldRemove(node) || node.type == 'text') {
        return false;
    }

    if (node.type == 'comment') {
        return options['break-around-comments'];
    }

    return options['break-around-tags'].includes(node.name) || breakWithin(node);
}
115 |
/**
 * Decides whether a tag's content should sit on its own lines, i.e. whether
 * line breaks go after the opening tag and before the closing tag.
 *
 * @param {Object} node Parsed DOM node.
 * @returns {boolean} true only for a kept node of type 'tag' that has at
 *     least one child which breaks around or within itself.
 */
function breakWithin(node) {
    if (shouldRemove(node) || node.type != 'tag') {
        return false;
    }

    const kids = node.children;

    return kids.some(breakAround) || kids.some(breakWithin);
}
127 |
/**
 * Tells whether a node carries no visible content.
 *
 * @param {Object} node Parsed DOM node.
 * @returns {boolean} For text and comments: true when the data is blank.
 *     For void elements: always false. For other elements: true when there
 *     are no children or every child is itself empty.
 */
function isEmpty(node) {
    const holdsData = node.type == 'text' || node.type == 'comment';

    if (holdsData) {
        return !node.data.trim();
    }

    if (voidElements.includes(node.name)) {
        return false;
    }

    // every() is true for an empty list, which covers the childless case.
    return node.children.every(isEmpty);
}
143 |
/**
 * Collapses every run of whitespace (spaces, tabs, line breaks) into a single
 * space. Leading and trailing whitespace is collapsed too, not removed.
 *
 * @param {string} text
 * @returns {string}
 */
function removeExtraSpace(text) {
    return text.split(/\s+/).join(' ');
}
147 |
/**
 * Decides whether a node is dropped from the output.
 *
 * @param {Object} node Parsed DOM node.
 * @returns {boolean} Text: removed when blank. Comments: removed when
 *     `remove-comments` is set or the comment is blank. Anything else: when
 *     listed in `remove-empty-tags`, removed only if empty; otherwise removed
 *     when listed in `remove-tags`.
 */
function shouldRemove(node) {
    switch (node.type) {
        case 'text':
            return isEmpty(node);

        case 'comment':
            return options['remove-comments'] || isEmpty(node);

        default:
            // Being listed in remove-empty-tags takes precedence: a non-empty
            // tag listed there is kept even if remove-tags also names it.
            return isListedInOptions('remove-empty-tags', node.name)
                ? isEmpty(node)
                : isListedInOptions('remove-tags', node.name);
    }
}
163 |
/**
 * Checks a tag or attribute name against one of the list-type options.
 *
 * @param {string} optionsArrayName Key of the list in `options`, e.g.
 *     'remove-tags'.
 * @param {string} name Tag or attribute name to look up.
 * @returns {boolean} true when any entry is a string equal to `name` or a
 *     regular expression that matches it.
 */
function isListedInOptions(optionsArrayName, name) {
    return options[optionsArrayName].some(option => {
        if (!(option instanceof RegExp)) {
            return option === name;
        }

        // A regex with the g or y flag remembers where its last match ended,
        // so repeated test() calls would otherwise alternate between hit and
        // miss for the same name. Start every check from position 0 and leave
        // the caller's regex in that state.
        option.lastIndex = 0;
        const matched = option.test(name);
        option.lastIndex = 0;

        return matched;
    });
}
169 |
/**
 * Renders a text node with its whitespace collapsed.
 *
 * @param {Object} node Text node; `prev` and `next` are its siblings, if any.
 * @returns {string} '' for removed nodes. Otherwise the collapsed text, with
 *     the leading space dropped when the node is first or follows a node that
 *     breaks around itself, and the trailing space dropped in the mirrored
 *     situation.
 */
function renderText(node) {
    if (shouldRemove(node)) {
        return '';
    }

    const collapsed = removeExtraSpace(node.data);

    const startsLine = !node.prev || breakAround(node.prev);
    const leftTrimmed = startsLine ? collapsed.trimStart() : collapsed;

    const endsLine = !node.next || breakAround(node.next);

    return endsLine ? leftTrimmed.trimEnd() : leftTrimmed;
}
187 |
/**
 * Renders a comment node on a single line.
 *
 * @param {Object} node Comment node; `data` is the text between the comment
 *     delimiters.
 * @returns {string} '' for removed comments. Otherwise the comment with its
 *     whitespace collapsed, surrounded by line breaks when breakAround() says
 *     so for this node.
 */
function renderComment(node) {
    if (shouldRemove(node)) {
        return '';
    }

    // The delimiters have to be written back around the data, otherwise the
    // comment text is lost from the output.
    const comment = '<!--' + removeExtraSpace(node.data) + '-->';

    if (breakAround(node)) {
        return '\n' + comment + '\n';
    }

    return comment;
}
201 |
/**
 * Renders an element, its attributes and (recursively) its children.
 *
 * @param {Object} node Element node with `name`, `attribs` and `children`.
 * @returns {string} Markup for the element. A removed element contributes
 *     nothing when empty and only its rendered children otherwise. Void
 *     elements are rendered without a closing tag.
 */
function renderTag(node) {
    if (shouldRemove(node)) {
        // Removing a tag that still has content unwraps it: the tag goes,
        // the children stay.
        return isEmpty(node) ? '' : render(node.children);
    }

    let openTag = '<' + node.name;

    for (const [attrib, rawValue] of Object.entries(node.attribs || {})) {
        if (isListedInOptions('remove-attributes', attrib)) {
            continue;
        }

        if (!rawValue && options['allow-attributes-without-values']) {
            openTag += ' ' + attrib;
            continue;
        }

        // Values are always written inside double quotes, so a literal double
        // quote in the value has to be escaped or it would end the attribute
        // early.
        const value = removeExtraSpace(rawValue).replace(/"/g, '&quot;');

        openTag += ` ${attrib}="${value}"`;
    }

    openTag += '>';

    if (voidElements.includes(node.name)) {
        return breakAround(node) ? '\n' + openTag + '\n' : openTag;
    }

    let closeTag = '</' + node.name + '>';

    if (breakAround(node)) {
        openTag = '\n' + openTag;
        closeTag = closeTag + '\n';
    }

    if (breakWithin(node)) {
        openTag = openTag + '\n';
        closeTag = '\n' + closeTag;
    }

    return openTag + render(node.children) + closeTag;
}
247 |
/**
 * Renders a directive node (e.g. a doctype) by putting its data back between
 * angle brackets, unchanged.
 *
 * @param {Object} node Directive node.
 * @returns {string}
 */
function renderDirective(node) {
    return `<${node.data}>`;
}
251 |
/**
 * Renders a list of sibling nodes to markup.
 *
 * @param {Object[]} nodes Sibling nodes, in document order.
 * @returns {string} The concatenated output of the per-type renderers, with
 *     consecutive line breaks collapsed into one.
 */
function render(nodes) {
    let html = '';

    for (const node of nodes) {
        switch (node.type) {
            case 'root':
                html += render(node.children);
                break;

            case 'text':
                html += renderText(node);
                break;

            case 'comment':
                html += renderComment(node);
                break;

            case 'directive':
                html += renderDirective(node);
                break;

            default:
                // every remaining node type is rendered as a tag
                html += renderTag(node);
        }
    }

    // adjacent nodes may each have asked for a break; keep just one
    return html.replace(/\n+/g, '\n');
}
282 |
/**
 * Breaks an over-long line at spaces so that it fits the `wrap` column where
 * possible. Continuation lines start with the given indent plus two extra
 * indent units (hanging indent).
 *
 * @param {string} line The already indented line.
 * @param {string} indent The indent string in front of `line`.
 * @returns {string} The line with '\n' inserted at the chosen spaces, or the
 *     line unchanged when no usable space exists.
 */
function wrap(line, indent) {
    const limit = options['wrap'];

    // prefer the last space at or before the limit; failing that, take the
    // first space after it
    let bound = line.lastIndexOf(' ', limit);

    if (bound == -1) {
        bound = line.indexOf(' ', limit);
    }

    if (bound == -1) {
        // no space anywhere in the line
        return line;
    }

    const head = line.slice(0, bound);

    if (head.trim().length == 0) {
        // the only space found belongs to the leading indent, and splitting
        // there would gain nothing
        return line;
    }

    const rest = indent + options['indent'].repeat(2) + line.slice(bound + 1);
    const tail = rest.length > limit ? wrap(rest, indent) : rest;

    return head + '\n' + tail;
}
314 |
/**
 * Indents rendered markup line by line and wraps over-long lines.
 *
 * The indent level goes up after a line that consists of exactly one opening
 * tag of a non-void element and goes down on a line that consists of exactly
 * one closing tag. All other lines keep the current level.
 *
 * @param {string} html Markup as produced by render(), '\n'-separated.
 * @returns {string} The indented (and, when `wrap` is non-zero, wrapped)
 *     markup.
 */
function indent(html) {
    // Tag names may contain hyphens and colons (custom elements, namespaces).
    // Attribute values are skipped as quoted units so a '>' inside a value is
    // not mistaken for the end of the tag.
    const openTagRe = /^<([\w:-]+)(?:"[^"]*"|[^>"])*>$/;
    const closeTagRe = /^<\/([\w:-]+)\s*>$/;

    let indentLevel = 0;

    return html.split('\n').map(line => {
        if (closeTagRe.test(line)) {
            // Never drop below zero: a closing line without a recognized
            // opening line would otherwise make repeat() throw.
            indentLevel = Math.max(0, indentLevel - 1);
        }

        const prefix = options['indent'].repeat(indentLevel);
        const indented = prefix + line;

        const openTagMatch = line.match(openTagRe);

        if (openTagMatch && !voidElements.includes(openTagMatch[1])) {
            indentLevel++;
        }

        if (options['wrap'] && indented.length > options['wrap']) {
            return wrap(indented, prefix);
        }

        return indented;
    }).join('\n');
}
343 |
// Original markup of every element currently swapped out by preserveTags(),
// keyed by the offset at which it was found in the input.
const preserveTagReplacements = {};

/**
 * Swaps every element listed in `preserve-tags` for a short placeholder so
 * that its content passes through parsing, rendering and indenting untouched.
 *
 * The placeholder has the shape of a markup declaration and contains no
 * spaces, so line wrapping cannot split it and the comment options do not
 * apply to it.
 * NOTE(review): relies on the HTML parser handing `<!...>` declarations back
 * verbatim as directive nodes, as it does for a doctype — confirm against the
 * parser version in use.
 *
 * @param {string} html Raw input markup.
 * @returns {string} The markup with preserved elements replaced.
 */
function preserveTags(html) {
    const names = options['preserve-tags'];

    if (!names.length) {
        // An empty alternation would match arbitrary markup.
        return html;
    }

    // (?=[\s/>]) ends the tag name, so "script" does not match <scripted>.
    // \1 ties the closing tag to the opening one, so a <script> is not cut
    // short by a "</style>" appearing in its body.
    const re = new RegExp(`<(${names.join('|')})(?=[\\s/>])[^>]*>.*?</\\1\\s*>`, 'gs');

    return html.replace(re, (match, ...rest) => {
        // Capture groups are strings or undefined; the first number among the
        // remaining callback arguments is the match offset.
        const offset = rest.find(arg => typeof arg === 'number');

        preserveTagReplacements[offset] = match;

        return `<!preserved-tag-${offset}>`;
    });
}

/**
 * Puts the original markup back in place of the placeholders written by
 * preserveTags().
 *
 * @param {string} html Cleaned markup containing placeholders.
 * @returns {string} The markup with every known placeholder restored.
 *     Placeholder-shaped text with no stored original is left as it is.
 */
function undoPreserveTags(html) {
    return html.replace(/<!preserved-tag-(\d+)>/g, (placeholder, offset) => {
        const original = preserveTagReplacements[offset];

        if (original === undefined) {
            return placeholder;
        }

        // Each entry is needed once; dropping it keeps the store from growing
        // across calls.
        delete preserveTagReplacements[offset];

        return original;
    });
}
363 |
/**
 * Cleans and re-formats a piece of HTML.
 *
 * May be called as clean(html, callback) or clean(html, options, callback).
 *
 * @param {string} html Markup to clean.
 * @param {Object|function(string): void} [opt] Options keyed by their dashed
 *     names, or the callback when no options are given.
 * @param {function(string): void} [callback] Receives the cleaned markup.
 * @throws Rethrows any error reported by the DOM handler.
 */
function clean(html, opt, callback) {
    const optionsGiven = typeof opt != 'function';
    const done = optionsGiven ? callback : opt;

    setup((optionsGiven && opt) || {});

    const onDom = (err, dom) => {
        if (err) {
            throw err;
        }

        // render -> indent -> trim, and only then put preserved elements back
        // so their content is not re-formatted
        const formatted = indent(render(dom)).trim();

        done(undoPreserveTags(formatted));
    };

    const parser = new htmlparser.Parser(new htmlparser.DomHandler(onDom), {
        decodeEntities: options['decode-entities'],
        lowerCaseTags: options['lower-case-tags'],
        lowerCaseAttributeNames: options['lower-case-attribute-names'],
    });

    parser.write(preserveTags(html));
    parser.end();
}
398 |
399 | module.exports = {clean};
400 |
--------------------------------------------------------------------------------
/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "clean-html",
3 | "version": "2.0.1",
4 | "lockfileVersion": 3,
5 | "requires": true,
6 | "packages": {
7 | "": {
8 | "name": "clean-html",
9 | "version": "2.0.1",
10 | "license": "Unlicense",
11 | "dependencies": {
12 | "htmlparser2": "^8.0.2",
13 | "minimist": "^1.2.8"
14 | },
15 | "bin": {
16 | "clean-html": "cmd.js"
17 | }
18 | },
19 | "node_modules/dom-serializer": {
20 | "version": "2.0.0",
21 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
22 | "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
23 | "dependencies": {
24 | "domelementtype": "^2.3.0",
25 | "domhandler": "^5.0.2",
26 | "entities": "^4.2.0"
27 | },
28 | "funding": {
29 | "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
30 | }
31 | },
32 | "node_modules/domelementtype": {
33 | "version": "2.3.0",
34 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
35 | "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
36 | "funding": [
37 | {
38 | "type": "github",
39 | "url": "https://github.com/sponsors/fb55"
40 | }
41 | ]
42 | },
43 | "node_modules/domhandler": {
44 | "version": "5.0.3",
45 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
46 | "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
47 | "dependencies": {
48 | "domelementtype": "^2.3.0"
49 | },
50 | "engines": {
51 | "node": ">= 4"
52 | },
53 | "funding": {
54 | "url": "https://github.com/fb55/domhandler?sponsor=1"
55 | }
56 | },
57 | "node_modules/domutils": {
58 | "version": "3.0.1",
59 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.0.1.tgz",
60 | "integrity": "sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==",
61 | "dependencies": {
62 | "dom-serializer": "^2.0.0",
63 | "domelementtype": "^2.3.0",
64 | "domhandler": "^5.0.1"
65 | },
66 | "funding": {
67 | "url": "https://github.com/fb55/domutils?sponsor=1"
68 | }
69 | },
70 | "node_modules/entities": {
71 | "version": "4.4.0",
72 | "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz",
73 | "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==",
74 | "engines": {
75 | "node": ">=0.12"
76 | },
77 | "funding": {
78 | "url": "https://github.com/fb55/entities?sponsor=1"
79 | }
80 | },
81 | "node_modules/htmlparser2": {
82 | "version": "8.0.2",
83 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
84 | "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
85 | "funding": [
86 | "https://github.com/fb55/htmlparser2?sponsor=1",
87 | {
88 | "type": "github",
89 | "url": "https://github.com/sponsors/fb55"
90 | }
91 | ],
92 | "dependencies": {
93 | "domelementtype": "^2.3.0",
94 | "domhandler": "^5.0.3",
95 | "domutils": "^3.0.1",
96 | "entities": "^4.4.0"
97 | }
98 | },
99 | "node_modules/minimist": {
100 | "version": "1.2.8",
101 | "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
102 | "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
103 | "funding": {
104 | "url": "https://github.com/sponsors/ljharb"
105 | }
106 | }
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "clean-html",
3 | "version": "2.0.1",
4 | "description": "HTML cleaner and beautifier",
5 | "main": "index.js",
6 | "bin": "cmd.js",
7 | "dependencies": {
8 | "htmlparser2": "^8.0.2",
9 | "minimist": "^1.2.8"
10 | },
11 | "files": [
12 | "cmd.js",
13 | "index.js",
14 | "package.json",
15 | "README.md",
16 | "CHANGELOG.md",
17 | "UNLICENSE"
18 | ],
19 | "scripts": {
20 | "test": "node test.js"
21 | },
22 | "repository": {
23 | "type": "git",
24 | "url": "git@github.com:dave-kennedy/clean-html.git"
25 | },
26 | "keywords": [
27 | "beautify",
28 | "clean",
29 | "html",
30 | "pretty",
31 | "tidy"
32 | ],
33 | "author": "Dave Kennedy (http://github.com/dave-kennedy)",
34 | "license": "Unlicense",
35 | "bugs": {
36 | "url": "https://github.com/dave-kennedy/clean-html/issues"
37 | },
38 | "homepage": "https://github.com/dave-kennedy/clean-html"
39 | }
40 |
--------------------------------------------------------------------------------
/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
Currently we have these articles available:
4 |
5 |
14 |
--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
1 | const assert = require('node:assert/strict');
2 | const childProcess = require('node:child_process');
3 | const fs = require('node:fs');
4 | const os = require('node:os');
5 | const path = require('node:path');
6 | const util = require('node:util');
7 |
8 | const cleaner = require('./index.js');
9 |
10 | const results = [];
11 | const tests = [];
12 |
/**
 * Prints a message to stderr wrapped in the ANSI escape codes for red text.
 *
 * @param {string} message
 */
function logFail(message) {
    const red = '\x1b[31m';
    const reset = '\x1b[0m';

    return console.error(`${red}${message}${reset}`);
}
16 |
/**
 * Prints a message to stdout wrapped in the ANSI escape codes for green text.
 *
 * @param {string} message
 */
function logPass(message) {
    const green = '\x1b[32m';
    const reset = '\x1b[0m';

    return console.log(`${green}${message}${reset}`);
}
20 |
/**
 * Queues a test case; nothing runs until runTests() is called.
 *
 * @param {string} description Label shown in the results.
 * @param {function(): void} callback Test body.
 */
function registerTest(description, callback) {
    const entry = {description, callback};

    tests.push(entry);
}
24 |
/**
 * Runs one test body, logs the outcome and records it in `results`.
 *
 * Assertion failures are reported with their expected and actual values; any
 * other thrown error is reported with its string form.
 *
 * @param {string} description Label shown in the results.
 * @param {function(): void} callback Test body.
 */
function runTest(description, callback) {
    let failure = null;

    try {
        callback();
    } catch (error) {
        failure = {error};
    }

    if (failure === null) {
        const message = `✓ ${description}`;

        logPass(message);
        results.push({message, result: 'pass'});
        return;
    }

    const {error} = failure;
    const message = error instanceof assert.AssertionError
        ? `✗ ${description}\n` +
            ` Expected: ${util.inspect(error.expected)}\n` +
            ` Actual: ${util.inspect(error.actual)}`
        : `✗ ${description}: ${error}`;

    logFail(message);
    results.push({message, result: 'fail'});
}
49 |
/**
 * Runs every registered test, or only those accepted by `filter`.
 *
 * Exits the process with status 1 when no test is selected.
 *
 * @param {function(Object): boolean} [filter] Receives a registered test
 *     ({description, callback}); a truthy result selects it.
 */
function runTests(filter) {
    const selected = filter ? tests.filter(entry => filter(entry)) : tests.slice();

    if (selected.length === 0) {
        logFail('No tests satisfy filter');
        process.exit(1);
    }

    selected.forEach(({description, callback}) => {
        runTest(description, callback);
    });
}
62 |
/**
 * Logs how many recorded results passed and how many failed. A count of zero
 * is not logged.
 */
function summarizeResults() {
    const tally = {pass: 0, fail: 0};

    for (const {result} of results) {
        if (result == 'pass') {
            tally.pass++;
        } else if (result == 'fail') {
            tally.fail++;
        }
    }

    if (tally.pass > 0) {
        logPass(`Passed: ${tally.pass}`);
    }

    if (tally.fail > 0) {
        logFail(`Failed: ${tally.fail}`);
    }
}
75 |
/**
 * Declares a test case. Shorthand for registerTest(); the test is queued,
 * not run.
 *
 * @param {string} description Label shown in the results.
 * @param {function(): void} callback Test body.
 */
function test(description, callback) {
    const declaration = [description, callback];

    registerTest(...declaration);
}
79 |
80 | test('text is unchanged', () => {
81 | cleaner.clean('Foo Bar', html => {
82 | assert.equal(html, 'Foo Bar');
83 | });
84 | });
85 |
86 | test('extra whitespace is replaced by a single space', () => {
87 | cleaner.clean('Foo \n Bar', html => {
88 | assert.equal(html, 'Foo Bar');
89 | });
90 | });
91 |
92 | test('extra whitespace inside comment is replaced by a single space', () => {
93 | cleaner.clean('', html => {
94 | assert.equal(html, '');
95 | });
96 | });
97 |
98 | test('output is trimmed', () => {
99 | cleaner.clean(' foo\n', html => {
100 | assert.equal(html, 'foo');
101 | });
102 | });
103 |
104 | test('directive is unchanged', () => {
105 | cleaner.clean('', html => {
106 | assert.equal(html, '')
107 | });
108 | });
109 |
110 | test('empty value is added when allow-attributes-without-values is false', () => {
111 | cleaner.clean('', {'allow-attributes-without-values': false}, html => {
112 | assert.equal(html, '');
113 | });
114 | });
115 |
116 | test('empty value not added when allow-attributes-without-values is true', () => {
117 | cleaner.clean('', {'allow-attributes-without-values': true}, html => {
118 | assert.equal(html, '');
119 | });
120 | });
121 |
122 | test('line breaks are not added around comments when break-around-comments is false', () => {
123 | cleaner.clean('fooqux', {'break-around-comments': false}, html => {
124 | assert.equal(html, 'fooqux');
125 | });
126 | });
127 |
128 | test('line breaks are added around comments when break-around-comments is true', () => {
129 | cleaner.clean('fooqux', {'break-around-comments': true}, html => {
130 | assert.equal(html, 'foo\n\nqux');
131 | });
132 | });
133 |
134 | test('line breaks are not added around tags when not included in break-around-tags', () => {
135 | cleaner.clean('foobar', {'break-around-tags': []}, html => {
136 | assert.equal(html, 'foobar');
137 | });
138 | });
139 |
140 | test('line breaks are added around tags when included in break-around-tags', () => {
141 | cleaner.clean('foobar', {'break-around-tags': ['div']}, html => {
142 | assert.equal(html, 'foo\n\nbar');
143 | });
144 | });
145 |
146 | test('non-breaking space is not replaced by a single space when decode-entities is false', () => {
147 | cleaner.clean('Foo Bar', {'decode-entities': false}, html => {
148 | assert.equal(html, 'Foo Bar');
149 | });
150 | });
151 |
152 | test('non-breaking space is replaced by a single space when decode-entities is true', () => {
153 | cleaner.clean('Foo Bar', {'decode-entities': true}, html => {
154 | assert.equal(html, 'Foo Bar');
155 | });
156 | });
157 |
158 | test('tag is lowercased when lower-case-tags is true', () => {
159 | cleaner.clean('bar', {'lower-case-tags': true}, html => {
160 | assert.equal(html, 'bar');
161 | });
162 | });
163 |
164 | test('tag is not lowercased when lower-case-tags is false', () => {
165 | cleaner.clean('bar', {'lower-case-tags': false}, html => {
166 | assert.equal(html, 'bar');
167 | });
168 | });
169 |
170 | test('attribute name is lowercased when lower-case-attribute-names is true', () => {
171 | cleaner.clean('bar', {'lower-case-attribute-names': true}, html => {
172 | assert.equal(html, 'bar');
173 | });
174 | });
175 |
// Calls cleaner.clean() with 'lower-case-attribute-names' set to false and
// asserts that the callback receives the input unchanged.
// NOTE(review): no attribute is visible in either literal (both are 'bar');
// the markup was presumably stripped during extraction.
176 | test('attribute name is not lowercased when lower-case-attribute-names is false', () => {
177 | cleaner.clean('bar', {'lower-case-attribute-names': false}, html => {
178 | assert.equal(html, 'bar');
179 | });
180 | });
181 |
// Calls cleaner.clean() with an empty 'preserve-tags' list and asserts that
// the output differs from the input.
// NOTE(review): the template literal is empty as shown and the gutter jumps
// from 183 to 190, so several lines of its content appear to be missing from
// this listing. Restore them from the original test file.
182 | test('tag is not preserved when not included in preserve-tags', () => {
183 | const input = ``;
190 |
191 | cleaner.clean(input, {'preserve-tags': []}, output => {
192 | assert.notEqual(output, input);
193 | });
194 | });
195 |
// Calls cleaner.clean() with 'preserve-tags' set to ['script'] and asserts
// that the output equals the input.
// NOTE(review): the template literal is empty as shown and the gutter jumps
// from 197 to 204, so several lines of its content appear to be missing from
// this listing. Restore them from the original test file.
196 | test('tag is preserved when included in preserve-tags', () => {
197 | const input = ``;
204 |
205 | cleaner.clean(input, {'preserve-tags': ['script']}, output => {
206 | assert.equal(output, input);
207 | });
208 | });
209 |
// Calls cleaner.clean() with an empty 'remove-attributes' list and asserts
// that the callback receives the input unchanged.
// NOTE(review): no attribute or tag is visible in the literals (both are
// 'foo'); the markup was presumably stripped during extraction.
210 | test('attribute is not removed when not included in remove-attributes', () => {
211 | cleaner.clean('foo', {'remove-attributes': []}, html => {
212 | assert.equal(html, 'foo');
213 | });
214 | });
215 |
// Calls cleaner.clean() with 'remove-attributes' set to ['color'] and asserts
// on the string passed to the callback.
// NOTE(review): no 'color' attribute is visible in the input as shown (both
// literals are 'foo'); the markup was presumably stripped during extraction.
216 | test('attribute is removed when included in remove-attributes', () => {
217 | cleaner.clean('foo', {'remove-attributes': ['color']}, html => {
218 | assert.equal(html, 'foo');
219 | });
220 | });
221 |
// Calls cleaner.clean() with a regular expression in 'remove-attributes'
// (case-insensitive, matching '_test-' followed by letters, digits or hyphens)
// and asserts on the string passed to the callback.
// NOTE(review): no attribute is visible in the input as shown (both literals
// are 'foo'); the markup was presumably stripped during extraction.
222 | test('attribute is removed when it matches at least one pattern included in remove-attributes', () => {
223 | cleaner.clean('foo', {'remove-attributes': [/_test-[a-z0-9-]+/i]}, html => {
224 | assert.equal(html, 'foo');
225 | });
226 | });
227 |
// Calls cleaner.clean() with 'remove-comments' set to false and asserts that
// the callback receives the input unchanged.
// NOTE(review): both literals are empty strings as shown; the comment markup
// was presumably stripped during extraction.
228 | test('comment is not removed when remove-comments is false', () => {
229 | cleaner.clean('', {'remove-comments': false}, html => {
230 | assert.equal(html, '');
231 | });
232 | });
233 |
// Calls cleaner.clean() with 'remove-comments' set to true and asserts that
// the callback receives an empty string.
// NOTE(review): the input is also an empty string as shown; the comment it
// presumably contained was stripped during extraction.
234 | test('comment is removed when remove-comments is true', () => {
235 | cleaner.clean('', {'remove-comments': true}, html => {
236 | assert.equal(html, '');
237 | });
238 | });
239 |
// Calls cleaner.clean() with an empty 'remove-empty-tags' list and asserts
// that the callback receives the input unchanged.
// NOTE(review): both literals are empty strings as shown; the empty tag was
// presumably stripped during extraction.
240 | test('empty tag is not removed when not included in remove-empty-tags', () => {
241 | cleaner.clean('', {'remove-empty-tags': []}, html => {
242 | assert.equal(html, '');
243 | });
244 | });
245 |
// Calls cleaner.clean() with 'remove-empty-tags' set to ['p'] and asserts that
// the callback receives an empty string.
// NOTE(review): the input is also an empty string as shown; the empty tag it
// presumably contained was stripped during extraction.
246 | test('empty tag is removed when included in remove-empty-tags', () => {
247 | cleaner.clean('', {'remove-empty-tags': ['p']}, html => {
248 | assert.equal(html, '');
249 | });
250 | });
251 |
// Calls cleaner.clean() with 'remove-empty-tags' set to ['p'] and asserts on
// the string passed to the callback.
// NOTE(review): listing lines 253 and 254 are each split across two physical
// lines inside a single-quoted literal, which is not valid JavaScript as
// shown. The tag and its content were presumably stripped during extraction;
// restore both literals from the original test file.
252 | test('non-empty tag is not removed when included in remove-empty-tags', () => {
253 | cleaner.clean('
', {'remove-empty-tags': ['p']}, html => {
254 | assert.equal(html, '
');
255 | });
256 | });
257 |
// Calls cleaner.clean() with a regular expression in 'remove-empty-tags'
// (case-insensitive, matching names that start with 'app-') and asserts that
// the callback receives an empty string.
// NOTE(review): the input is also an empty string as shown; the empty tag it
// presumably contained was stripped during extraction.
258 | test('empty tag is removed when it matches at least one pattern included in remove-empty-tags', () => {
259 | cleaner.clean('', {'remove-empty-tags': [/^app-.*/i]}, html => {
260 | assert.equal(html, '');
261 | });
262 | });
263 |
// Calls cleaner.clean() with an empty 'remove-tags' list and asserts that the
// callback receives the input unchanged.
// NOTE(review): no tag is visible in the literals (both are 'foo'); the markup
// was presumably stripped during extraction.
264 | test('tag is not removed when not included in remove-tags', () => {
265 | cleaner.clean('foo', {'remove-tags': []}, html => {
266 | assert.equal(html, 'foo');
267 | });
268 | });
269 |
// Calls cleaner.clean() with 'remove-tags' set to ['font'] and asserts that
// the callback receives 'foo'.
// NOTE(review): the input is already the bare string 'foo' as shown; the font
// tag it presumably contained was stripped during extraction.
270 | test('tag is removed and child is preserved when included in remove-tags', () => {
271 | cleaner.clean('foo', {'remove-tags': ['font']}, html => {
272 | assert.equal(html, 'foo');
273 | });
274 | });
275 |
// Calls cleaner.clean() with a regular expression in 'remove-tags' (matching
// 'app-' followed by at least one character) and asserts that the callback
// receives 'foo'.
// NOTE(review): the input is already the bare string 'foo' as shown; the tag
// it presumably contained was stripped during extraction.
276 | test('tag is removed and child is preserved when it matches at least one pattern included in remove-tags', () => {
277 | cleaner.clean('foo', {'remove-tags': [/app-.+/]}, html => {
278 | assert.equal(html, 'foo');
279 | });
280 | });
281 |
282 | // indent tests
283 |
// Calls cleaner.clean() with an 'indent' option and asserts that the callback
// receives the input unchanged.
// NOTE(review): no parent/child markup is visible in the literals as shown,
// and the indent value appears as a single space, which may be the result of
// whitespace being collapsed during extraction. Confirm both against the
// original test file.
284 | test('indent is not added when child is text', () => {
285 | cleaner.clean('foobarqux', {'indent': ' '}, html => {
286 | assert.equal(html, 'foobarqux');
287 | });
288 | });
289 |
// Calls cleaner.clean() with 'break-around-comments' set to false plus an
// 'indent' option, and asserts that the callback receives the input unchanged.
// NOTE(review): no comment or parent tag is visible in the literals as shown;
// the markup was presumably stripped (and the indent value possibly collapsed)
// during extraction.
290 | test('indent is not added when child is comment and break-around-comments is false', () => {
291 | cleaner.clean('fooqux', {'break-around-comments': false, 'indent': ' '}, html => {
292 | assert.equal(html, 'fooqux');
293 | });
294 | });
295 |
// Calls cleaner.clean() with 'break-around-comments' set to true plus an
// 'indent' option, and asserts on the string passed to the callback.
// NOTE(review): the expectation as shown contains only line breaks and a
// single space between 'foo' and 'qux'. The comment and surrounding tags were
// presumably stripped during extraction, and the indent width may have been
// collapsed; confirm against the original test file.
296 | test('indent is added when child is comment and break-around-comments is true', () => {
297 | cleaner.clean('fooqux', {'break-around-comments': true, 'indent': ' '}, html => {
298 | assert.equal(html, 'foo\n\n \n\nqux');
299 | });
300 | });
301 |
// Test for indentation when the child tag is not listed in 'break-around-tags'.
// NOTE(review): this listing is truncated. The gutter jumps from 303 to 311,
// the cleaner.clean() call opens a literal on line 303 that never closes on
// that line, and the options object, callback and assertion head are not
// visible. The code below is not valid JavaScript as shown; restore lines
// 303-310 from the original test file.
302 | test('indent is not added when child tag is not included in break-around-tags', () => {
303 | cleaner.clean('foo
\n\nqux');
311 | });
312 | });
313 |
314 | test('indent is added when child tag is not included in break-around-tags but descendant is', () => {
315 | cleaner.clean('foo