├── .editorconfig ├── .gitignore ├── CHANGELOG.md ├── README.md ├── UNLICENSE ├── cmd.js ├── index.js ├── package-lock.json ├── package.json ├── test.html └── test.js /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_size = 4 7 | indent_style = space 8 | insert_final_newline = true 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .idea/ 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2.0.0 2 | ----- 3 | 4 | * New options to prevent clobbering Angular files, thanks to @joeyparrish: 5 | * allow-attributes-without-values 6 | * lower-case-tags 7 | * lower-case-attribute-names 8 | * I'm not sure when the CLI broke but it works again. 9 | * The replace-nbsp option has been renamed to decode-entities. 10 | * Script and style tags are no longer removed with the new preserve-tags option. 11 | Fixes #12, #13 and #19. 12 | 13 | 1.5.0 14 | ----- 15 | 16 | Regular expressions are now supported in the remove-attributes, 17 | remove-empty-tags and remove-tags options. Thanks, @smnbbrv! 18 | 19 | 1.4.3 20 | ----- 21 | 22 | Extra spaces are now removed from attribute values. 23 | 24 | 1.4.2 25 | ----- 26 | 27 | Hanging indent is now applied to wrapped lines. 28 | 29 | Multiline comments are now squashed into a single line, just like text. This 30 | makes wrapping them easier and simplifies how conditional comments are handled. 31 | 32 | 1.4.1 33 | ----- 34 | 35 | Maximum call stack error when trying to wrap lines without spaces has been 36 | fixed. 37 | 38 | Support for conditional comments has been added. 
39 | 40 | Trying to preserve CSS and JavaScript formatting is a pain, so style and 41 | script tags are no longer supported in this release. They will simply be 42 | removed from the output. 43 | 44 | 1.4.0 45 | ----- 46 | 47 | The license has been switched from ISC to [Unlicense](http://unlicense.org). 48 | 49 | 1.3.8 50 | ----- 51 | 52 | The htmlparser2 and minimist dependencies have been updated. 53 | 54 | 1.3.7 55 | ----- 56 | 57 | Up until now, this thing really only supported cleaning fragments of HTML. If 58 | you tried to feed it an entire HTML page (with doctype declaration, style 59 | tags, script tags, etc.) it would blow up. 60 | 61 | Thanks in part to @RonanDrouglazet, this embarrassing oversight has been 62 | addressed. However, I have no intention of turning this into a CSS or 63 | JavaScript cleaner/formatter. Anything found within a style or script tag will 64 | be output as is. 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTML cleaner and beautifier 2 | 3 | ![npm](https://img.shields.io/npm/v/clean-html) 4 | ![npm](https://img.shields.io/npm/dw/clean-html) 5 | ![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/dave-kennedy/clean-html) 6 | ![Snyk Vulnerabilities for GitHub Repo](https://img.shields.io/snyk/vulnerabilities/github/dave-kennedy/clean-html) 7 | 8 | ## Usage 9 | 10 | ### In a script 11 | 12 | ```javascript 13 | const cleaner = require('clean-html'); 14 | const fs = require('fs'); 15 | 16 | fs.readFile('foo.html', 'utf8', (err, input) => { 17 | cleaner.clean(input, output => console.log(output)); 18 | }); 19 | ``` 20 | 21 | Options can be provided like so: 22 | 23 | ``` 24 | const options = { 25 | 'break-around-comments': false, 26 | 'decode-entities': true, 27 | 'remove-tags': ['b', 'i', 'center', 'font'], 28 | 'wrap': 80 29 | }; 30 | 31 | 
cleaner.clean(input, options, output => {...}); 32 | ``` 33 | 34 | ### From the command line 35 | 36 | If installed globally, just run `clean-html`. Otherwise, run `npx clean-html`. 37 | 38 | Input can be piped from stdin: 39 | 40 | ``` 41 | $ echo '

Hello, World!

' | clean-html 42 | $ cat foo.html | clean-html 43 | ``` 44 | 45 | Or you can provide a filename as the first argument: 46 | 47 | ``` 48 | $ clean-html foo.html 49 | ``` 50 | 51 | Output can be redirected to another file: 52 | 53 | ``` 54 | $ clean-html foo.html > bar.html 55 | ``` 56 | 57 | Or you can edit the file in place: 58 | 59 | ``` 60 | $ clean-html foo.html --in-place 61 | ``` 62 | 63 | Other options can be provided like so: 64 | 65 | ``` 66 | $ clean-html foo.html \ 67 | --break-around-comments \ 68 | --decode-entities false \ 69 | --remove-tags b,i,center,font \ 70 | --wrap 80 71 | ``` 72 | 73 | > Array type option values should be separated by commas. Boolean type options are disabled if 74 | > followed by `false` and enabled if followed by `true` or nothing. 75 | 76 | ## Options 77 | 78 | ### allow-attributes-without-values 79 | 80 | Allows attributes to be output without values. For example, `checked` instead of `checked=""`. 81 | 82 | Please set to `true` for Angular components or for `` elements. 83 | 84 | Type: Boolean 85 | Default: `false` 86 | 87 | ### break-around-comments 88 | 89 | Adds line breaks before and after comments. 90 | 91 | Type: Boolean 92 | Default: `true` 93 | 94 | ### break-around-tags 95 | 96 | Tags that should have line breaks added before and after. 97 | 98 | Type: Array of strings 99 | Default: `['body', 'blockquote', 'br', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 100 | 'link', 'meta', 'p', 'table', 'title', 'td', 'tr']` 101 | 102 | ### decode-entities 103 | 104 | Replaces HTML entities with their decoded equivalents. e.g., if `true` then `&nbsp;` will be 105 | replaced by a space character. 106 | 107 | Type: Boolean 108 | Default: `false` 109 | 110 | ### indent 111 | 112 | The string to use for indentation. e.g., a tab character or one or more spaces. 113 | 114 | Type: String 115 | Default: `'  '` (two spaces) 116 | 117 | ### lower-case-tags 118 | 119 | Converts all tag names to lower case. 
120 | 121 | Please set to `false` for Angular components. 122 | 123 | Type: Boolean 124 | Default: `true` 125 | 126 | ### lower-case-attribute-names 127 | 128 | Converts all attribute names to lower case. 129 | 130 | Please set to `false` for Angular components. 131 | 132 | Type: Boolean 133 | Default: `true` 134 | 135 | ### preserve-tags 136 | 137 | Tags that should be left alone. i.e., content inside these tags will not be formatted or indented. 138 | 139 | Type: Array of strings 140 | Default: `['math', 'script', 'style', 'svg']` 141 | 142 | ### remove-attributes 143 | 144 | Attributes to remove from markup. 145 | 146 | Type: Array of strings or regular expressions 147 | Default: `['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'color', 'height', 'target', 148 | 'valign', 'width']` 149 | 150 | ### remove-comments 151 | 152 | Removes comments. 153 | 154 | Type: Boolean 155 | Default: `false` 156 | 157 | ### remove-empty-tags 158 | 159 | Tags to remove from markup if empty. 160 | 161 | Type: Array of strings or regular expressions 162 | Default: `[]` 163 | 164 | ### remove-tags 165 | 166 | Tags to always remove from markup. Nested content is preserved. 167 | 168 | Type: Array of strings or regular expressions 169 | Default: `['center', 'font']` 170 | 171 | ### wrap 172 | 173 | The column number where lines should wrap. Set to 0 to disable line wrapping. 174 | 175 | Type: Integer 176 | Default: `120` 177 | 178 | ## Adding values to option lists 179 | 180 | These options exist for your convenience. 181 | 182 | ### add-break-around-tags 183 | 184 | Additional tags to include in `break-around-tags`. 185 | 186 | Type: Array of strings 187 | Default: `null` 188 | 189 | ### add-remove-attributes 190 | 191 | Additional attributes to include in `remove-attributes`. 192 | 193 | Type: Array of strings 194 | Default: `null` 195 | 196 | ### add-remove-tags 197 | 198 | Additional tags to include in `remove-tags`. 
199 | 200 | Type: Array of strings 201 | Default: `null` 202 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /cmd.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('node:fs'); 4 | 5 | const parseArgs = require('minimist'); 6 | 7 | const cleaner = require('./index.js'); 8 | 9 | const argv = parseArgs(process.argv.slice(2)); 10 | const filename = argv['_'][0]; 11 | const inPlace = getOptAsBool(argv['in-place']); 12 | 13 | const options = { 14 | 'allow-attributes-without-values': getOptAsBool(argv['allow-attributes-without-values']), 15 | 'break-around-comments': getOptAsBool(argv['break-around-comments']), 16 | 'break-around-tags': getOptAsArray(argv['break-around-tags']), 17 | 'decode-entities': getOptAsBool(argv['decode-entities']), 18 | 'indent': argv['indent'], 19 | 'lower-case-tags': getOptAsBool(argv['lower-case-tags']), 20 | 'lower-case-attribute-names': getOptAsBool(argv['lower-case-attribute-names']), 21 | 'preserve-tags': getOptAsArray(argv['preserve-tags']), 22 | 'remove-attributes': getOptAsArray(argv['remove-attributes']), 23 | 'remove-comments': getOptAsBool(argv['remove-comments']), 24 | 'remove-empty-tags': getOptAsArray(argv['remove-empty-tags']), 25 | 'remove-tags': getOptAsArray(argv['remove-tags']), 26 | 'wrap': getOptAsInt(argv['wrap']), 27 | 'add-break-around-tags': getOptAsArray(argv['add-break-around-tags']), 28 | 'add-remove-attributes': getOptAsArray(argv['add-remove-attributes']), 29 | 'add-remove-tags': getOptAsArray(argv['add-remove-tags']) 30 | }; 31 | 32 | function getOptAsArray(opt) { 33 | if (opt === undefined) { 34 | return undefined; 35 | } 36 | 37 | if (Array.isArray(opt)) { 38 | return opt 39 | .map(o => o.split(',')) 40 | .reduce((prev, curr) => prev.concat(curr)); 41 | } 42 | 43 | return opt.split(','); 44 | } 45 | 46 | function getOptAsBool(opt) { 47 | if (opt === undefined) { 48 | return undefined; 49 | } 50 | 51 | 
return opt === true || opt === 'true'; 52 | } 53 | 54 | function getOptAsInt(opt) { 55 | if (opt === undefined) { 56 | return undefined; 57 | } 58 | 59 | const val = parseInt(opt); 60 | 61 | return isNaN(val) ? undefined : val; 62 | } 63 | 64 | function read(filename, callback) { 65 | return fs.readFile(filename, 'utf8', (err, data) => { 66 | if (err) { 67 | throw err; 68 | } 69 | 70 | callback(data); 71 | }); 72 | } 73 | 74 | function write(html, filename) { 75 | return fs.writeFile(filename, html + '\n', err => { 76 | if (err) { 77 | throw err; 78 | } 79 | }); 80 | } 81 | 82 | read(filename || process.stdin.fd, data => { 83 | cleaner.clean(data, options, html => { 84 | if (filename && inPlace) { 85 | return write(html, filename); 86 | } 87 | 88 | write(html, process.stdout.fd); 89 | }); 90 | }); 91 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const htmlparser = require('htmlparser2'); 2 | 3 | const voidElements = [ 4 | 'area', 5 | 'base', 6 | 'basefont', 7 | 'br', 8 | 'col', 9 | 'command', 10 | 'embed', 11 | 'frame', 12 | 'hr', 13 | 'img', 14 | 'input', 15 | 'isindex', 16 | 'keygen', 17 | 'link', 18 | 'meta', 19 | 'param', 20 | 'source', 21 | 'track', 22 | 'wbr' 23 | ]; 24 | 25 | let options = {}; 26 | 27 | function setup(opt) { 28 | options = { 29 | 'allow-attributes-without-values': opt['allow-attributes-without-values'] === true ? true : false, 30 | 'break-around-comments': opt['break-around-comments'] === false ? false : true, 31 | 'break-around-tags': opt['break-around-tags'] || [ 32 | 'blockquote', 33 | 'body', 34 | 'br', 35 | 'div', 36 | 'h1', 37 | 'h2', 38 | 'h3', 39 | 'h4', 40 | 'h5', 41 | 'h6', 42 | 'head', 43 | 'hr', 44 | 'link', 45 | 'meta', 46 | 'p', 47 | 'table', 48 | 'td', 49 | 'title', 50 | 'tr' 51 | ], 52 | 'decode-entities': opt['decode-entities'] === true ? 
true : false, 53 | 'indent': opt['indent'] || ' ', 54 | 'lower-case-tags': opt['lower-case-tags'] === false ? false : true, 55 | 'lower-case-attribute-names': opt['lower-case-attribute-names'] === false ? false : true, 56 | 'preserve-tags': opt['preserve-tags'] || [ 57 | 'math', 58 | 'script', 59 | 'style', 60 | 'svg' 61 | ], 62 | 'remove-attributes': opt['remove-attributes'] || [ 63 | 'align', 64 | 'bgcolor', 65 | 'border', 66 | 'cellpadding', 67 | 'cellspacing', 68 | 'color', 69 | 'height', 70 | 'target', 71 | 'valign', 72 | 'width' 73 | ], 74 | 'remove-comments': opt['remove-comments'] === true ? true : false, 75 | 'remove-empty-tags': opt['remove-empty-tags'] || [], 76 | 'remove-tags': opt['remove-tags'] || [ 77 | 'center', 78 | 'font' 79 | ], 80 | 'wrap': opt['wrap'] >= 0 ? opt['wrap'] : 120 81 | }; 82 | 83 | if (opt['add-break-around-tags']) { 84 | options['break-around-tags'] = options['break-around-tags'].concat(opt['add-break-around-tags']); 85 | } 86 | 87 | if (opt['add-remove-attributes']) { 88 | options['remove-attributes'] = options['remove-attributes'].concat(opt['add-remove-attributes']); 89 | } 90 | 91 | if (opt['add-remove-tags']) { 92 | options['remove-tags'] = options['remove-tags'].concat(opt['add-remove-tags']); 93 | } 94 | } 95 | 96 | function breakAround(node) { 97 | if (shouldRemove(node)) { 98 | return false; 99 | } 100 | 101 | if (node.type == 'text') { 102 | return false; 103 | } 104 | 105 | if (node.type == 'comment') { 106 | return options['break-around-comments']; 107 | } 108 | 109 | if (options['break-around-tags'].includes(node.name)) { 110 | return true; 111 | } 112 | 113 | return breakWithin(node); 114 | } 115 | 116 | function breakWithin(node) { 117 | if (shouldRemove(node)) { 118 | return false; 119 | } 120 | 121 | if (node.type != 'tag') { 122 | return false; 123 | } 124 | 125 | return node.children.some(breakAround) || node.children.some(breakWithin); 126 | } 127 | 128 | function isEmpty(node) { 129 | if (node.type == 'text') { 
130 | return !node.data.trim(); 131 | } 132 | 133 | if (node.type == 'comment') { 134 | return !node.data.trim(); 135 | } 136 | 137 | if (voidElements.includes(node.name)) { 138 | return false; 139 | } 140 | 141 | return !node.children.length || node.children.every(isEmpty); 142 | } 143 | 144 | function removeExtraSpace(text) { 145 | return text.replace(/\s+/g, ' '); 146 | } 147 | 148 | function shouldRemove(node) { 149 | if (node.type == 'text') { 150 | return isEmpty(node); 151 | } 152 | 153 | if (node.type == 'comment') { 154 | return options['remove-comments'] || isEmpty(node); 155 | } 156 | 157 | if (isListedInOptions('remove-empty-tags', node.name)) { 158 | return isEmpty(node); 159 | } 160 | 161 | return isListedInOptions('remove-tags', node.name); 162 | } 163 | 164 | function isListedInOptions(optionsArrayName, name) { 165 | return options[optionsArrayName].some(option => { 166 | return option instanceof RegExp && option.test(name) || option === name; 167 | }); 168 | } 169 | 170 | function renderText(node) { 171 | if (shouldRemove(node)) { 172 | return ''; 173 | } 174 | 175 | let text = removeExtraSpace(node.data); 176 | 177 | if (!node.prev || breakAround(node.prev)) { 178 | text = text.trimLeft(); 179 | } 180 | 181 | if (!node.next || breakAround(node.next)) { 182 | text = text.trimRight(); 183 | } 184 | 185 | return text; 186 | } 187 | 188 | function renderComment(node) { 189 | if (shouldRemove(node)) { 190 | return ''; 191 | } 192 | 193 | const comment = ''; 194 | 195 | if (breakAround(node)) { 196 | return '\n' + comment + '\n'; 197 | } 198 | 199 | return comment; 200 | } 201 | 202 | function renderTag(node) { 203 | if (shouldRemove(node)) { 204 | if (isEmpty(node)) { 205 | return ''; 206 | } 207 | 208 | return render(node.children); 209 | } 210 | 211 | let openTag = '<' + node.name; 212 | 213 | for (let attrib in node.attribs) { 214 | if (!isListedInOptions('remove-attributes', attrib)) { 215 | if (!node.attribs[attrib] && 
options['allow-attributes-without-values']) { 216 | openTag += ' ' + attrib; 217 | } else { 218 | openTag += ` ${attrib}="${removeExtraSpace(node.attribs[attrib])}"`; 219 | } 220 | } 221 | } 222 | 223 | openTag += '>'; 224 | 225 | if (voidElements.includes(node.name)) { 226 | if (breakAround(node)) { 227 | return '\n' + openTag + '\n'; 228 | } 229 | 230 | return openTag; 231 | } 232 | 233 | let closeTag = ''; 234 | 235 | if (breakAround(node)) { 236 | openTag = '\n' + openTag; 237 | closeTag = closeTag + '\n'; 238 | } 239 | 240 | if (breakWithin(node)) { 241 | openTag = openTag + '\n'; 242 | closeTag = '\n' + closeTag; 243 | } 244 | 245 | return openTag + render(node.children) + closeTag; 246 | } 247 | 248 | function renderDirective(node) { 249 | return '<' + node.data + '>'; 250 | } 251 | 252 | function render(nodes) { 253 | let html = ''; 254 | 255 | nodes.forEach(node => { 256 | if (node.type == 'root') { 257 | html += render(node.children); 258 | return; 259 | } 260 | 261 | if (node.type == 'text') { 262 | html += renderText(node); 263 | return; 264 | } 265 | 266 | if (node.type == 'comment') { 267 | html += renderComment(node); 268 | return; 269 | } 270 | 271 | if (node.type == 'directive') { 272 | html += renderDirective(node) 273 | return; 274 | } 275 | 276 | html += renderTag(node); 277 | }); 278 | 279 | // remove extra line breaks 280 | return html.replace(/\n+/g, '\n'); 281 | } 282 | 283 | function wrap(line, indent) { 284 | // find the last space before the column limit 285 | let bound = line.lastIndexOf(' ', options['wrap']); 286 | 287 | if (bound == -1) { 288 | // there are no spaces before the colum limit 289 | // so find the first space after it 290 | bound = line.indexOf(' ', options['wrap']); 291 | 292 | if (bound == -1) { 293 | // there are no spaces in the line 294 | // so we can't wrap it 295 | return line; 296 | } 297 | } 298 | 299 | const line1 = line.substr(0, bound); 300 | let line2 = indent + options['indent'].repeat(2) + line.substr(bound 
+ 1); 301 | 302 | if (line1.trim().length == 0) { 303 | // there are no spaces in the line other than the indent 304 | // so we can't wrap it 305 | return line; 306 | } 307 | 308 | if (line2.length > options['wrap']) { 309 | line2 = wrap(line2, indent); 310 | } 311 | 312 | return line1 + '\n' + line2; 313 | } 314 | 315 | function indent(html) { 316 | let indentLevel = 0; 317 | const openTagRe = /^<(\w+)[^>]*>$/; 318 | const closeTagRe = /^<\/(\w+)>$/; 319 | 320 | return html.split('\n').map(line => { 321 | const closeTagMatch = line.match(closeTagRe); 322 | 323 | if (closeTagMatch) { 324 | indentLevel--; 325 | } 326 | 327 | const indent = options['indent'].repeat(indentLevel); 328 | const indented = indent + line; 329 | 330 | const openTagMatch = line.match(openTagRe); 331 | 332 | if (openTagMatch && !voidElements.includes(openTagMatch[1])) { 333 | indentLevel++; 334 | } 335 | 336 | if (options['wrap'] && indented.length > options['wrap']) { 337 | return wrap(indented, indent); 338 | } 339 | 340 | return indented; 341 | }).join('\n'); 342 | } 343 | 344 | const preserveTagReplacements = {}; 345 | 346 | function preserveTags(html) { 347 | const tagPattern = options['preserve-tags'].join('|'); 348 | const re = new RegExp(`<(?:${tagPattern})[^>]*>.*?<\/(?:${tagPattern})>`, 'gs'); 349 | 350 | return html.replace(re, (match, offset) => { 351 | preserveTagReplacements[offset] = match; 352 | return ``; 353 | }); 354 | } 355 | 356 | function undoPreserveTags(html) { 357 | const re = //g; 358 | 359 | return html.replace(re, (_, offset) => { 360 | return preserveTagReplacements[offset]; 361 | }); 362 | } 363 | 364 | function clean(html, opt, callback) { 365 | if (typeof opt == 'function') { 366 | callback = opt; 367 | opt = null; 368 | } 369 | 370 | setup(opt || {}); 371 | 372 | const handler = new htmlparser.DomHandler((err, dom) => { 373 | if (err) { 374 | throw err; 375 | } 376 | 377 | callback( 378 | undoPreserveTags( 379 | indent( 380 | render(dom) 381 | ).trim() 382 | ) 
383 | ); 384 | }); 385 | 386 | const parser = new htmlparser.Parser(handler, { 387 | decodeEntities: options['decode-entities'], 388 | lowerCaseTags: options['lower-case-tags'], 389 | lowerCaseAttributeNames: options['lower-case-attribute-names'], 390 | }); 391 | 392 | parser.write( 393 | preserveTags(html) 394 | ); 395 | 396 | parser.end(); 397 | } 398 | 399 | module.exports = {clean}; 400 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "clean-html", 3 | "version": "2.0.1", 4 | "lockfileVersion": 3, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "clean-html", 9 | "version": "2.0.1", 10 | "license": "Unlicense", 11 | "dependencies": { 12 | "htmlparser2": "^8.0.2", 13 | "minimist": "^1.2.8" 14 | }, 15 | "bin": { 16 | "clean-html": "cmd.js" 17 | } 18 | }, 19 | "node_modules/dom-serializer": { 20 | "version": "2.0.0", 21 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", 22 | "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", 23 | "dependencies": { 24 | "domelementtype": "^2.3.0", 25 | "domhandler": "^5.0.2", 26 | "entities": "^4.2.0" 27 | }, 28 | "funding": { 29 | "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" 30 | } 31 | }, 32 | "node_modules/domelementtype": { 33 | "version": "2.3.0", 34 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", 35 | "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", 36 | "funding": [ 37 | { 38 | "type": "github", 39 | "url": "https://github.com/sponsors/fb55" 40 | } 41 | ] 42 | }, 43 | "node_modules/domhandler": { 44 | "version": "5.0.3", 45 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", 46 | "integrity": 
"sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", 47 | "dependencies": { 48 | "domelementtype": "^2.3.0" 49 | }, 50 | "engines": { 51 | "node": ">= 4" 52 | }, 53 | "funding": { 54 | "url": "https://github.com/fb55/domhandler?sponsor=1" 55 | } 56 | }, 57 | "node_modules/domutils": { 58 | "version": "3.0.1", 59 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.0.1.tgz", 60 | "integrity": "sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==", 61 | "dependencies": { 62 | "dom-serializer": "^2.0.0", 63 | "domelementtype": "^2.3.0", 64 | "domhandler": "^5.0.1" 65 | }, 66 | "funding": { 67 | "url": "https://github.com/fb55/domutils?sponsor=1" 68 | } 69 | }, 70 | "node_modules/entities": { 71 | "version": "4.4.0", 72 | "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz", 73 | "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==", 74 | "engines": { 75 | "node": ">=0.12" 76 | }, 77 | "funding": { 78 | "url": "https://github.com/fb55/entities?sponsor=1" 79 | } 80 | }, 81 | "node_modules/htmlparser2": { 82 | "version": "8.0.2", 83 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", 84 | "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", 85 | "funding": [ 86 | "https://github.com/fb55/htmlparser2?sponsor=1", 87 | { 88 | "type": "github", 89 | "url": "https://github.com/sponsors/fb55" 90 | } 91 | ], 92 | "dependencies": { 93 | "domelementtype": "^2.3.0", 94 | "domhandler": "^5.0.3", 95 | "domutils": "^3.0.1", 96 | "entities": "^4.4.0" 97 | } 98 | }, 99 | "node_modules/minimist": { 100 | "version": "1.2.8", 101 | "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", 102 | "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", 103 | 
"funding": { 104 | "url": "https://github.com/sponsors/ljharb" 105 | } 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "clean-html", 3 | "version": "2.0.1", 4 | "description": "HTML cleaner and beautifier", 5 | "main": "index.js", 6 | "bin": "cmd.js", 7 | "dependencies": { 8 | "htmlparser2": "^8.0.2", 9 | "minimist": "^1.2.8" 10 | }, 11 | "files": [ 12 | "cmd.js", 13 | "index.js", 14 | "package.json", 15 | "README.md", 16 | "release-notes.md", 17 | "UNLICENSE" 18 | ], 19 | "scripts": { 20 | "test": "node test.js" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "git@github.com:dave-kennedy/clean-html.git" 25 | }, 26 | "keywords": [ 27 | "beautify", 28 | "clean", 29 | "html", 30 | "pretty", 31 | "tidy" 32 | ], 33 | "author": "Dave Kennedy (http://github.com/dave-kennedy)", 34 | "license": "Unlicense", 35 | "bugs": { 36 | "url": "https://github.com/dave-kennedy/clean-html/issues" 37 | }, 38 | "homepage": "https://github.com/dave-kennedy/clean-html" 39 | } 40 | -------------------------------------------------------------------------------- /test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 |
Currently we have these articles available: 4 | 5 |
6 |

The History of Foo
7 | An informative piece of information.

8 |

A Horse Walked Into a Bar
The bartender said 9 | "Why the long face?"

10 |
11 |
14 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | const assert = require('node:assert/strict'); 2 | const childProcess = require('node:child_process'); 3 | const fs = require('node:fs'); 4 | const os = require('node:os'); 5 | const path = require('node:path'); 6 | const util = require('node:util'); 7 | 8 | const cleaner = require('./index.js'); 9 | 10 | const results = []; 11 | const tests = []; 12 | 13 | function logFail(message) { 14 | return console.error(`\x1b[31m${message}\x1b[0m`); 15 | } 16 | 17 | function logPass(message) { 18 | return console.log(`\x1b[32m${message}\x1b[0m`); 19 | } 20 | 21 | function registerTest(description, callback) { 22 | tests.push({description, callback}); 23 | } 24 | 25 | function runTest(description, callback) { 26 | try { 27 | callback(); 28 | } catch (error) { 29 | if (error instanceof assert.AssertionError) { 30 | const message = `✗ ${description}\n` + 31 | ` Expected: ${util.inspect(error.expected)}\n` + 32 | ` Actual: ${util.inspect(error.actual)}`; 33 | 34 | logFail(message); 35 | results.push({message, result: 'fail'}); 36 | return; 37 | } 38 | 39 | const message = `✗ ${description}: ${error}`; 40 | logFail(message); 41 | results.push({message, result: 'fail'}); 42 | return; 43 | } 44 | 45 | const message = `✓ ${description}`; 46 | logPass(message); 47 | results.push({message, result: 'pass'}); 48 | } 49 | 50 | function runTests(filter) { 51 | const filteredTests = tests.filter(test => !filter || filter(test)); 52 | 53 | if (filteredTests.length === 0) { 54 | logFail('No tests satisfy filter'); 55 | process.exit(1); 56 | } 57 | 58 | for (const test of filteredTests) { 59 | runTest(test.description, test.callback); 60 | } 61 | } 62 | 63 | function summarizeResults() { 64 | const numPassed = results.filter(r => r.result == 'pass').length; 65 | const numFailed = results.filter(r => r.result == 
'fail').length; 66 | 67 | if (numPassed > 0) { 68 | logPass(`Passed: ${numPassed}`); 69 | } 70 | 71 | if (numFailed > 0) { 72 | logFail(`Failed: ${numFailed}`); 73 | } 74 | } 75 | 76 | function test(description, callback) { 77 | registerTest(description, callback); 78 | } 79 | 80 | test('text is unchanged', () => { 81 | cleaner.clean('Foo Bar', html => { 82 | assert.equal(html, 'Foo Bar'); 83 | }); 84 | }); 85 | 86 | test('extra whitespace is replaced by a single space', () => { 87 | cleaner.clean('Foo \n Bar', html => { 88 | assert.equal(html, 'Foo Bar'); 89 | }); 90 | }); 91 | 92 | test('extra whitespace inside comment is replaced by a single space', () => { 93 | cleaner.clean('', html => { 94 | assert.equal(html, ''); 95 | }); 96 | }); 97 | 98 | test('output is trimmed', () => { 99 | cleaner.clean(' foo\n', html => { 100 | assert.equal(html, 'foo'); 101 | }); 102 | }); 103 | 104 | test('directive is unchanged', () => { 105 | cleaner.clean('', html => { 106 | assert.equal(html, '') 107 | }); 108 | }); 109 | 110 | test('empty value is added when allow-attributes-without-values is false', () => { 111 | cleaner.clean('', {'allow-attributes-without-values': false}, html => { 112 | assert.equal(html, ''); 113 | }); 114 | }); 115 | 116 | test('empty value not added when allow-attributes-without-values is true', () => { 117 | cleaner.clean('', {'allow-attributes-without-values': true}, html => { 118 | assert.equal(html, ''); 119 | }); 120 | }); 121 | 122 | test('line breaks are not added around comments when break-around-comments is false', () => { 123 | cleaner.clean('fooqux', {'break-around-comments': false}, html => { 124 | assert.equal(html, 'fooqux'); 125 | }); 126 | }); 127 | 128 | test('line breaks are added around comments when break-around-comments is true', () => { 129 | cleaner.clean('fooqux', {'break-around-comments': true}, html => { 130 | assert.equal(html, 'foo\n\nqux'); 131 | }); 132 | }); 133 | 134 | test('line breaks are not added around tags when not 
included in break-around-tags', () => { 135 | cleaner.clean('foo
bar', {'break-around-tags': []}, html => { 136 | assert.equal(html, 'foo
bar'); 137 | }); 138 | }); 139 | 140 | test('line breaks are added around tags when included in break-around-tags', () => { 141 | cleaner.clean('foo
bar', {'break-around-tags': ['div']}, html => { 142 | assert.equal(html, 'foo\n
\nbar'); 143 | }); 144 | }); 145 | 146 | test('non-breaking space is not replaced by a single space when decode-entities is false', () => { 147 | cleaner.clean('Foo Bar', {'decode-entities': false}, html => { 148 | assert.equal(html, 'Foo Bar'); 149 | }); 150 | }); 151 | 152 | test('non-breaking space is replaced by a single space when decode-entities is true', () => { 153 | cleaner.clean('Foo Bar', {'decode-entities': true}, html => { 154 | assert.equal(html, 'Foo Bar'); 155 | }); 156 | }); 157 | 158 | test('tag is lowercased when lower-case-tags is true', () => { 159 | cleaner.clean('bar', {'lower-case-tags': true}, html => { 160 | assert.equal(html, 'bar'); 161 | }); 162 | }); 163 | 164 | test('tag is not lowercased when lower-case-tags is false', () => { 165 | cleaner.clean('bar', {'lower-case-tags': false}, html => { 166 | assert.equal(html, 'bar'); 167 | }); 168 | }); 169 | 170 | test('attribute name is lowercased when lower-case-attribute-names is true', () => { 171 | cleaner.clean('bar', {'lower-case-attribute-names': true}, html => { 172 | assert.equal(html, 'bar'); 173 | }); 174 | }); 175 | 176 | test('attribute name is not lowercased when lower-case-attribute-names is false', () => { 177 | cleaner.clean('bar', {'lower-case-attribute-names': false}, html => { 178 | assert.equal(html, 'bar'); 179 | }); 180 | }); 181 | 182 | test('tag is not preserved when not included in preserve-tags', () => { 183 | const input = ``; 190 | 191 | cleaner.clean(input, {'preserve-tags': []}, output => { 192 | assert.notEqual(output, input); 193 | }); 194 | }); 195 | 196 | test('tag is preserved when included in preserve-tags', () => { 197 | const input = ``; 204 | 205 | cleaner.clean(input, {'preserve-tags': ['script']}, output => { 206 | assert.equal(output, input); 207 | }); 208 | }); 209 | 210 | test('attribute is not removed when not included in remove-attributes', () => { 211 | cleaner.clean('foo', {'remove-attributes': []}, html => { 212 | assert.equal(html, 'foo'); 
213 | }); 214 | }); 215 | 216 | test('attribute is removed when included in remove-attributes', () => { 217 | cleaner.clean('foo', {'remove-attributes': ['color']}, html => { 218 | assert.equal(html, 'foo'); 219 | }); 220 | }); 221 | 222 | test('attribute is removed when it matches at least one pattern included in remove-attributes', () => { 223 | cleaner.clean('foo', {'remove-attributes': [/_test-[a-z0-9-]+/i]}, html => { 224 | assert.equal(html, 'foo'); 225 | }); 226 | }); 227 | 228 | test('comment is not removed when remove-comments is false', () => { 229 | cleaner.clean('', {'remove-comments': false}, html => { 230 | assert.equal(html, ''); 231 | }); 232 | }); 233 | 234 | test('comment is removed when remove-comments is true', () => { 235 | cleaner.clean('', {'remove-comments': true}, html => { 236 | assert.equal(html, ''); 237 | }); 238 | }); 239 | 240 | test('empty tag is not removed when not included in remove-empty-tags', () => { 241 | cleaner.clean('

', {'remove-empty-tags': []}, html => { 242 | assert.equal(html, '

'); 243 | }); 244 | }); 245 | 246 | test('empty tag is removed when included in remove-empty-tags', () => { 247 | cleaner.clean('

', {'remove-empty-tags': ['p']}, html => { 248 | assert.equal(html, ''); 249 | }); 250 | }); 251 | 252 | test('non-empty tag is not removed when included in remove-empty-tags', () => { 253 | cleaner.clean('

', {'remove-empty-tags': ['p']}, html => { 254 | assert.equal(html, '

'); 255 | }); 256 | }); 257 | 258 | test('empty tag is removed when it matches at least one pattern included in remove-empty-tags', () => { 259 | cleaner.clean('', {'remove-empty-tags': [/^app-.*/i]}, html => { 260 | assert.equal(html, ''); 261 | }); 262 | }); 263 | 264 | test('tag is not removed when not included in remove-tags', () => { 265 | cleaner.clean('foo', {'remove-tags': []}, html => { 266 | assert.equal(html, 'foo'); 267 | }); 268 | }); 269 | 270 | test('tag is removed and child is preserved when included in remove-tags', () => { 271 | cleaner.clean('foo', {'remove-tags': ['font']}, html => { 272 | assert.equal(html, 'foo'); 273 | }); 274 | }); 275 | 276 | test('tag is removed and child is preserved when it matches at least one pattern included in remove-tags', () => { 277 | cleaner.clean('foo', {'remove-tags': [/app-.+/]}, html => { 278 | assert.equal(html, 'foo'); 279 | }); 280 | }); 281 | 282 | // indent tests 283 | 284 | test('indent is not added when child is text', () => { 285 | cleaner.clean('foobarqux', {'indent': ' '}, html => { 286 | assert.equal(html, 'foobarqux'); 287 | }); 288 | }); 289 | 290 | test('indent is not added when child is comment and break-around-comments is false', () => { 291 | cleaner.clean('fooqux', {'break-around-comments': false, 'indent': ' '}, html => { 292 | assert.equal(html, 'fooqux'); 293 | }); 294 | }); 295 | 296 | test('indent is added when child is comment and break-around-comments is true', () => { 297 | cleaner.clean('fooqux', {'break-around-comments': true, 'indent': ' '}, html => { 298 | assert.equal(html, 'foo\n\n \n\nqux'); 299 | }); 300 | }); 301 | 302 | test('indent is not added when child tag is not included in break-around-tags', () => { 303 | cleaner.clean('foo
bar
qux', {'break-around-tags': [], 'indent': ' '}, html => { 304 | assert.equal(html, 'foo
bar
qux'); 305 | }); 306 | }); 307 | 308 | test('indent is added when child tag is included in break-around-tags', () => { 309 | cleaner.clean('foo
bar
qux', {'break-around-tags': ['div'], 'indent': ' '}, html => { 310 | assert.equal(html, 'foo\n\n
bar
\n
\nqux'); 311 | }); 312 | }); 313 | 314 | test('indent is added when child tag is not included in break-around-tags but descendant is', () => { 315 | cleaner.clean('foo
bar
qux', {'break-around-tags': ['div'], 'indent': ' '}, html => { 316 | assert.equal(html, 'foo\n\n \n
bar
\n
\n
\nqux'); 317 | }); 318 | }); 319 | 320 | test('indent is not added inside comment', () => { 321 | cleaner.clean('', {'break-around-tags': ['div'], 'indent': ' '}, html => { 322 | assert.equal(html, ''); 323 | }); 324 | }); 325 | 326 | test('indent is not added after comment', () => { 327 | cleaner.clean('
foo
', {'break-around-tags': ['div'], 'indent': ' '}, html => { 328 | assert.equal(html, '\n
foo
'); 329 | }); 330 | }); 331 | 332 | // wrap tests 333 | 334 | test('long line is wrapped with hanging indent', () => { 335 | cleaner.clean('
I prefer the concrete, the graspable, the proveable.
', {'wrap': 40}, html => { 336 | assert.equal(html, '
I prefer the concrete, the\n graspable, the proveable.
'); 337 | }); 338 | }); 339 | 340 | test('long line without whitespace is not wrapped', () => { 341 | cleaner.clean('
Iprefertheconcrete,thegraspable,theproveable.
', {'wrap': 40}, html => { 342 | assert.equal(html, '
Iprefertheconcrete,thegraspable,theproveable.
'); 343 | }); 344 | }); 345 | 346 | test('long line inside nested tag is wrapped with hanging indent', () => { 347 | cleaner.clean('
I prefer the concrete, the graspable, the proveable.
', {'wrap': 40}, html => { 348 | assert.equal(html, '
\n
I prefer the concrete, the\n graspable, the proveable.
\n
'); 349 | }); 350 | }); 351 | 352 | test('long line without whitespace inside nested tag is not wrapped', () => { 353 | cleaner.clean('
Iprefertheconcrete,thegraspable,theproveable.
', {'wrap': 40}, html => { 354 | assert.equal(html, '
\n
Iprefertheconcrete,thegraspable,theproveable.
\n
'); 355 | }); 356 | }); 357 | 358 | test('long comment is wrapped and indented', () => { 359 | cleaner.clean('', {'wrap': 40}, html => { 360 | assert.equal(html, ''); 361 | }); 362 | }); 363 | 364 | // command line tests 365 | 366 | test('command line read from stdin and write to stdout', () => { 367 | const input = fs.readFileSync('test.html', 'utf8'); 368 | 369 | const expected = ` 370 | 371 | 386 | 387 |
372 | Currently we have these articles available: 373 |
374 |

375 | The History of Foo 376 |
377 | An informative piece of information. 378 |

379 |

380 | A Horse Walked Into a Bar 381 |
382 | The bartender said "Why the long face?" 383 |

384 |
385 |
\n`; 388 | 389 | const actual = childProcess.execFileSync('node', ['cmd.js'], {encoding: 'utf8', input: input}); 390 | assert.equal(actual, expected); 391 | }); 392 | 393 | test('command line read from file and write to stdout', () => { 394 | const expected = ` 395 | 396 | 411 | 412 |
397 | Currently we have these articles available: 398 |
399 |

400 | The History of Foo 401 |
402 | An informative piece of information. 403 |

404 |

405 | A Horse Walked Into a Bar 406 |
407 | The bartender said "Why the long face?" 408 |

409 |
410 |
\n`; 413 | 414 | const actual = childProcess.execFileSync('node', ['cmd.js', 'test.html'], {encoding: 'utf8'}); 415 | assert.equal(actual, expected); 416 | }); 417 | 418 | test('command line read from file and write to stdout with options', () => { 419 | const expected = `Currently we have these articles available: 420 |

421 | The History of Foo 422 |
423 | An informative piece of information. 424 |

425 |

426 | A Horse Walked Into a Bar 427 |
428 | The bartender said "Why the long face?" 429 |

\n`; 430 | 431 | const actual = childProcess.execFileSync( 432 | 'node', 433 | ['cmd.js', 'test.html', '--add-remove-tags', 'table,tr,td,blockquote'], 434 | {encoding: 'utf8'} 435 | ); 436 | 437 | assert.equal(actual, expected); 438 | }); 439 | 440 | test('command line read from file and write to file in place', () => { 441 | const expected = ` 442 | 443 | 458 | 459 |
444 | Currently we have these articles available: 445 |
446 |

447 | The History of Foo 448 |
449 | An informative piece of information. 450 |

451 |

452 | A Horse Walked Into a Bar 453 |
454 | The bartender said "Why the long face?" 455 |

456 |
457 |
\n`; 460 | 461 | const tempDirPrefix = path.join(os.tmpdir(), 'clean-html-'); 462 | const tempDir = fs.mkdtempSync(tempDirPrefix); 463 | 464 | try { 465 | const tempFile = path.join(tempDir, 'test.html'); 466 | fs.copyFileSync('test.html', tempFile, fs.constants.COPYFILE_EXCL); 467 | 468 | childProcess.execFileSync('node', ['cmd.js', tempFile, '--in-place']); 469 | 470 | const actual = fs.readFileSync(tempFile, 'utf8'); 471 | assert.equal(actual, expected); 472 | } finally { 473 | fs.rmSync(tempDir, {recursive: true}); 474 | } 475 | }); 476 | 477 | const args = process.argv.slice(2); 478 | 479 | if (args.length > 0) { 480 | const argPatterns = args.map(arg => new RegExp(arg)); 481 | runTests((test) => argPatterns.some(pattern => pattern.test(test.description))); 482 | } else { 483 | runTests(); 484 | } 485 | 486 | summarizeResults(); 487 | --------------------------------------------------------------------------------