/*
 * deba.js 0.17.0
 *
 * This file was recovered from a flattened repository dump. The library module
 * (deba.js) follows as code. The other files contained in the dump are
 * reproduced verbatim in this comment so that none of their content is lost.
 *
 * ├── .gitignore
 * ├── cli.js
 * ├── package.json
 * ├── LICENSE.txt
 * └── deba.js
 *
 * /.gitignore:
 *   node_modules/
 *   package-lock.json
 *   yarn.lock
 *   *.html
 *   *.txt
 *
 * /cli.js:
 *   #!/usr/bin/env node
 *
 *   import { default as deba, Utils } from "./deba.js";
 *
 *   import { JSDOM } from "jsdom";
 *
 *   function serialize(jsdom) {
 *     process.stdout.write(deba(jsdom.window.document) + "\n");
 *   }
 *
 *   const arg = process.argv[2];
 *
 *   if(arg.startsWith("http:") || arg.startsWith("https:") || arg.startsWith("file:")) {
 *     JSDOM.fromURL(arg).then(serialize);
 *   }
 *   else {
 *     JSDOM.fromFile(arg).then(serialize);
 *   }
 *
 * /package.json:
 *   {
 *     "name": "deba",
 *     "version": "0.17.0",
 *     "description": "Deba takes a HTML document or fragment and extracts the textual content into a subset of Markdown.",
 *     "type": "module",
 *     "main": "deba.js",
 *     "scripts": {
 *       "test": "echo \"Error: no test specified\" && exit 1"
 *     },
 *     "repository": "github:bloopletech/deba.js",
 *     "author": "Brenton \"B-Train\" Fletcher",
 *     "license": "MIT",
 *     "bugs": "https://github.com/bloopletech/deba.js/issues",
 *     "homepage": "https://github.com/bloopletech/deba.js",
 *     "dependencies": {},
 *     "peerDependencies": {
 *       "jsdom": "^12.0.0"
 *     },
 *     "bin": {
 *       "deba": "cli.js"
 *     }
 *   }
 *
 * /LICENSE.txt:
 *   Copyright (c) 2017-2018 Brenton Fletcher (http://bloople.net i@bloople.net)
 *
 *   Permission is hereby granted, free of charge, to any person
 *   obtaining a copy of this software and associated documentation
 *   files (the "Software"), to deal in the Software without
 *   restriction, including without limitation the rights to use,
 *   copy, modify, merge, publish, distribute, sublicense, and/or sell
 *   copies of the Software, and to permit persons to whom the
 *   Software is furnished to do so, subject to the following
 *   conditions:
 *
 *   The above copyright notice and this permission notice shall be
 *   included in all copies or substantial portions of the Software.
 *
 *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 *   OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 *   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 *   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 *   OTHER DEALINGS IN THE SOFTWARE.
 *
 * /deba.js: (the code below)
 */

"use babel";
"use strict";

/** String helpers shared by the extractor and the block types. */
const Utils = {
  /**
   * Report whether `text` contains at least one non-whitespace character.
   *
   * @param {?string} text - Candidate text; `null`/`undefined` count as absent.
   * @returns {boolean} True when the text is non-empty and not whitespace-only.
   */
  isPresent: function(text) {
    if(text == null) return false;
    return /\S/.test(String(text));
  },

  /**
   * Backslash-escape characters in `text` that would otherwise be read as Markdown syntax.
   *
   * @param {?string} text - Raw text; `null`/`undefined` are treated as the empty string.
   * @returns {string} The escaped text.
   */
  escape: function(text) {
    /*
    From the Commonmark spec, version 0.29:
    An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F), :, ;, <, =, >, ?, @ (U+003A–0040), [, \, ], ^, _, ` (U+005B–0060), {, |, }, or ~ (U+007B–007E).

    Breaking this up into characters that need to be escaped all the time:
    * because it can mean emphasis and also can mean a list marker
    < because it can mean a HTML open tag or HTML close tag
    [ because it can mean a link title or image title
    \ because it can escape the following character
    _ because it can mean emphasis
    ` because it can mean a code fence
    ~ because it can mean a code fence

    Characters that need to be escaped if they are the start of a block (with optional leading whitespace):
    # because it can be a heading start
    + because it can be a list marker
    - because it can be a list marker
    = because it can be a heading start
    > because it can be a blockquote marker

    Characters that need to be escaped in certain situations:
    & when it is followed by some characters and then a semicolon
    . when it is preceded by a number, because then it can be a list item

    Characters that can be ignored:
    ! because it only has meaning before a '[' or after a '<', and both of those will be escaped
    " because it only has meaning in HTML attributes and link titles, both of which will be escaped
    $ because it has no special meaning
    % because it has no special meaning
    ' because it only has meaning in HTML attributes and link titles, both of which will be escaped
    ( because it only has meaning in links and images, both of which will be escaped
    ) because it only has meaning in links and images, both of which will be escaped
    , because it has no special meaning
    / because it only has meaning in a HTML close tag, which will be escaped
    : because it only has meaning in links and HTML tags, both of which will be escaped
    ; because it only has meaning in HTML entities, which will be escaped
    ? because it has no special meaning
    @ because it has no special meaning
    ] because it only has meaning in links and images, both of which will be escaped
    ^ because it has no special meaning
    { because it has no special meaning
    | because it has no special meaning
    } because it has no special meaning
    */

    let escaped = text == null ? "" : String(text);

    //Escaping that needs to be done all the time.
    escaped = escaped.replace(/([*<\[\\_`~])/g, "\\$1");

    //Escaping that needs to be done at the start of a block.
    //NOTE: there is no "m" flag, so "^" only matches at the very start of the string.
    escaped = escaped.replace(/^(\s*?)([#+\-=>])/g, "$1\\$2");

    //Escaping that needs to happen in certain situations
    //Conditional escaping for the '&' that begins a HTML entity.
    escaped = escaped.replace(/(&.*?;)/g, "\\$1");
    //Conditional escaping for the '.' following a number that would start an ordinal list item.
    escaped = escaped.replace(/^(\s*\d+)\. /g, "$1\\. ");

    return escaped;
  },

  /**
   * Collapse every run of whitespace to a single space and trim both ends.
   *
   * @param {string} text
   * @returns {string}
   */
  normalise: function(text) {
    return text.replace(/\s+/g, " ").trim();
  }
};

/**
 * Turns a list of segments (Span objects and plain strings) into one string.
 *
 * @param {Array} segments - Segments in output order.
 */
function Stringifier(segments) {
  this.segments = segments;
}

/**
 * Group consecutive segments that share a constructor name.
 *
 * An empty segment list yields an empty result; `null`/`undefined` entries are ignored.
 *
 * @returns {Array} A list of `[typeName, segments]` pairs, in order.
 */
Stringifier.prototype.chunkUpSegments = function() {
  const chunks = [];
  let currentType = null;
  let currentChunk = null;

  for(const segment of this.segments || []) {
    if(segment == null) continue;

    const type = segment.constructor.name;
    if(currentChunk === null || type !== currentType) {
      currentType = type;
      currentChunk = [];
      chunks.push([currentType, currentChunk]);
    }

    currentChunk.push(segment);
  }

  return chunks;
}

/**
 * Join all segments into a string. Runs of Span segments are whitespace-normalised
 * as a unit; every other chunk is emitted as-is.
 *
 * @returns {string}
 */
Stringifier.prototype.stringify = function() {
  const output = [];

  for(const [type, segments] of this.chunkUpSegments()) {
    const text = segments.join("");
    output.push(type === "Span" ? Utils.normalise(text) : text);
  }

  return output.join("");
}

/**
 * A piece of inline text.
 *
 * @param {string} text - The text content.
 * @param {boolean} [useRaw] - When truthy, `text` is stored unchanged; otherwise it is Markdown-escaped.
 */
function Span(text, useRaw) {
  this.text = useRaw ? text : Utils.escape(text);
}

/** @returns {string} The stored (possibly escaped) text. */
Span.prototype.toString = function() {
  return this.text;
}

/**
 * Block built from the content of a `pre` or `textarea` element.
 *
 * @param {Array} segments
 */
function Pre(segments) {
  this.segments = segments;
}

/**
 * Split the joined segments on blank lines, normalise each part, and drop empty parts.
 *
 * @returns {string[]} A one-element array holding the parts joined by blank lines
 *   (with a trailing blank line), or an empty array when nothing is present.
 */
Pre.prototype.toArray = function() {
  const parts = this.segments.join("").split(/\n{2,}/g);

  const result = [];
  for(const part of parts) {
    const normalised = Utils.normalise(part);
    if(Utils.isPresent(normalised)) result.push(normalised);
  }

  return result.length ? [result.join("\n\n") + "\n\n"] : [];
}

/**
 * Heading block.
 *
 * @param {Array} segments
 * @param {number} level - Heading level; `process` passes 1 to 6.
 */
function Heading(segments, level) {
  this.segments = segments;
  this.level = level;
}

/** @returns {Array} `level` hash characters and a space, the segments, then a blank line. */
Heading.prototype.toArray = function() {
  //slice() with a negative start takes the last `level` characters, like the deprecated substr().
  return ["######".slice(-this.level) + " "].concat(this.segments).concat(["\n\n"]);
}

/**
 * List item block.
 *
 * @param {Array} segments
 * @param {boolean} last - Whether this is the last item; adds an extra newline.
 * @param {?number} index - 1-based position for ordered lists, or null for bullet lists.
 */
function ListItem(segments, last, index) {
  this.segments = segments;
  this.last = last;
  this.index = index;
}

/** @returns {Array} The list marker, the segments, then the line terminator. */
ListItem.prototype.toArray = function() {
  return [this.prefix()].concat(this.segments).concat(["\n" + (this.last ? "\n" : "")]);
}

/** @returns {string} `"* "` when there is no index, otherwise `"<index>. "`. */
ListItem.prototype.prefix = function() {
  if(this.index == null) return "* ";
  return `${this.index}. `;
}

/**
 * Definition-list term block.
 *
 * @param {Array} segments
 */
function DefinitionTerm(segments) {
  this.segments = segments;
}

/** @returns {Array} The segments followed by a colon and a newline. */
DefinitionTerm.prototype.toArray = function() {
  return this.segments.concat([":\n"]);
}

/**
 * Definition-list description block.
 *
 * @param {Array} segments
 * @param {boolean} last - Whether this is the last description; adds an extra newline.
 */
function DefinitionDescription(segments, last) {
  this.segments = segments;
  this.last = last;
}

/** @returns {Array} The segments followed by the line terminator. */
DefinitionDescription.prototype.toArray = function() {
  return this.segments.concat(["\n" + (this.last ? "\n" : "")]);
}

/**
 * Plain paragraph block.
 *
 * @param {Array} segments
 */
function Paragraph(segments) {
  this.segments = segments;
}

/** @returns {Array} The segments followed by a blank line. */
Paragraph.prototype.toArray = function() {
  return this.segments.concat(["\n\n"]);
}

/**
 * Accumulates output text block by block.
 *
 * @param {Extractor} extractor - Consulted via `isInBlockquote()` when a block is finished.
 */
function Document(extractor) {
  this.extractor = extractor;
  this.content = "";

  this.start();
}

/** @returns {string} All text emitted so far. */
Document.prototype.getContent = function() {
  return this.content;
}

/**
 * Append a segment to the block currently being built.
 *
 * @param {Span|string} segment
 */
Document.prototype.push = function(segment) {
  this.segments.push(segment);
}

/**
 * Finish the current block and start a new one.
 *
 * @param {...*} args - The block constructor followed by any extra constructor arguments.
 */
Document.prototype.break = function(...args) {
  this.finish();
  this.start(args);
}

/** Emit the current block if it has any present Span text, prefixed with `"> "` inside a blockquote. */
Document.prototype.finish = function() {
  if(!this.isPresent()) return;

  if(this.extractor.isInBlockquote()) this.content += "> ";
  this.content += this.blockContent();
}

/**
 * Reset the pending segments and remember the arguments for the next block.
 *
 * @param {Array} [args] - `[blockType, ...extraConstructorArgs]`.
 */
Document.prototype.start = function(args) {
  this.segments = [];
  this.args = args || [];
}

/** @returns {boolean} True when at least one pending Span has non-whitespace text. */
Document.prototype.isPresent = function() {
  return this.segments.some((segment) => segment instanceof Span && Utils.isPresent(segment.toString()));
}

/**
 * Build the pending block and render it to a string.
 * Falls back to Paragraph when no block type was given, and leaves `this.args` untouched.
 *
 * @returns {string}
 */
Document.prototype.blockContent = function() {
  const [blockType = Paragraph, ...blockArgs] = this.args;
  const block = new blockType(this.segments, ...blockArgs);

  return (new Stringifier(block.toArray())).stringify();
}

/**
 * Walks DOM nodes and converts them to a subset of Markdown.
 *
 * @param {*} input - A node, document, window, or an array of those.
 * @param {Object} [options]
 * @param {boolean} [options.images=true] - Emit Markdown images for `img` elements.
 * @param {boolean} [options.links=true] - Emit Markdown links for `a` elements.
 * @param {boolean} [options.excludeHidden=true] - Skip elements judged not visible.
 * @param {string[]} [options.exclude] - Selectors; matching elements are skipped.
 */
function Extractor(input, options) {
  //Tag tables are set first so they exist even when there are no nodes.
  this.HEADING_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6"];
  this.BLOCK_INITIATING_TAGS = ["address", "article", "aside", "body", "blockquote", "div", "dd", "dl", "dt", "figure",
    "footer", "header", "li", "main", "nav", "ol", "p", "pre", "section", "td", "th", "ul"];
  this.ENHANCERS = { b: "**", strong: "**", i: "*", em: "*" };
  this.SKIP_TAGS = ["head", "style", "script", "noscript"];
  this.BREAK_TAGS_QUERY = (this.HEADING_TAGS.concat(this.BLOCK_INITIATING_TAGS)).join(", ");

  this.nodes = this.arrayify(input).map(this.convertNode);
  this.options = Object.assign({ images: true, links: true, excludeHidden: true }, options);

  if(!this.nodes.length) return;

  this.textProperty = ("innerText" in this.nodes[0] ? "innerText" : "textContent");
  this.domDocument = this.nodes[0].ownerDocument;
  //A document reporting itself as hidden is treated as having no usable layout information.
  this.isDomReal = !this.domDocument.hidden;

  this.pageBounds = this.getPageBounds();
}

/**
 * Compute the rectangle used by `isElementVisible`.
 *
 * @returns {?{top: number, right: number, bottom: number, left: number}} Null when the
 *   DOM is not real or `excludeHidden` is off.
 */
Extractor.prototype.getPageBounds = function() {
  if(!this.isDomReal || !this.options.excludeHidden) return null;

  let tallestHeight = 0;
  for(const element of this.domDocument.documentElement.querySelectorAll("*")) {
    const elementHeight = element.scrollHeight;
    if(elementHeight > tallestHeight) tallestHeight = elementHeight;
  }

  return {
    top: 0,
    right: this.domDocument.documentElement.scrollWidth,
    bottom: tallestHeight,
    left: 0
  };
}

//NOTE(review): no `blocks` instance property is ever assigned, so this returns the method
//itself. The intended result is unclear; behaviour is kept as-is for compatibility.
Extractor.prototype.blocks = function() {
  return this.blocks;
}

/**
 * Process every input node and return the accumulated Markdown.
 *
 * @returns {string} The trimmed output text.
 */
Extractor.prototype.extract = function() {
  this.justAppendedBr = false;
  this.inBlockquote = false;
  this.groupWithNext = false;

  this.document = new Document(this);

  for(const node of this.nodes) {
    this.document.break(Paragraph);
    this.process(node);
    this.document.break(Paragraph);
  }

  return this.document.getContent().trim();
}

/**
 * @param {*} input
 * @returns {Array} `input` itself when it is an array, otherwise a one-element array.
 */
Extractor.prototype.arrayify = function(input) {
  return Array.isArray(input) ? input : [input];
}

/**
 * Map a document or window to its root element; other inputs pass through unchanged.
 *
 * @param {*} input
 * @returns {*}
 */
Extractor.prototype.convertNode = function(input) {
  const type = input.constructor.name;
  if(type === "Document" || type === "HTMLDocument") return input.documentElement;
  if(type === "Window") return input.document.documentElement;
  return input;
}

/**
 * Convert one node (and, recursively, its children) into document segments and blocks.
 *
 * @param {Node} node
 */
Extractor.prototype.process = function(node) {
  const nodeName = node.nodeName.toLowerCase();

  if(this.SKIP_TAGS.includes(nodeName)) return;

  //Only element nodes implement matches(); text and comment nodes must not be tested.
  if(this.options.exclude && node.nodeType === 1) {
    for(const selector of this.options.exclude) {
      if(node.matches(selector)) return;
    }
  }

  if(this.options.excludeHidden && !this.isElementVisible(node)) return;

  //Handle repeated brs by making a paragraph break
  if(nodeName === "br") {
    if(this.justAppendedBr) {
      this.justAppendedBr = false;

      this.document.break(Paragraph);

      return;
    }

    this.justAppendedBr = true;
  }
  else if(this.justAppendedBr) {
    //A single br followed by other content becomes a line break.
    this.justAppendedBr = false;

    this.document.push("\n");
  }

  if(node.nodeType === 3) {
    this.document.push(new Span(node.textContent));

    return;
  }

  if(this.ENHANCERS[nodeName]) {
    if(!Utils.isPresent(node[this.textProperty])) return;

    const enhancer = new Span(this.ENHANCERS[nodeName], true);

    this.document.push(enhancer);
    this.processChildren(node);
    this.document.push(enhancer);

    return;
  }

  if(this.options.images && nodeName === "img") {
    this.document.push(new Span("![" + Utils.escape(node.alt) + "](" + node.src + ")", true));
    return;
  }

  if(this.options.links && nodeName === "a") {
    if(!Utils.isPresent(node[this.textProperty])) return;

    //A link that wraps block-level content is flattened to its content.
    if(node.querySelectorAll(this.BREAK_TAGS_QUERY).length) {
      this.processChildren(node);
      return;
    }

    this.document.push(new Span("[", true));
    this.processChildren(node);
    this.document.push(new Span("](" + node.href + ")", true));

    return;
  }

  if(nodeName === "blockquote") {
    //Flush pending content before raising the flag, so text that precedes the quote is not prefixed.
    this.document.break(Paragraph);

    //Restore the previous state afterwards so content following a nested blockquote stays quoted.
    const wasInBlockquote = this.inBlockquote;
    this.inBlockquote = true;

    this.processFlowContent(node);

    this.inBlockquote = wasInBlockquote;

    return;
  }

  if(nodeName === "li") {
    let index = null;
    if(node.parentNode.nodeName.toLowerCase() === "ol") {
      //1-based position, counted from the preceding element siblings.
      index = 1;
      let sibling = node;
      while((sibling = sibling.previousElementSibling)) index++;
    }

    this.document.break(ListItem, node.nextElementSibling == null, index);
    this.processFlowContent(node);

    return;
  }

  if(nodeName === "dt") {
    this.document.break(DefinitionTerm);
    this.processFlowContent(node);
    return;
  }

  if(nodeName === "dd") {
    this.document.break(DefinitionDescription, node.nextElementSibling == null);
    this.processFlowContent(node);
    return;
  }

  if(nodeName === "pre") {
    this.document.break(Pre);
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  if(nodeName === "textarea") {
    this.document.break(Pre);
    this.document.push(new Span(node.value));
    this.document.break(Paragraph);

    return;
  }

  //These tags terminate the current paragraph, if present, and start a new paragraph
  if(this.BLOCK_INITIATING_TAGS.includes(nodeName)) {
    if(this.groupWithNext) this.groupWithNext = false;
    else this.document.break(Paragraph);
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  if(this.HEADING_TAGS.includes(nodeName)) {
    this.document.break(Heading, Number.parseInt(nodeName[1], 10));
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  //Pretend that the children of this node were siblings of this node (move them one level up the tree)
  this.processChildren(node);
}

/**
 * Process children so that the first block-initiating child continues the block that
 * was just started instead of breaking it, then end with a paragraph break.
 *
 * @param {Node} node
 */
Extractor.prototype.processFlowContent = function(node) {
  this.groupWithNext = true;
  this.processChildren(node);
  this.groupWithNext = false;
  this.document.break(Paragraph);
}

/**
 * Process each child node in order.
 *
 * @param {Node} node
 */
Extractor.prototype.processChildren = function(node) {
  for(const child of node.childNodes) this.process(child);
}

/**
 * Decide whether a node should be treated as visible.
 *
 * @param {Node} node
 * @returns {boolean} False only for elements that generate no boxes or lie entirely
 *   outside the page bounds.
 */
Extractor.prototype.isElementVisible = function(node) {
  //It's only possible to determine if an element is visible if we have access to a real browser layout engine.
  if(!this.isDomReal) return true;

  //Only elements can be hidden/visible; the concept doesn't make sense for other node types
  if(node.nodeType !== 1) return true;

  //If an element doesn't have a width or a height and doesn't generate any boxes, then it's definitely hidden
  if(!node.offsetWidth && !node.offsetHeight && !node.getClientRects().length) return false;

  //Page bounds are null when excludeHidden is off; nothing can be judged off-page then.
  if(!this.pageBounds) return true;

  const nodeBounds = node.getBoundingClientRect();

  return (nodeBounds.left < this.pageBounds.right && nodeBounds.right > this.pageBounds.left &&
    nodeBounds.top < this.pageBounds.bottom && nodeBounds.bottom > this.pageBounds.top);
}

/** @returns {boolean} Whether a blockquote is currently being processed. */
Extractor.prototype.isInBlockquote = function() {
  return this.inBlockquote;
}

export {
  Utils,
  Stringifier,
  Span,
  Pre,
  Heading,
  ListItem,
  DefinitionTerm,
  DefinitionDescription,
  Paragraph,
  Document,
  Extractor
}

/**
 * Extract the textual content of `input` as a subset of Markdown.
 *
 * @param {*} input - A node, document, window, or an array of those.
 * @param {Object} [options] - See Extractor.
 * @returns {string}
 */
export default function deba(input, options) {
  return (new Extractor(input, options)).extract();
}