├── .gitignore
├── cli.js
├── package.json
├── LICENSE.txt
└── deba.js


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | package-lock.json
3 | yarn.lock
4 | *.html
5 | *.txt


--------------------------------------------------------------------------------
/cli.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | import { default as deba, Utils } from "./deba.js";
 4 | 
 5 | import { JSDOM } from "jsdom";
 6 | 
 7 | function serialize(jsdom) {
 8 |   process.stdout.write(deba(jsdom.window.document) + "\n");
 9 | }
10 | 
//The first command line argument is either a URL (http:, https: or file:) or a path to a local file.
const arg = process.argv[2];

//Without an argument there is nothing to load; explain the usage instead of crashing on undefined.startsWith().
if(!arg) {
  process.stderr.write("Usage: deba <url-or-file>\n");
  process.exit(1);
}

const isUrl = ["http:", "https:", "file:"].some((scheme) => arg.startsWith(scheme));
const loading = isUrl ? JSDOM.fromURL(arg) : JSDOM.fromFile(arg);

//Report load/conversion failures on stderr with a non-zero exit code rather than an unhandled rejection.
loading.then(serialize).catch((error) => {
  process.stderr.write(((error && error.message) || String(error)) + "\n");
  process.exitCode = 1;
});
19 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "deba",
 3 |   "version": "0.17.0",
 4 |   "description": "Deba takes a HTML document or fragment and extracts the textual content into a subset of Markdown.",
 5 |   "type": "module",
 6 |   "main": "deba.js",
 7 |   "scripts": {
 8 |     "test": "echo \"Error: no test specified\" && exit 1"
 9 |   },
10 |   "repository": "github:bloopletech/deba.js",
11 |   "author": "Brenton \"B-Train\" Fletcher",
12 |   "license": "MIT",
13 |   "bugs": "https://github.com/bloopletech/deba.js/issues",
14 |   "homepage": "https://github.com/bloopletech/deba.js",
15 |   "dependencies": {},
16 |   "peerDependencies": {
17 |     "jsdom": "^12.0.0"
18 |   },
19 |   "bin": {
20 |     "deba": "cli.js"
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017-2018 Brenton Fletcher (http://bloople.net i@bloople.net)
 2 | 
 3 | Permission is hereby granted, free of charge, to any person
 4 | obtaining a copy of this software and associated documentation
 5 | files (the "Software"), to deal in the Software without
 6 | restriction, including without limitation the rights to use,
 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the
 9 | Software is furnished to do so, subject to the following
10 | conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/deba.js:
--------------------------------------------------------------------------------
  1 | "use babel";
  2 | "use strict";
  3 | 
  4 | const Utils = {
  5 |   isPresent: function(text) {
  6 |     return text != "" && text.search(/^\s*$/) == -1;
  7 |   },
  8 |   escape: function(text) {
  9 |     /*
 10 |     From the Commonmark spec, version 0.29:
 11 |     An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F), :, ;, <, =, >, ?, @ (U+003A–0040), [, \, ], ^, _, ` (U+005B–0060), {, |, }, or ~ (U+007B–007E).
 12 | 
 13 |     Breaking this up into characters that need to be escaped all the time:
 14 |     * because it can mean emphasis and also can mean a list marker
 15 |     < because it can mean a HTML open tag or HTML close tag
 16 |     [ because it can mean a link title or image title
 17 |     \ because it can escape the following character
 18 |     _ because it can mean emphasis
 19 |     ` because it can mean a code fence
 20 |     ~ because it can mean a code fence
 21 | 
 22 |     Characters that need to be escaped if they are the start of a block (with optional leading whitespace):
 23 |     # because it can be a heading start
 24 |     + because it can be a list marker
 25 |     - because it can be a list marker
 26 |     = because it can be a heading start
 27 |     > because it can be a blockquote marker
 28 | 
 29 |     Characters that need to be escaped in certain situations:
 30 |     & when it is followed by some characters and then a semicolon
 31 |     . when it is preceded by a number, because then it can be a list item
 32 | 
 33 |     Characters that can be ignored:
 34 |     ! because it only has meaning before a '[' or after a '<', and both of those will be escaped
 35 |     " because it only has meaning in HTML attributes and link titles, both of which will be escaped
 36 |     $ because it has no special meaning
 37 |     % because it has no special meaning
 38 |     ' because it only has meaning in HTML attributes and link titles, both of which will be escaped
 39 |     ( because it only has meaning in links and images, both of which will be escaped
 40 |     ) because it only has meaning in links and images, both of which will be escaped
 41 |     , because it has no special meaning
 42 |     / because it only has meaning in a HTML close tag, which will be escaped
 43 |     : because it only has meaning in links and HTML tags, both of which will be escaped
 44 |     ; because it only has meaning in HTML entities, which will be escaped
 45 |     ? because it has no special meaning
 46 |     @ because it has no special meaning
 47 |     ] because it only has meaning in links and images, both of which will be escaped
 48 |     ^ because it has no special meaning
 49 |     { because it has no special meaning
 50 |     | because it has no special meaning
 51 |     } because it has no special meaning
 52 |     */
 53 | 
 54 |     //Escaping that needs to be done all the time.
 55 |     text = text.replace(/([*<\[\\_`~])/g, '\\$1');
 56 | 
 57 |     //Escaping that needs to be done at the start of a block.
 58 |     text = text.replace(/^(\s*?)([#+\-=>])/g, '$1\\$2');
 59 | 
 60 |     //Escaping that needs to happen in certain situations
 61 |     //Conditional escaping for the '&' that begins a HTML entity.
 62 |     text = text.replace(/(&.*?;)/g, '\\$1');
 63 |     //Conditional escaping for the '.' following a number that would start an ordinal list item.
 64 |     text = text.replace(/^(\s*\d+)\. /g, '$1\\. ');
 65 | 
 66 |     return text;
 67 |   },
 68 |   normalise: function(text) {
 69 |     return text.replace(/\s+/g, " ").trim();
 70 |   }
 71 | };
 72 | 
 73 | function Stringifier(segments) {
 74 |   this.segments = segments;
 75 | }
 76 | 
 77 | Stringifier.prototype.chunkUpSegments = function() {
 78 |   const chunks = [];
 79 |   let lastType = null;
 80 |   let currentChunk = [];
 81 | 
 82 |   for(const segment of this.segments.concat(null)) {
 83 |     if(lastType == null || segment == null || segment.constructor.name != lastType) {
 84 |       if(currentChunk.length) {
 85 |         chunks.push([lastType, currentChunk]);
 86 |         currentChunk = [];
 87 | 
 88 |         if(segment == null) break;
 89 |       }
 90 | 
 91 |       lastType = segment.constructor.name;
 92 |     }
 93 | 
 94 |     currentChunk.push(segment);
 95 |   }
 96 | 
 97 |   return chunks;
 98 | }
 99 | 
100 | Stringifier.prototype.stringify = function() {
101 |   const chunks = this.chunkUpSegments();
102 |   const output = [];
103 | 
104 |   for(const chunk of chunks) {
105 |     const type = chunk[0];
106 |     const text = chunk[1].join("");
107 | 
108 |     if(type == "Span") output.push(Utils.normalise(text));
109 |     else output.push(text);
110 |   }
111 | 
112 |   return output.join("");
113 | }
114 | 
/**
 * A piece of inline text.
 *
 * @param text   The text of the span.
 * @param useRaw When truthy the text is stored untouched; otherwise it is passed through Utils.escape first.
 */
function Span(text, useRaw) {
  if(useRaw) this.text = text;
  else this.text = Utils.escape(text);
}

//Spans are stringified implicitly when segment lists are joined.
Span.prototype.toString = function() {
  const { text } = this;
  return text;
};
122 | 
/**
 * Block for preformatted content.
 *
 * @param segments Array of segments collected for the block.
 */
function Pre(segments) {
  this.segments = segments;
}

/**
 * Joins the segments, splits the text wherever two or more newlines occur, whitespace-normalises each piece
 * and drops the pieces that end up blank.
 *
 * @returns A one-element array holding the pieces separated (and terminated) by a blank line, or an empty
 *          array when nothing is left.
 */
Pre.prototype.toArray = function() {
  const paragraphs = this.segments.join("").split(/\n{2,}/g)
    .map((piece) => Utils.normalise(piece))
    .filter((piece) => Utils.isPresent(piece));

  if(!paragraphs.length) return [];

  return [paragraphs.join("\n\n") + "\n\n"];
};
138 | 
/**
 * Heading block.
 *
 * @param segments Array of segments making up the heading text.
 * @param level    Heading level; the extractor passes the digit taken from the h1..h6 tag name.
 */
function Heading(segments, level) {
  this.segments = segments;
  this.level = level;
}

/**
 * @returns The segments preceded by `level` hash characters plus a space, and followed by a blank line.
 */
Heading.prototype.toArray = function() {
  //Take the marker from the tail of a six-hash string, so it can never be longer than six characters.
  const marker = "######".slice(-this.level) + " ";
  return [marker, ...this.segments, "\n\n"];
};
147 | 
/**
 * List item block.
 *
 * @param segments Array of segments making up the item text.
 * @param last     Truthy when an extra newline should follow the item (the extractor passes true for the
 *                 final item of a list).
 * @param index    Number used as the ordered-list marker, or null/undefined for a bullet.
 */
function ListItem(segments, last, index) {
  this.segments = segments;
  this.last = last;
  this.index = index;
}

/**
 * @returns The marker, the segments, then a newline (two newlines when `last` is truthy).
 */
ListItem.prototype.toArray = function() {
  const terminator = this.last ? "\n\n" : "\n";
  return [this.prefix(), ...this.segments, terminator];
};

/**
 * @returns "* " when there is no index, otherwise the index followed by ". ".
 */
ListItem.prototype.prefix = function() {
  return this.index == null ? "* " : `${this.index}. `;
};
162 | 
/**
 * Definition list term block.
 *
 * @param segments Array of segments making up the term.
 */
function DefinitionTerm(segments) {
  this.segments = segments;
}

/**
 * @returns A new array with the segments followed by a colon and a newline.
 */
DefinitionTerm.prototype.toArray = function() {
  return [...this.segments, ":\n"];
};
170 | 
/**
 * Definition list description block.
 *
 * @param segments Array of segments making up the description.
 * @param last     Truthy when an extra newline should follow the description.
 */
function DefinitionDescription(segments, last) {
  this.segments = segments;
  this.last = last;
}

/**
 * @returns A new array with the segments followed by one newline, or two when `last` is truthy.
 */
DefinitionDescription.prototype.toArray = function() {
  const terminator = this.last ? "\n\n" : "\n";
  return [...this.segments, terminator];
};
179 | 
/**
 * Plain paragraph block.
 *
 * @param segments Array of segments making up the paragraph.
 */
function Paragraph(segments) {
  this.segments = segments;
}

/**
 * @returns A new array with the segments followed by a blank line.
 */
Paragraph.prototype.toArray = function() {
  return [...this.segments, "\n\n"];
};
187 | 
/**
 * Accumulates the output text one block at a time.
 *
 * Segments are pushed onto the current block; break() renders the current block into `content` and opens the
 * next one.
 *
 * @param extractor Object providing isInBlockquote(), consulted whenever a block is rendered.
 */
function Document(extractor) {
  this.extractor = extractor;
  this.content = "";

  this.start();
}

/**
 * @returns The text of all blocks rendered so far.
 */
Document.prototype.getContent = function() {
  return this.content;
};

/**
 * Appends a segment to the block currently being collected.
 */
Document.prototype.push = function(segment) {
  this.segments.push(segment);
};

/**
 * Renders the current block and starts a new one.
 *
 * @param args The constructor of the next block, followed by any extra arguments that constructor takes after
 *             the segment list.
 */
Document.prototype.break = function(...args) {
  this.finish();
  this.start(args);
};

/**
 * Appends the rendered current block to the content. Blocks without any non-blank Span are dropped.
 */
Document.prototype.finish = function() {
  if(!this.isPresent()) return;

  if(this.extractor.isInBlockquote()) this.content += "> ";
  this.content += this.blockContent();
};

/**
 * Begins collecting a new block.
 *
 * @param args Optional array: block constructor followed by its extra constructor arguments.
 */
Document.prototype.start = function(args) {
  this.segments = [];
  this.args = args || [];
};

/**
 * @returns true when at least one pushed segment is a Span whose text is not blank.
 */
Document.prototype.isPresent = function() {
  return this.segments.some((segment) => segment instanceof Span && Utils.isPresent(segment.toString()));
};

/**
 * Builds the current block and stringifies it.
 *
 * The block is constructed as new BlockType(segments, ...extraArgs). `this.args` is left untouched, so the
 * method gives the same result when called repeatedly.
 */
Document.prototype.blockContent = function() {
  const [blockType, ...blockArgs] = this.args;
  const block = new blockType(this.segments, ...blockArgs);

  return (new Stringifier(block.toArray())).stringify();
};
236 | 
/**
 * Walks DOM nodes and converts their textual content into Markdown-like text.
 *
 * @param input   A node, document or window, or an array of them.
 * @param options Optional overrides merged over the defaults { images: true, links: true, excludeHidden: true }.
 */
function Extractor(input, options) {
  const defaults = { images: true, links: true, excludeHidden: true };

  this.nodes = this.arrayify(input).map((node) => this.convertNode(node));
  this.options = Object.assign(defaults, options);

  //With nothing to extract, the DOM-dependent setup below is skipped.
  if(this.nodes.length == 0) return;

  const firstNode = this.nodes[0];

  if("innerText" in firstNode) this.textProperty = "innerText";
  else this.textProperty = "textContent";

  this.domDocument = firstNode.ownerDocument;
  //A document that does not report itself as hidden is treated as a real DOM.
  this.isDomReal = !this.domDocument.hidden;

  this.pageBounds = this.getPageBounds();

  this.HEADING_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6"];
  this.BLOCK_INITIATING_TAGS = ["address", "article", "aside", "body", "blockquote", "div", "dd", "dl", "dt", "figure",
    "footer", "header", "li", "main", "nav", "ol", "p", "pre", "section", "td", "th", "ul"];
  this.ENHANCERS = { b: "**", strong: "**", i: "*", em: "*" };
  this.SKIP_TAGS = ["head", "style", "script", "noscript"];
  //Selector matching any heading or block-initiating element.
  this.BREAK_TAGS_QUERY = [...this.HEADING_TAGS, ...this.BLOCK_INITIATING_TAGS].join(", ");
}
256 | 
/**
 * Computes the rectangle used by the visibility check.
 *
 * @returns null when the DOM is not real or hidden elements are not being excluded; otherwise an object
 *          { top, right, bottom, left } where right is the root element's scrollWidth and bottom is the largest
 *          scrollHeight found among the root's descendants.
 */
Extractor.prototype.getPageBounds = function() {
  if(!(this.isDomReal && this.options.excludeHidden)) return null;

  const root = this.domDocument.documentElement;

  let bottom = 0;
  for(const candidate of root.querySelectorAll("*")) {
    const height = candidate.scrollHeight;
    if(height > bottom) bottom = height;
  }

  return {
    top: 0,
    right: root.scrollWidth,
    bottom: bottom,
    left: 0
  };
};
273 | 
//Returns `this.blocks`. Nothing visible in this file assigns an instance property of that name, so the lookup
//resolves to this very method on the prototype and the call returns the function itself.
//NOTE(review): looks like a leftover accessor — confirm whether anything relies on it.
Extractor.prototype.blocks = function() {
  return this.blocks;
}
277 | 
/**
 * Runs the extraction over every input node.
 *
 * @returns The accumulated document content with leading and trailing whitespace trimmed.
 */
Extractor.prototype.extract = function() {
  //Reset the traversal state so that each call starts from scratch.
  this.groupWithNext = false;
  this.inBlockquote = false;
  this.justAppendedBr = false;

  this.document = new Document(this);

  this.nodes.forEach((node) => {
    //Each top-level node starts and ends on a paragraph boundary.
    this.document.break(Paragraph);
    this.process(node);
    this.document.break(Paragraph);
  });

  const content = this.document.getContent();
  return content.trim();
};
293 | 
/**
 * @returns `input` itself when it is an array, otherwise a one-element array containing it.
 */
Extractor.prototype.arrayify = function(input) {
  return Array.isArray(input) ? input : [input];
};
298 | 
/**
 * Maps documents and windows to their root element, based on the input's constructor name.
 *
 * @returns The documentElement for a Document/HTMLDocument or a Window; any other input unchanged.
 */
Extractor.prototype.convertNode = function(input) {
  switch(input.constructor.name) {
    case "Document":
    case "HTMLDocument":
      return input.documentElement;
    case "Window":
      return input.document.documentElement;
    default:
      return input;
  }
};
305 | 
/**
 * Processes one DOM node and, recursively, its descendants: inline content is pushed onto the current document
 * block, and block-level elements break the document into new blocks.
 *
 * @param node The DOM node to process (element, text node, or any other node type).
 */
Extractor.prototype.process = function(node) {
  const nodeName = node.nodeName.toLowerCase();

  if(this.SKIP_TAGS.includes(nodeName)) return;

  //Selectors can only match elements; text and comment nodes have no matches() method.
  if(this.options.exclude && node.nodeType == 1) {
    for(const selector of this.options.exclude) {
      if(node.matches(selector)) return;
    }
  }

  if(this.options.excludeHidden && !this.isElementVisible(node)) return;

  //Handle repeated brs by making a paragraph break
  if(nodeName == "br") {
    if(this.justAppendedBr) {
      this.justAppendedBr = false;

      this.document.break(Paragraph);

      return;
    }
    else {
      this.justAppendedBr = true;
    }
  }
  else if(this.justAppendedBr) {
    //A single br followed by other content becomes a line break.
    this.justAppendedBr = false;

    this.document.push("\n");
  }

  //Text nodes
  if(node.nodeType == 3) {
    this.document.push(new Span(node.textContent));

    return;
  }

  if(this.ENHANCERS[nodeName]) {
    if(!Utils.isPresent(node[this.textProperty])) return;

    const enhancer = new Span(this.ENHANCERS[nodeName], true);

    this.document.push(enhancer);
    this.processChildren(node);
    this.document.push(enhancer);

    return;
  }

  if(this.options.images && nodeName == "img") {
    this.document.push(new Span("![" + Utils.escape(node.alt) + "](" + node.src + ")", true));
    return;
  }

  if(this.options.links && nodeName == "a") {
    if(!Utils.isPresent(node[this.textProperty])) return;

    //A link wrapping block-level content can't be written as an inline link, so only its content is kept.
    if(node.querySelectorAll(this.BREAK_TAGS_QUERY).length) {
      this.processChildren(node);
      return;
    }

    this.document.push(new Span("[", true));
    this.processChildren(node);
    this.document.push(new Span("](" + node.href + ")", true));

    return;
  }

  if(nodeName == "blockquote") {
    //Remember the outer state so that a nested blockquote doesn't end the enclosing one early.
    const wasInBlockquote = this.inBlockquote;
    this.inBlockquote = true;

    this.document.break(Paragraph);
    this.processFlowContent(node);

    this.inBlockquote = wasInBlockquote;

    return;
  }

  if(nodeName == "li") {
    let index = null;
    if(node.parentNode.nodeName.toLowerCase() == "ol") {
      //The item number is one more than the count of preceding element siblings.
      index = 1;
      let sibling = node;
      while((sibling = sibling.previousElementSibling)) index++;
    }

    this.document.break(ListItem, node.nextElementSibling == null, index);
    this.processFlowContent(node);

    return;
  }

  if(nodeName == "dt") {
    this.document.break(DefinitionTerm);
    this.processFlowContent(node);
    return;
  }

  if(nodeName == "dd") {
    this.document.break(DefinitionDescription, node.nextElementSibling == null);
    this.processFlowContent(node);
    return;
  }

  if(nodeName == "pre") {
    this.document.break(Pre);
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  if(nodeName == "textarea") {
    this.document.break(Pre);
    this.document.push(new Span(node.value));
    this.document.break(Paragraph);

    return;
  }

  //These tags terminate the current paragraph, if present, and start a new paragraph
  if(this.BLOCK_INITIATING_TAGS.includes(nodeName)) {
    //The first block inside flow content (see processFlowContent) continues the block that was just started.
    if(this.groupWithNext) this.groupWithNext = false;
    else this.document.break(Paragraph);
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  if(this.HEADING_TAGS.includes(nodeName)) {
    //The second character of the tag name (h1..h6) is the heading level.
    this.document.break(Heading, parseInt(nodeName[1], 10));
    this.processChildren(node);
    this.document.break(Paragraph);

    return;
  }

  //Pretend that the children of this node were siblings of this node (move them one level up the tree)
  this.processChildren(node);
};
450 | 
//Processes the children of a container whose block has just been started by the caller (blockquote, li, dt, dd).
//groupWithNext is raised first so that the first block-initiating child met by process() does not break into a
//fresh paragraph; it is lowered again afterwards in case no such child consumed it. The container's content is
//then closed off with a paragraph break.
Extractor.prototype.processFlowContent = function(node) {
  this.groupWithNext = true;
  this.processChildren(node);
  this.groupWithNext = false;
  this.document.break(Paragraph);
}
457 | 
/**
 * Runs process() on every child node of `node`, in document order.
 */
Extractor.prototype.processChildren = function(node) {
  const children = node.childNodes;

  for(const childNode of children) {
    this.process(childNode);
  }
};
461 | 
/**
 * Decides whether `node` should be treated as visible.
 *
 * @param node The DOM node to check.
 * @returns true for non-element nodes, when the DOM is not real, or when no page bounds were computed;
 *          false for elements that have no size and generate no boxes; otherwise whether the element's
 *          bounding rectangle overlaps the page bounds.
 */
Extractor.prototype.isElementVisible = function(node) {
  //It's only possible to determine if an element is visible if we have access to a real browser layout engine.
  if(!this.isDomReal) return true;

  //Only elements can be hidden/visible; the concept doesn't make sense for other node types
  if(node.nodeType != 1) return true;

  //If an element doesn't have a width or a height and doesn't generate any boxes, then it's definitely hidden
  if(!node.offsetWidth && !node.offsetHeight && !node.getClientRects().length) return false;

  //Page bounds are only computed when hidden elements are being excluded; without them nothing can be off-page.
  if(!this.pageBounds) return true;

  //NOTE(review): getBoundingClientRect() is relative to the viewport while pageBounds is built from scroll
  //dimensions — confirm this is the intended comparison when the page has been scrolled.
  const nodeBounds = node.getBoundingClientRect();

  return (nodeBounds.left < this.pageBounds.right && nodeBounds.right > this.pageBounds.left &&
    nodeBounds.top < this.pageBounds.bottom && nodeBounds.bottom > this.pageBounds.top);
};
480 | 
//Reports whether the traversal is currently inside a blockquote element. Document.finish() calls this to decide
//whether a rendered block gets the "> " prefix.
Extractor.prototype.isInBlockquote = function() {
  return this.inBlockquote;
}
484 | 
485 | export {
486 |   Utils,
487 |   Stringifier,
488 |   Span,
489 |   Pre,
490 |   Heading,
491 |   ListItem,
492 |   DefinitionTerm,
493 |   DefinitionDescription,
494 |   Paragraph,
495 |   Document,
496 |   Extractor
497 | }
498 | 
/**
 * Extracts the textual content of `input`.
 *
 * @param input   A node, document or window, or an array of them.
 * @param options Optional settings passed to Extractor. The visible code reads `images`, `links` and
 *                `excludeHidden` (all default to true) and `exclude` (a list of selectors to skip).
 * @returns The extracted text.
 */
export default function(input, options) {
  const extractor = new Extractor(input, options);
  return extractor.extract();
}
502 | 


--------------------------------------------------------------------------------