├── README.md
├── .gitignore
├── lib
    └── functions
    │   ├── index.js
    │   ├── Objects.js
    │   └── Strings.js
├── package.json
├── .editorconfig
├── languages.js
└── index.js


/README.md:
--------------------------------------------------------------------------------
1 | # markdown-parser
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | *.log
3 | *.md
4 | !README.md
5 | package-lock.json
6 | *.txt
7 | *.json
8 | 


--------------------------------------------------------------------------------
/lib/functions/index.js:
--------------------------------------------------------------------------------
1 | const Objects = require("./Objects");
2 | const Strings = require("./Strings");
3 | 
4 | module.exports = {
5 |   Objects,
6 |   Strings,
7 | };
8 | 


--------------------------------------------------------------------------------
/lib/functions/Objects.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * To string
 3 |  * @param {any} o - the object to get it text representation
 4 |  * @returns {string} the `o` as string
 5 |  */
 6 | function toString(o) {
 7 |   // null or undefined
 8 |   if (o === null || o === void 0) return o;
 9 |   // is string
10 |   if (typeof o === "string") return o;
11 |   // has a toString function in their prototype
12 |   if (typeof o.toString === "function") return o.toString();
13 |   // as string in the latest intent
14 |   return String(o);
15 | }
16 | 
17 | module.exports = {
18 |   toString,
19 | };
20 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "free-programming-books-markdown-parser",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "main": "index.js",
 6 |   "repository": {
 7 |     "type": "git",
 8 |     "url": "git+https://github.com/EbookFoundation/free-programming-books-parser.git"
 9 |   },
10 |   "scripts": {
11 |     "start": "node index.js",
12 |     "test": "echo \"Error: no test specified\" && exit 1"
13 |   },
14 |   "bin": {
15 |     "fpb-parse": "./index.js"
16 |   },
17 |   "author": "Nick Quidas",
18 |   "license": "ISC",
19 |   "dependencies": {
20 |     "command-line-args": "^5.2.0",
21 |     "remark": "^13.0.0"
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig helps developers define and maintain consistent
 2 | # coding styles between different editors and IDEs
 3 | # editorconfig.org
 4 | 
 5 | ; top-most EditorConfig file
 6 | root = true
 7 | 
 8 | ; define basic and global for any file
 9 | [*]
10 | charset = utf-8
11 | end_of_line = lf
12 | indent_size = 4
13 | indent_style = space
14 | insert_final_newline = true
15 | max_line_length = off
16 | trim_trailing_whitespace = true
17 | curly_bracket_next_line = false
18 | spaces_around_operators = true
19 | 
20 | ; DOS/Windows batch scripts -
21 | [*.{bat,cmd}]
22 | end_of_line = crlf
23 | 
24 | ; JavaScript files -
25 | [*.{js,ts}]
26 | curly_bracket_next_line = true
27 | indent_size = 2
28 | quote_type = double
29 | 
30 | ; JSON files (normal and commented version) -
31 | [*.{json,jsonc}]
32 | indent_size = 2
33 | quote_type = double
34 | 
35 | ; Make - match its own default syntax
36 | [Makefile]
37 | indent_style = tab
38 | 
39 | ; Markdown files - preserve trailing spaces, which mean a line break
40 | [*.{md,markdown}]
41 | trim_trailing_whitespace = false
42 | 
43 | ; PowerShell - match defaults for New-ModuleManifest and PSScriptAnalyzer Invoke-Formatter
44 | [*.{ps1,psd1,psm1}]
45 | charset = utf-8-bom
46 | end_of_line = crlf
47 | 
48 | ; YAML config files - match their own default syntax
49 | [*.{yaml,yml}]
50 | indent_size = 2
51 | 


--------------------------------------------------------------------------------
/lib/functions/Strings.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Strip wrapped parenthesis from a string.
 3 |  * @param {string} s - the string to process
 4 |  * @returns {string} the stripped string if parens found, the input string if don't
 5 |  */
 6 | function stripParens(s) {
 7 |   // null or undefined
 8 |   if (s === null || s === void 0) return s;
 9 |   // is wrapped by ( and )?, then unwrap
10 |   if (s.slice(0, 1) === "(" && s.slice(-1) === ")") return s.slice(1, -1);
11 |   // leave as it is
12 |   return s;
13 | }
14 | 
/**
 * Replaces the `{{key}}` tokens of a template string with values taken from
 * a context object.
 *
 * A token whose key is missing from the context (or whose value is `null` /
 * `undefined`) is replaced with an empty string. Every other value,
 * including falsy ones such as `0`, is rendered through `String()`.
 *
 * @param {string} template - the template string
 * @param {object} [context={}] - the data used to replace the tokens with
 * @returns {string} the template with every token replaced
 */
function templater(template, context = {}) {
  // the default parameter only covers `undefined`; tolerate `null` as well
  const data = context === null ? {} : context;
  // replace all tokens using a replacer function
  return template.replace(
    /{{([^{}]+)}}/g, // {{key}}
    (matchedText, key) => {
      const value = data[key];
      // only a missing value renders as empty; `0` must stay "0"
      return value === null || value === void 0 ? "" : String(value);
    }
  );
}
28 | 
/**
 * Wraps a text between two copies of a token.
 *
 * Both pieces are converted to strings before being glued together, so
 * numbers are never summed. A `null` / `undefined` token counts as empty.
 *
 * @param {string} s - the text to wrap
 * @param {string} [token=""] - the text placed before and after `s`
 * @returns {string} a string in the form `${token}${s}${token}`
 */
function wrap(s, token = "") {
  const edge = token === null || token === void 0 ? "" : String(token);
  // explicit String(): the former `[token, token].join(s)` trick used "," as
  // separator when `s` was undefined, yielding `${token},${token}`
  return edge + String(s) + edge;
}
40 | 
41 | module.exports = {
42 |   stripParens,
43 |   templater,
44 |   wrap,
45 | };
46 | 


--------------------------------------------------------------------------------
/languages.js:
--------------------------------------------------------------------------------
  1 | const languages = {
  2 |   ab: "Abkhazian",
  3 |   aa: "Afar",
  4 |   af: "Afrikaans",
  5 |   ak: "Akan",
  6 |   sq: "Albanian",
  7 |   am: "Amharic",
  8 |   ar: "Arabic",
  9 |   an: "Aragonese",
 10 |   hy: "Armenian",
 11 |   as: "Assamese",
 12 |   av: "Avaric",
 13 |   ae: "Avestan",
 14 |   ay: "Aymara",
 15 |   az: "Azerbaijani",
 16 |   bm: "Bambara",
 17 |   ba: "Bashkir",
 18 |   eu: "Basque",
 19 |   be: "Belarusian",
 20 |   bn: "Bengali",
 21 |   bh: "Bihari languages",
 22 |   bi: "Bislama",
 23 |   bs: "Bosnian",
 24 |   br: "Breton",
 25 |   bg: "Bulgarian",
 26 |   my: "Burmese",
 27 |   ca: "Catalan, Valencian",
 28 |   km: "Central Khmer",
 29 |   ch: "Chamorro",
 30 |   ce: "Chechen",
 31 |   ny: "Chichewa, Chewa, Nyanja",
 32 |   zh: "Chinese",
 33 |   cu: "Church Slavonic, Old Bulgarian, Old Church Slavonic",
 34 |   cv: "Chuvash",
 35 |   kw: "Cornish",
 36 |   co: "Corsican",
 37 |   cr: "Cree",
 38 |   hr: "Croatian",
 39 |   cs: "Czech",
 40 |   da: "Danish",
 41 |   dv: "Divehi, Dhivehi, Maldivian",
 42 |   nl: "Dutch, Flemish",
 43 |   dz: "Dzongkha",
 44 |   en: "English",
 45 |   "en-US": "English",
 46 |   eo: "Esperanto",
 47 |   et: "Estonian",
 48 |   ee: "Ewe",
 49 |   fo: "Faroese",
 50 |   fj: "Fijian",
 51 |   fi: "Finnish",
 52 |   fr: "French",
 53 |   ff: "Fulah",
 54 |   gd: "Gaelic, Scottish Gaelic",
 55 |   gl: "Galician",
 56 |   lg: "Ganda",
 57 |   ka: "Georgian",
 58 |   de: "German",
 59 |   ki: "Gikuyu, Kikuyu",
 60 |   el: "Greek (Modern)",
 61 |   kl: "Greenlandic, Kalaallisut",
 62 |   gn: "Guarani",
 63 |   gu: "Gujarati",
 64 |   ht: "Haitian, Haitian Creole",
 65 |   ha: "Hausa",
 66 |   he: "Hebrew",
 67 |   hz: "Herero",
 68 |   hi: "Hindi",
 69 |   ho: "Hiri Motu",
 70 |   hu: "Hungarian",
 71 |   is: "Icelandic",
 72 |   io: "Ido",
 73 |   ig: "Igbo",
 74 |   id: "Indonesian",
 75 |   ia: "Interlingua (International Auxiliary Language Association)",
 76 |   ie: "Interlingue",
 77 |   iu: "Inuktitut",
 78 |   ik: "Inupiaq",
 79 |   ga: "Irish",
 80 |   it: "Italian",
 81 |   ja: "Japanese",
 82 |   jv: "Javanese",
 83 |   kn: "Kannada",
 84 |   kr: "Kanuri",
 85 |   ks: "Kashmiri",
 86 |   kk: "Kazakh",
 87 |   rw: "Kinyarwanda",
 88 |   kv: "Komi",
 89 |   kg: "Kongo",
 90 |   ko: "Korean",
 91 |   kj: "Kwanyama, Kuanyama",
 92 |   ku: "Kurdish",
 93 |   ky: "Kyrgyz",
 94 |   lo: "Lao",
 95 |   la: "Latin",
 96 |   lv: "Latvian",
 97 |   lb: "Letzeburgesch, Luxembourgish",
 98 |   li: "Limburgish, Limburgan, Limburger",
 99 |   ln: "Lingala",
100 |   lt: "Lithuanian",
101 |   lu: "Luba-Katanga",
102 |   mk: "Macedonian",
103 |   mg: "Malagasy",
104 |   ms: "Malay",
105 |   ml: "Malayalam",
106 |   mt: "Maltese",
107 |   gv: "Manx",
108 |   mi: "Maori",
109 |   mr: "Marathi",
110 |   mh: "Marshallese",
111 |   ro: "Moldovan, Moldavian, Romanian",
112 |   mn: "Mongolian",
113 |   na: "Nauru",
114 |   nv: "Navajo, Navaho",
115 |   nd: "Northern Ndebele",
116 |   ng: "Ndonga",
117 |   ne: "Nepali",
118 |   se: "Northern Sami",
119 |   no: "Norwegian",
120 |   nb: "Norwegian Bokmål",
121 |   nn: "Norwegian Nynorsk",
122 |   ii: "Nuosu, Sichuan Yi",
123 |   oc: "Occitan (post 1500)",
124 |   oj: "Ojibwa",
125 |   or: "Oriya",
126 |   om: "Oromo",
127 |   os: "Ossetian, Ossetic",
128 |   pi: "Pali",
129 |   pa: "Panjabi, Punjabi",
130 |   ps: "Pashto, Pushto",
131 |   fa: "Persian",
132 |   "fa-IR": "Persian (Iran)",
133 |   pl: "Polish",
134 |   "pt-BR": "Portuguese (Brazil)",
135 |   "pt-PT": "Portuguese (Portugal)",
136 |   qu: "Quechua",
137 |   rm: "Romansh",
138 |   rn: "Rundi",
139 |   ru: "Russian",
140 |   sm: "Samoan",
141 |   sg: "Sango",
142 |   sa: "Sanskrit",
143 |   sc: "Sardinian",
144 |   sr: "Serbian",
145 |   sn: "Shona",
146 |   sd: "Sindhi",
147 |   si: "Sinhala, Sinhalese",
148 |   sk: "Slovak",
149 |   sl: "Slovenian",
150 |   so: "Somali",
151 |   st: "Sotho, Southern",
152 |   nr: "South Ndebele",
153 |   es: "Spanish, Castilian",
154 |   su: "Sundanese",
155 |   sw: "Swahili",
156 |   ss: "Swati",
157 |   sv: "Swedish",
158 |   tl: "Tagalog",
159 |   ty: "Tahitian",
160 |   tg: "Tajik",
161 |   ta: "Tamil",
162 |   tt: "Tatar",
163 |   te: "Telugu",
164 |   th: "Thai",
165 |   bo: "Tibetan",
166 |   ti: "Tigrinya",
167 |   to: "Tonga (Tonga Islands)",
168 |   ts: "Tsonga",
169 |   tn: "Tswana",
170 |   tr: "Turkish",
171 |   tk: "Turkmen",
172 |   tw: "Twi",
173 |   ug: "Uighur, Uyghur",
174 |   uk: "Ukrainian",
175 |   ur: "Urdu",
176 |   uz: "Uzbek",
177 |   ve: "Venda",
178 |   vi: "Vietnamese",
179 |   vo: "Volap_k",
180 |   wa: "Walloon",
181 |   cy: "Welsh",
182 |   fy: "Western Frisian",
183 |   wo: "Wolof",
184 |   xh: "Xhosa",
185 |   yi: "Yiddish",
186 |   yo: "Yoruba",
187 |   za: "Zhuang, Chuang",
188 |   zu: "Zulu",
189 | };
190 | 
191 | module.exports = languages;
192 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | 
  3 | const fs = require("fs");
  4 | const path = require("path");
  5 | const remark = require("remark");
  6 | const { Objects, Strings } = require("./lib/functions");
  7 | const languages = require("./languages");
  8 | const commandLineArgs = require("command-line-args");
  9 | 
 10 | const optionDefinitions = [
 11 |   {
 12 |     name: "input",
 13 |     multiple: true,
 14 |     defaultValue: ["./fpb/books", "./fpb/casts", "./fpb/courses", "./fpb/more"],
 15 |   },
 16 |   { name: "output", defaultValue: "./parser/fpb.json" },
 17 | ];
 18 | 
 19 | const excludes = [
 20 |   "README.md",
 21 |   "CONTRIBUTING.md",
 22 |   "CODE_OF_CONDUCT.md",
 23 |   "SUMMARY.md",
 24 | ];
 25 | 
 26 | /**
 27 |  * Parses the contents of a heading from remark-parse into a readable format.
 28 |  *
 29 |  * @param {Array<Object>} children - an array of AST items defined by remark-parse for
 30 |  *        the content of headings (H1..H7)
 31 |  *
 32 |  * @returns {string} an string with the name of the section related with the input heading
 33 |  */
 34 | function getSectionNameFromHeadingContent(children) {
 35 |   // visit nodes in depth
 36 |   const walk = (children, depth) =>
 37 |     children.reduce((text, node, index) => {
 38 |       if (!node || !node.type) return text; // not AST, maybe plain text
 39 |       switch (node.type) {
 40 |         //
 41 |         // meaningfull nodes
 42 |         //
 43 |         case "emphasis":
 44 |         case "strong":
 45 |           text += Strings.templater(remarkTokenAST(node), {
 46 |             text: walk(node.children, depth + 1),
 47 |           });
 48 |           break;
 49 |         case "inlineCode":
 50 |         case "text":
 51 |           text += Strings.templater(remarkTokenAST(node), {
 52 |             text: node.value,
 53 |           });
 54 |           break;
 55 |         //
 56 |         // skipped nodes
 57 |         //
 58 |         case "heading":
 59 |         case "html":
 60 |         case "link":
 61 |         case "list":
 62 |         case "paragraph":
 63 |         default:
 64 |           break;
 65 |       }
 66 |       return text;
 67 |     }, "");
 68 | 
 69 |   return walk(children, 0);
 70 | }
 71 | 
 72 | /**
 73 |  * Parses the contents of a link from remark-parse into a readable format.
 74 |  *
 75 |  * @param {Array<Object>} children - an array of AST items defined by remark-parse for
 76 |  *        the content of a link (A)
 77 |  *
 78 |  * @returns {string} an string with the text of the related input link
 79 |  */
 80 | function getLinkTextFromLinkNodes(children) {
 81 |   // visit nodes in depth
 82 |   const walk = (children, depth) => {
 83 |     // not AST, maybe plain text
 84 |     if (!Array.isArray(children)) return Objects.toString(children);
 85 |     // AST children array nodes
 86 |     return children.reduce((text, node, index) => {
 87 |       if (!node || !node.type) return text; // not AST, maybe plain text
 88 |       switch (node.type) {
 89 |         //
 90 |         // rebuild meaningfull nodes
 91 |         //
 92 |         case "image":
 93 |           text += Strings.templater(remarkTokenAST(node), {
 94 |             text: node.alt || node.title,
 95 |             url: node.url,
 96 |           });
 97 |           break;
 98 |         case "inlineCode":
 99 |         case "text":
100 |           text += Strings.templater(remarkTokenAST(node), {
101 |             text: node.value,
102 |           });
103 |           break;
104 |         case "emphasis":
105 |         case "strong":
106 |           text += Strings.templater(remarkTokenAST(node), {
107 |             text: walk(node.children, depth + 1),
108 |           });
109 |           break;
110 |         //
111 |         // skipped nodes
112 |         //
113 |         default:
114 |           console.log(
115 |             "getLinkTextFromLinkNodes::skipped",
116 |             depth,
117 |             node.type,
118 |             node
119 |           );
120 |           break;
121 |       }
122 |       return text;
123 |     }, "");
124 |   };
125 | 
126 |   return walk(children, 0);
127 | }
128 | 
/**
 * Gets the markdown template that corresponds to a remark-parse AST node.
 *
 * Templates use the `{{text}}` (and, for images and links, `{{url}}`)
 * placeholders. `list` and `listItem` have no template yet and are rejected
 * like any unknown type.
 *
 * @param {Object} node - AST node defined by remark-parse
 * @returns {string} the template string
 * @throws {Error} when the node has no type or its type has no template
 */
function remarkTokenAST(node) {
  const type = node && node.type;
  const text = "{{text}}";

  // one lazy builder per supported node type
  // TODO: generate token for list/listItem
  const builders = new Map([
    ["break", () => "<br/>"],
    ["emphasis", () => Strings.wrap(text, "_")],
    // as many "#" as the heading depth
    ["heading", () => ["#".repeat(node.depth || 0), text].join("")],
    ["image", () => "![{{text}}]({{url}})"],
    ["inlineCode", () => Strings.wrap(text, "`")],
    ["link", () => "[{{text}}]({{url}})"],
    ["strong", () => Strings.wrap(text, "**")],
    // identity templates
    ["html", () => Strings.wrap(text)],
    ["paragraph", () => Strings.wrap(text)],
    ["text", () => Strings.wrap(text)],
  ]);

  const build = builders.get(type);
  if (build) return build();
  throw new Error("Unrecognized remark node type: " + type);
}
165 | 
/**
 * Collects every complete `(note)` found in a text into `notes`.
 *
 * Nested parentheses are not supported: each "(" is paired with the first
 * ")" that follows it.
 *
 * @param {string} text - the text to scan
 * @param {Array<string>} notes - array that receives the notes (without parens)
 * @returns {string} the unterminated tail, starting at its "(", when the last
 *          note is not closed inside `text`; an empty string otherwise
 */
function collectParenNotes(text, notes) {
  let open = text.indexOf("(");
  while (open !== -1) {
    const close = text.indexOf(")", open);
    // note continues in the next nodes (there must be some *emphasis* ahead)
    if (close === -1) return text.slice(open);
    notes.push(text.slice(open + 1, close));
    open = text.indexOf("(", close);
  }
  return "";
}

/**
 * Parses a list item generated from remark-parse into a readable format.
 *
 * remark-parse parses a markdown file into a long, intricate json.
 * Many fields in this json either give information we do not care
 * about or do not go into enough detail. This function parses the
 * output of remark-parse into a format preferred by this project,
 * indicating authors, notes, and links etc.
 *
 * The first node is taken as the main link (`url`, `title`). The remaining
 * nodes may add:
 *  - `author`: text that starts with " - ", up to the first "("
 *  - `accessNotes`: an emphasis whose text is wrapped in parentheses
 *  - `otherLinks`: any further link (`title`, `url`)
 *  - `notes`: every parenthesized chunk of text; a note may span several
 *    nodes when it contains emphasis
 *  - `manualReviewRequired`: set when a link shows up inside an open note
 *
 * @param {Array<Object>} listItem - the AST nodes (as defined by remark-parse)
 *        that make up the content of a list item
 *
 * @return {Object} Returns an Object containing details about the piece of media.
 */
function parseListItem(listItem) {
  const entry = {};
  // Text of a note that is still open (spans several nodes); "" when none
  let pendingNote = "";
  // head of listItem = url, the rest is "other stuff"
  const [link, ...otherStuff] = listItem;
  entry.url = link.url;
  // link.children || link.value => weak way to check if link.type === "link"
  entry.title = getLinkTextFromLinkNodes(link.children || link.value);
  // remember to get OTHER STUFF!! remember there may be multiple links!
  for (const node of otherStuff) {
    if (pendingNote === "") {
      // this is almost always, except for when we are parsing a multi-element note
      if (node.type === "text" && node.value.slice(0, 3) === " - ") {
        // author found: goes from " - " until the first "(" (or the end)
        const parenIndex = node.value.indexOf("(");
        entry.author =
          parenIndex === -1
            ? node.value.slice(3).trim()
            : node.value.slice(3, parenIndex).trim();
      }
      if (
        node.type === "emphasis" &&
        node.children[0].value.slice(0, 1) === "(" &&
        node.children[0].value.slice(-1) === ")"
      ) {
        // access notes found (currently assumes exactly one child, so far this is always the case)
        entry.accessNotes = node.children[0].value.slice(1, -1);
      }
      if (node.type === "link") {
        // other links found
        if (entry.otherLinks === undefined) entry.otherLinks = [];
        entry.otherLinks.push({
          title: Strings.stripParens(getLinkTextFromLinkNodes(node.children)),
          url: node.url,
        });
      }
      if (node.type === "text" && node.value.indexOf("(") !== -1) {
        // notes found (currently assumes no nested parentheses)
        if (entry.notes === undefined) entry.notes = [];
        pendingNote = collectParenNotes(node.value, entry.notes);
      }
    } else if (node.type === "emphasis") {
      // for now we assume that all previous ifs are mutually exclusive with this
      // this is the emphasis inside the open note, keep its markup and move on
      pendingNote += "*" + node.children[0].value + "*";
    } else if (node.type === "link") {
      // something has gone terribly wrong. this book must be viewed and edited manually.
      entry.manualReviewRequired = true;
      break;
    } else {
      // hopefully this is the end of the note
      const closeIndex = node.value.indexOf(")");
      if (closeIndex === -1) {
        // we have to go AGAIN
        pendingNote += node.value;
      } else {
        // finally, we have reached the end of the note
        entry.notes.push(
          Strings.stripParens(pendingNote + node.value.slice(0, closeIndex + 1))
        );
        // the same text may hold further notes (scanned from its beginning)
        pendingNote = collectParenNotes(node.value, entry.notes);
      }
    }
  }
  return entry;
}
273 | 
/**
 * Determines the language of a file from its name, following the naming
 * used by the markdown files of the Free Programming Books repository:
 * the language code sits between the last "-" and the last "." of the name
 * (an underscore inside it, as in "pt_BR", is read as a dash).
 *
 * @param {String} filename - a filename such as "free-programming-books-es.md"
 * @returns {{lang: String, isSubject: Boolean}|String}
 *          - for a known code: that code, with `isSubject` false
 *          - for a name that does not look like a language code: "en", with
 *            `isSubject` true only when the name ends in "-subjects"
 *          - for something shaped like a language code ("xx" or "xx-YY") that
 *            is not in the languages table: an empty string
 */
function getLangFromFilename(filename) {
  const codeStart = filename.lastIndexOf("-") + 1;
  const codeEnd = filename.lastIndexOf(".");
  const code = filename.slice(codeStart, codeEnd).replace(/_/, "-");

  // known language
  if (languages.hasOwnProperty(code)) {
    return { lang: code, isSubject: false };
  }
  // looks like a language code, but we do not know it
  if (/^[a-z]{2}(?:-[A-Z]{2})?$/.test(code)) {
    return "";
  }
  // not a language at all: an english file, maybe the one sorted by subject
  return { lang: "en", isSubject: code === "subjects" };
}
297 | 
/**
 * Lists the markdown files that sit directly inside a directory.
 *
 * @param {String} dir - A directory path
 * @returns {Array<String>} the paths (`dir` joined with the file name) of all
 *          ".md" files of the directory, except those named in `excludes`
 */
function getFilesFromDir(dir) {
  const markdownFiles = [];
  for (const name of fs.readdirSync(dir)) {
    const isMarkdown = path.extname(name) === ".md";
    if (isMarkdown && !excludes.includes(name)) {
      markdownFiles.push(path.join(dir, name));
    }
  }
  return markdownFiles;
}
311 | 
/**
 * Retrieves the name of the folder a path refers to.
 *
 * @param {String} str - A path to a directory, or to a file inside one
 *        (e.g. "./directory" or "./directory/file")
 * @returns {String} the name of the directory itself when `str` is a
 *          directory; the name of its parent folder otherwise
 */
function getMediaTypeFromDirectoryPath(str) {
  // sanitize and expand (OS independent)
  const absolutePath = path.resolve(str);

  // a directory names itself
  if (fs.lstatSync(absolutePath).isDirectory()) {
    return path.basename(absolutePath);
  }

  // otherwise the previous slug is always a directory; extract that part
  // path.sep: Windows -> "\", Unix -> "/"
  const slugs = absolutePath.split(path.sep);
  return slugs.slice(-2, -1).join(path.sep);
}
330 | 
/**
 * Turns the content of a single markdown file into the json structure needed.
 *
 * Everything before the second h3 heading (the index) is skipped. From there
 * on, each h3 opens a section, each h4 opens a subsection of the latest
 * section, and the items of every list become entries of the latest section
 * or subsection, whichever heading came last.
 *
 * A node that fails to parse does not abort the document: the failure is
 * recorded as "Error at line X - line Y." and parsing goes on with the next
 * node.
 *
 * @param {string|Buffer} doc - the content of a markdown file, handed to `remark.parse`
 * @returns {{sections: Array<Object>, errors: Array<string>}} the sections
 *          found in the document and the errors raised while parsing it
 */
function parseMarkdown(doc) {
  const tree = remark.parse(doc).children;
  const sections = []; // This will go into root object later
  const errors = [];
  let currentDepth = 3; // used to determine if the last heading was an h4 or h3

  // find where Index ends: the content starts at the second h3
  let start = 0;
  let h3Count = 0;
  for (; start < tree.length; start++) {
    if (tree[start].type === "heading" && tree[start].depth === 3) h3Count++;
    if (h3Count === 2) break;
  }

  // Start iterating after Index
  tree.slice(start).forEach((item) => {
    try {
      if (item.type === "heading") {
        const sectionName = getSectionNameFromHeadingContent(item.children);
        if (sectionName === "Index") return;
        if (item.depth === 3) {
          // Heading is an h3: open a new section
          currentDepth = 3;
          sections.push({
            section: sectionName,
            entries: [],
            subsections: [],
          });
        } else if (item.depth === 4) {
          // Heading is an h4: open a subsection in the most recent h3
          currentDepth = 4;
          sections[sections.length - 1].subsections.push({
            section: sectionName,
            entries: [],
          });
        }
      } else if (item.type === "list") {
        item.children.forEach((listItem) => {
          // array containing a remark-link and the rest of the paragraph
          const content = listItem.children[0].children;
          const lastSection = sections[sections.length - 1];
          if (currentDepth === 3) {
            // add the entry to most recent h3
            lastSection.entries.push(parseListItem(content));
          } else if (currentDepth === 4) {
            // add the entry to most recent h4
            const lastSubsection =
              lastSection.subsections[lastSection.subsections.length - 1];
            const contentJson = parseListItem(content);
            lastSubsection.entries.push(contentJson);
          }
        });
      }
    } catch (e) {
      // if there was an error while parsing, keep where it happened
      const errStart = JSON.stringify(item.position.start.line);
      const errEnd = JSON.stringify(item.position.end.line);
      // declared locally: a bare `str = ...` would leak an implicit global
      const message = `Error at line ${errStart} - line ${errEnd}.`;
      errors.push(message);
    }
  });
  return { sections: sections, errors: errors };
}
409 | 
/**
 * Parses the markdown files of a single directory and converts them into usable json.
 *
 * @param {String} directory A string pointing to a directory
 * @returns {Object} An object containing two values, dirJson and dirErrors.
 *                   dirJson holds the media type of the directory and one
 *                   child per markdown file (its language and sections).
 *                   dirErrors holds, for every file that had parse errors,
 *                   its name and the list of those errors.
 */
function parseDirectory(directory) {
  const mediaType = getMediaTypeFromDirectoryPath(directory);
  const children = []; // one record per markdown doc
  const dirErrors = []; // errors of the docs that had any

  for (const filename of getFilesFromDir(path.resolve(directory))) {
    // parse the markdown document
    const parsed = parseMarkdown(fs.readFileSync(filename));
    const langInfo = getLangFromFilename(filename);

    // Entries
    const language = {
      code: langInfo.lang,
      name: languages[langInfo.lang],
    };
    // only english files tell whether they are sorted by subject
    if (langInfo.lang === "en") language.isSubject = langInfo.isSubject;
    children.push({
      language: language,
      index: {},
      sections: parsed.sections,
    });

    // Errors
    if (parsed.errors.length !== 0) {
      dirErrors.push({
        file: path.basename(filename),
        errors: parsed.errors,
      });
    }
  }

  return {
    dirJson: {
      type: mediaType,
      index: {},
      children: children,
    },
    dirErrors: dirErrors,
  };
}
462 | 
/**
 * Reads all given directories for markdown files and writes the parsed json
 * to the output file.
 *
 * The folder of the output file is created when it does not exist yet. Any
 * failure while writing is thrown (the write is synchronous).
 *
 * @param {Array}  directories A list of strings of directories to scan for markdown files
 * @param {String} output A string for the path of the JSON file to write
 * @returns {{json: Object, errors: Object}} `json` is the tree that was
 *          written to `output`; `errors` groups, per directory, the files
 *          that had parse errors (it is not written anywhere)
 */
function parseAll(directories, output) {
  const rootChildren = []; // this will hold the output of each directory
  const rootErrors = [];

  directories.forEach((directory) => {
    const { dirJson, dirErrors } = parseDirectory(directory);
    rootChildren.push(dirJson);
    if (dirErrors.length !== 0) {
      rootErrors.push({
        directory: path.basename(directory),
        files: dirErrors,
      });
    }
  });

  // All entries
  const rootJson = {
    type: "root",
    children: rootChildren,
  };

  // Errors
  const allErrors = {
    type: "root",
    directories: rootErrors,
  };

  // make sure the destination folder is there before writing
  fs.mkdirSync(path.dirname(output), { recursive: true });
  // writeFileSync takes no callback: it throws on failure
  fs.writeFileSync(output, JSON.stringify(rootJson, null, 3));

  return { json: rootJson, errors: allErrors };
}
510 | 
// Entry point: read the command-line options and parse every input directory
// into the output file.
const cliOptions = commandLineArgs(optionDefinitions);
parseAll(cliOptions.input, cliOptions.output);
513 | 


--------------------------------------------------------------------------------