├── README.md ├── .gitignore ├── lib └── functions │ ├── index.js │ ├── Objects.js │ └── Strings.js ├── package.json ├── .editorconfig ├── languages.js └── index.js /README.md: -------------------------------------------------------------------------------- 1 | # markdown-parser 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | *.log 3 | *.md 4 | !README.md 5 | package-lock.json 6 | *.txt 7 | *.json 8 | -------------------------------------------------------------------------------- /lib/functions/index.js: -------------------------------------------------------------------------------- 1 | const Objects = require("./Objects"); 2 | const Strings = require("./Strings"); 3 | 4 | module.exports = { 5 | Objects, 6 | Strings, 7 | }; 8 | -------------------------------------------------------------------------------- /lib/functions/Objects.js: -------------------------------------------------------------------------------- 1 | /** 2 | * To string 3 | * @param {any} o - the object to get it text representation 4 | * @returns {string} the `o` as string 5 | */ 6 | function toString(o) { 7 | // null or undefined 8 | if (o === null || o === void 0) return o; 9 | // is string 10 | if (typeof o === "string") return o; 11 | // has a toString function in their prototype 12 | if (typeof o.toString === "function") return o.toString(); 13 | // as string in the latest intent 14 | return String(o); 15 | } 16 | 17 | module.exports = { 18 | toString, 19 | }; 20 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "free-programming-books-markdown-parser", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git+https://github.com/EbookFoundation/free-programming-books-parser.git" 9 | }, 10 | "scripts": { 11 | "start": "node index.js", 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "bin": { 15 | "fpb-parse": "./index.js" 16 | }, 17 | "author": "Nick Quidas", 18 | "license": "ISC", 19 | "dependencies": { 20 | "command-line-args": "^5.2.0", 21 | "remark": "^13.0.0" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | ; top-most EditorConfig file 6 | root = true 7 | 8 | ; define basic and global for any file 9 | [*] 10 | charset = utf-8 11 | end_of_line = lf 12 | indent_size = 4 13 | indent_style = space 14 | insert_final_newline = true 15 | max_line_length = off 16 | trim_trailing_whitespace = true 17 | curly_bracket_next_line = false 18 | spaces_around_operators = true 19 | 20 | ; DOS/Windows batch scripts - 21 | [*.{bat,cmd}] 22 | end_of_line = crlf 23 | 24 | ; JavaScript files - 25 | [*.{js,ts}] 26 | curly_bracket_next_line = true 27 | indent_size = 2 28 | quote_type = double 29 | 30 | ; JSON files (normal and commented version) - 31 | [*.{json,jsonc}] 32 | indent_size = 2 33 | quote_type = double 34 | 35 | ; Make - match it own default syntax 36 | [Makefile] 37 | indent_style = tab 38 | 39 | ; Markdown files - preserve trail spaces that means break line 40 | [*.{md,markdown}] 41 | trim_trailing_whitespace = false 42 | 43 | ; PowerShell - match defaults for New-ModuleManifest and PSScriptAnalyzer Invoke-Formatter 44 | [*.{ps1,psd1,psm1}] 45 | charset = utf-8-bom 46 | end_of_line = crlf 47 | 48 | ; YML config files - match it own default syntax 49 | [*.{yaml,yml}] 50 | indent_size = 2 51 | -------------------------------------------------------------------------------- /lib/functions/Strings.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Strip wrapped parenthesis from a string. 3 | * @param {string} s - the string to process 4 | * @returns {string} the stripped string if parens found, the input string if don't 5 | */ 6 | function stripParens(s) { 7 | // null or undefined 8 | if (s === null || s === void 0) return s; 9 | // is wrapped by ( and )?, then unwrap 10 | if (s.slice(0, 1) === "(" && s.slice(-1) === ")") return s.slice(1, -1); 11 | // leave as it is 12 | return s; 13 | } 14 | 15 | /** 16 | * Replaces a data tokens in a template string. 17 | * @param {string} template - the template string 18 | * @param {object} context - the data used to replace the tokens with 19 | * @returns string replace 20 | */ 21 | function templater(template, context = {}) { 22 | // replaceAll using a replacer function 23 | return template.replace( 24 | /{{([^{}]+)}}/g, // {{key}} 25 | (matchedText, key) => context[key] || "" 26 | ); 27 | } 28 | 29 | /** 30 | * Wraps a string between other that acts as token. 31 | * @param {string} s - the text to wrap 32 | * @param {string} token - the text to wrap with between 33 | * @returns a string in the form `${token}${s}${token}` 34 | */ 35 | function wrap(s, token = "") { 36 | // avoid mix concatenate/sum string/numbers using array join hack 37 | //return `${token}${s}${token}`; 38 | return [token, token].join(s); 39 | } 40 | 41 | module.exports = { 42 | stripParens, 43 | templater, 44 | wrap, 45 | }; 46 | -------------------------------------------------------------------------------- /languages.js: -------------------------------------------------------------------------------- 1 | const languages = { 2 | ab: "Abkhazian", 3 | aa: "Afar", 4 | af: "Afrikaans", 5 | ak: "Akan", 6 | sq: "Albanian", 7 | am: "Amharic", 8 | ar: "Arabic", 9 | an: "Aragonese", 10 | hy: "Armenian", 11 | as: "Assamese", 12 | av: "Avaric", 13 | ae: "Avestan", 14 | ay: "Aymara", 15 | az: "Azerbaijani", 16 | bm: "Bambara", 17 | ba: "Bashkir", 18 | eu: "Basque", 19 | be: "Belarusian", 20 | bn: "Bengali", 21 | bh: "Bihari languages", 22 | bi: "Bislama", 23 | bs: "Bosnian", 24 | br: "Breton", 25 | bg: "Bulgarian", 26 | my: "Burmese", 27 | ca: "Catalan, Valencian", 28 | km: "Central Khmer", 29 | ch: "Chamorro", 30 | ce: "Chechen", 31 | ny: "Chichewa, Chewa, Nyanja", 32 | zh: "Chinese", 33 | cu: "Church Slavonic, Old Bulgarian, Old Church Slavonic", 34 | cv: "Chuvash", 35 | kw: "Cornish", 36 | co: "Corsican", 37 | cr: "Cree", 38 | hr: "Croatian", 39 | cs: "Czech", 40 | da: "Danish", 41 | dv: "Divehi, Dhivehi, Maldivian", 42 | nl: "Dutch, Flemish", 43 | dz: "Dzongkha", 44 | en: "English", 45 | "en-US": "English", 46 | eo: "Esperanto", 47 | et: "Estonian", 48 | ee: "Ewe", 49 | fo: "Faroese", 50 | fj: "Fijian", 51 | fi: "Finnish", 52 | fr: "French", 53 | ff: "Fulah", 54 | gd: "Gaelic, Scottish Gaelic", 55 | gl: "Galician", 56 | lg: "Ganda", 57 | ka: "Georgian", 58 | de: "German", 59 | ki: "Gikuyu, Kikuyu", 60 | el: "Greek (Modern)", 61 | kl: "Greenlandic, Kalaallisut", 62 | gn: "Guarani", 63 | gu: "Gujarati", 64 | ht: "Haitian, Haitian Creole", 65 | ha: "Hausa", 66 | he: "Hebrew", 67 | hz: "Herero", 68 | hi: "Hindi", 69 | ho: "Hiri Motu", 70 | hu: "Hungarian", 71 | is: "Icelandic", 72 | io: "Ido", 73 | ig: "Igbo", 74 | id: "Indonesian", 75 | ia: "Interlingua (International Auxiliary Language Association)", 76 | ie: "Interlingue", 77 | iu: "Inuktitut", 78 | ik: "Inupiaq", 79 | ga: "Irish", 80 | it: "Italian", 81 | ja: "Japanese", 82 | jv: "Javanese", 83 | kn: "Kannada", 84 | kr: "Kanuri", 85 | ks: "Kashmiri", 86 | kk: "Kazakh", 87 | rw: "Kinyarwanda", 88 | kv: "Komi", 89 | kg: "Kongo", 90 | ko: "Korean", 91 | kj: "Kwanyama, Kuanyama", 92 | ku: "Kurdish", 93 | ky: "Kyrgyz", 94 | lo: "Lao", 95 | la: "Latin", 96 | lv: "Latvian", 97 | lb: "Letzeburgesch, Luxembourgish", 98 | li: "Limburgish, Limburgan, Limburger", 99 | ln: "Lingala", 100 | lt: "Lithuanian", 101 | lu: "Luba-Katanga", 102 | mk: "Macedonian", 103 | mg: "Malagasy", 104 | ms: "Malay", 105 | ml: "Malayalam", 106 | mt: "Maltese", 107 | gv: "Manx", 108 | mi: "Maori", 109 | mr: "Marathi", 110 | mh: "Marshallese", 111 | ro: "Moldovan, Moldavian, Romanian", 112 | mn: "Mongolian", 113 | na: "Nauru", 114 | nv: "Navajo, Navaho", 115 | nd: "Northern Ndebele", 116 | ng: "Ndonga", 117 | ne: "Nepali", 118 | se: "Northern Sami", 119 | no: "Norwegian", 120 | nb: "Norwegian Bokmål", 121 | nn: "Norwegian Nynorsk", 122 | ii: "Nuosu, Sichuan Yi", 123 | oc: "Occitan (post 1500)", 124 | oj: "Ojibwa", 125 | or: "Oriya", 126 | om: "Oromo", 127 | os: "Ossetian, Ossetic", 128 | pi: "Pali", 129 | pa: "Panjabi, Punjabi", 130 | ps: "Pashto, Pushto", 131 | fa: "Persian", 132 | "fa-IR": "Persian (Iran)", 133 | pl: "Polish", 134 | "pt-BR": "Portuguese (Brazil)", 135 | "pt-PT": "Portuguese (Portugal)", 136 | qu: "Quechua", 137 | rm: "Romansh", 138 | rn: "Rundi", 139 | ru: "Russian", 140 | sm: "Samoan", 141 | sg: "Sango", 142 | sa: "Sanskrit", 143 | sc: "Sardinian", 144 | sr: "Serbian", 145 | sn: "Shona", 146 | sd: "Sindhi", 147 | si: "Sinhala, Sinhalese", 148 | sk: "Slovak", 149 | sl: "Slovenian", 150 | so: "Somali", 151 | st: "Sotho, Southern", 152 | nr: "South Ndebele", 153 | es: "Spanish, Castilian", 154 | su: "Sundanese", 155 | sw: "Swahili", 156 | ss: "Swati", 157 | sv: "Swedish", 158 | tl: "Tagalog", 159 | ty: "Tahitian", 160 | tg: "Tajik", 161 | ta: "Tamil", 162 | tt: "Tatar", 163 | te: "Telugu", 164 | th: "Thai", 165 | bo: "Tibetan", 166 | ti: "Tigrinya", 167 | to: "Tonga (Tonga Islands)", 168 | ts: "Tsonga", 169 | tn: "Tswana", 170 | tr: "Turkish", 171 | tk: "Turkmen", 172 | tw: "Twi", 173 | ug: "Uighur, Uyghur", 174 | uk: "Ukrainian", 175 | ur: "Urdu", 176 | uz: "Uzbek", 177 | ve: "Venda", 178 | vi: "Vietnamese", 179 | vo: "Volap_k", 180 | wa: "Walloon", 181 | cy: "Welsh", 182 | fy: "Western Frisian", 183 | wo: "Wolof", 184 | xh: "Xhosa", 185 | yi: "Yiddish", 186 | yo: "Yoruba", 187 | za: "Zhuang, Chuang", 188 | zu: "Zulu", 189 | }; 190 | 191 | module.exports = languages; 192 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require("fs"); 4 | const path = require("path"); 5 | const remark = require("remark"); 6 | const { Objects, Strings } = require("./lib/functions"); 7 | const languages = require("./languages"); 8 | const commandLineArgs = require("command-line-args"); 9 | 10 | const optionDefinitions = [ 11 | { 12 | name: "input", 13 | multiple: true, 14 | defaultValue: ["./fpb/books", "./fpb/casts", "./fpb/courses", "./fpb/more"], 15 | }, 16 | { name: "output", defaultValue: "./parser/fpb.json" }, 17 | ]; 18 | 19 | const excludes = [ 20 | "README.md", 21 | "CONTRIBUTING.md", 22 | "CODE_OF_CONDUCT.md", 23 | "SUMMARY.md", 24 | ]; 25 | 26 | /** 27 | * Parses the contents of a heading from remark-parse into a readable format. 28 | * 29 | * @param {Array} children - an array of AST items defined by remark-parse for 30 | * the content of headings (H1..H7) 31 | * 32 | * @returns {string} an string with the name of the section related with the input heading 33 | */ 34 | function getSectionNameFromHeadingContent(children) { 35 | // visit nodes in depth 36 | const walk = (children, depth) => 37 | children.reduce((text, node, index) => { 38 | if (!node || !node.type) return text; // not AST, maybe plain text 39 | switch (node.type) { 40 | // 41 | // meaningfull nodes 42 | // 43 | case "emphasis": 44 | case "strong": 45 | text += Strings.templater(remarkTokenAST(node), { 46 | text: walk(node.children, depth + 1), 47 | }); 48 | break; 49 | case "inlineCode": 50 | case "text": 51 | text += Strings.templater(remarkTokenAST(node), { 52 | text: node.value, 53 | }); 54 | break; 55 | // 56 | // skipped nodes 57 | // 58 | case "heading": 59 | case "html": 60 | case "link": 61 | case "list": 62 | case "paragraph": 63 | default: 64 | break; 65 | } 66 | return text; 67 | }, ""); 68 | 69 | return walk(children, 0); 70 | } 71 | 72 | /** 73 | * Parses the contents of a link from remark-parse into a readable format. 74 | * 75 | * @param {Array} children - an array of AST items defined by remark-parse for 76 | * the content of a link (A) 77 | * 78 | * @returns {string} an string with the text of the related input link 79 | */ 80 | function getLinkTextFromLinkNodes(children) { 81 | // visit nodes in depth 82 | const walk = (children, depth) => { 83 | // not AST, maybe plain text 84 | if (!Array.isArray(children)) return Objects.toString(children); 85 | // AST children array nodes 86 | return children.reduce((text, node, index) => { 87 | if (!node || !node.type) return text; // not AST, maybe plain text 88 | switch (node.type) { 89 | // 90 | // rebuild meaningfull nodes 91 | // 92 | case "image": 93 | text += Strings.templater(remarkTokenAST(node), { 94 | text: node.alt || node.title, 95 | url: node.url, 96 | }); 97 | break; 98 | case "inlineCode": 99 | case "text": 100 | text += Strings.templater(remarkTokenAST(node), { 101 | text: node.value, 102 | }); 103 | break; 104 | case "emphasis": 105 | case "strong": 106 | text += Strings.templater(remarkTokenAST(node), { 107 | text: walk(node.children, depth + 1), 108 | }); 109 | break; 110 | // 111 | // skipped nodes 112 | // 113 | default: 114 | console.log( 115 | "getLinkTextFromLinkNodes::skipped", 116 | depth, 117 | node.type, 118 | node 119 | ); 120 | break; 121 | } 122 | return text; 123 | }, ""); 124 | }; 125 | 126 | return walk(children, 0); 127 | } 128 | 129 | /** 130 | * Gets the template related with AST remark-parse node. 131 | * @param {Object} node - AST node defined by remark-parse 132 | * @returns {string} - the template string 133 | */ 134 | function remarkTokenAST(node) { 135 | if (node && node.type) { 136 | switch (node.type) { 137 | case "break": // {type: 'break', position: {...}} 138 | return "
"; 139 | case "emphasis": // {type: 'emphasis', children: [...], position: {...}} 140 | return Strings.wrap("{{text}}", "_"); 141 | case "heading": // {type: 'heading', depth: 1, children: [...], position: {...}} 142 | return ["#".repeat(node.depth || 0), "{{text}}"].join(""); 143 | case "image": // {type: 'image', title: '...', url: '...', alt: '...', position: {...}} 144 | return "![{{text}}]({{url}})"; 145 | case "inlineCode": // {type: 'inlineCode', value: '...', position: {...}} 146 | return Strings.wrap("{{text}}", "`"); 147 | case "link": // {type: 'link', title: '...', url: '...', children: [...], position: {...}} 148 | return "[{{text}}]({{url}})"; 149 | case "list": // {type: 'list', ordered: false, start: null, spread: false, children: [...], position: {...}} 150 | case "listItem": // {type: 'listItem', spread: false, checked: null, children: [...], position: {...}} 151 | // TODO: generate token for list/listItem 152 | break; 153 | case "strong": // {type: 'strong', children: [...], position: {...}} 154 | return Strings.wrap("{{text}}", "**"); 155 | case "html": // {type: 'html', value: '...', position: {...}} 156 | case "paragraph": // {type: 'paragraph', children: [...], position: {...}} 157 | case "text": // {type: 'text', value: '...', position: {...}} 158 | return Strings.wrap("{{text}}"); // identity 159 | default: 160 | break; 161 | } 162 | } 163 | throw new Error("Unrecognized remark node type: " + (node && node.type)); 164 | } 165 | 166 | /** 167 | * Parses a list item generated from remark-parse into a readable format. 168 | * 169 | * remark-parse parses a markdown file into a long, intricate json. 170 | * Many fields in this json either give information we do not care 171 | * about or does not go into enough detail. This function parses the 172 | * output of remark-parse into a format preferred by this project, 173 | * indicating authors, notes, and links etc. 174 | * 175 | * @param {Object} listItem - a listItem in AST format defined by remark-parse 176 | * 177 | * @return {Object} Returns an Object containing details about the piece of media. 178 | */ 179 | function parseListItem(listItem) { 180 | let entry = {}; 181 | let s = ""; // If we need to build up a string over multiple listItem elements 182 | let leftParen, 183 | rightParen = -1; // If we need to parse parenthesized text 184 | // head of listItem = url, the rest is "other stuff" 185 | const [link, ...otherStuff] = listItem; 186 | entry.url = link.url; 187 | // link.children || link.value => weak way to check if link.type === "link" 188 | entry.title = getLinkTextFromLinkNodes(link.children || link.value); 189 | // remember to get OTHER STUFF!! remember there may be multiple links! 190 | for (let i of otherStuff) { 191 | if (s === "") { 192 | // this is almost always, except for when we are parsing a multi-element note 193 | if (i.type === "text" && i.value.slice(0, 3) === " - ") { 194 | // author found 195 | let parenIndex = i.value.indexOf("("); 196 | if (parenIndex === -1) { 197 | entry.author = i.value.slice(3).trim(); 198 | } else { 199 | entry.author = i.value.slice(3, parenIndex).trim(); // go from " - " until the first "(" 200 | } 201 | } 202 | if ( 203 | i.type === "emphasis" && 204 | i.children[0].value.slice(0, 1) === "(" && 205 | i.children[0].value.slice(-1) === ")" 206 | ) { 207 | // access notes found (currently assumes exactly one child, so far this is always the case) 208 | entry.accessNotes = i.children[0].value.slice(1, -1); 209 | } 210 | if (i.type === "link") { 211 | // other links found 212 | if (entry.otherLinks === undefined) entry.otherLinks = []; 213 | entry.otherLinks.push({ 214 | title: Strings.stripParens(getLinkTextFromLinkNodes(i.children)), 215 | url: i.url, 216 | }); 217 | // entry.otherLinks = [...entry.otherLinks, {title: i.children[0].value, url: i.url}]; // <-- i wish i could get this syntax to work with arrays 218 | } 219 | if (i.type === "text" && i.value.indexOf("(") !== -1) { 220 | // notes found (currently assumes no nested parentheses) 221 | if (entry.notes === undefined) entry.notes = []; 222 | leftParen = i.value.indexOf("("); 223 | while (leftParen != -1) { 224 | rightParen = i.value.indexOf(")", leftParen); 225 | if (rightParen === -1) { 226 | // there must be some *emphasis* found 227 | s += i.value.slice(leftParen); 228 | break; 229 | } 230 | entry.notes.push(i.value.slice(leftParen + 1, rightParen)); 231 | leftParen = i.value.indexOf("(", rightParen); 232 | } 233 | } 234 | } else { 235 | // for now we assume that all previous ifs are mutually exclusive with this, may polish later 236 | if (i.type === "emphasis") { 237 | // this is the emphasis, add it in boldface and move on 238 | s += "*" + i.children[0].value + "*"; 239 | } else if (i.type === "link") { 240 | // something has gone terribly wrong. this book must be viewed and edited manually. 241 | entry.manualReviewRequired = true; 242 | break; 243 | } else { 244 | // hopefully this is the end of the note 245 | let rightParen = i.value.indexOf(")"); 246 | if (rightParen === -1) { 247 | // we have to go AGAIN 248 | s += i.value; 249 | } else { 250 | // finally, we have reached the end of the note 251 | entry.notes.push( 252 | Strings.stripParens(s + i.value.slice(0, rightParen + 1)) 253 | ); 254 | s = ""; 255 | // this is a copypaste of another block of code. probably not a good thing tbh. 256 | leftParen = i.value.indexOf("("); 257 | while (leftParen != -1) { 258 | rightParen = i.value.indexOf(")", leftParen); 259 | if (rightParen === -1) { 260 | // there must be some *emphasis* found 261 | s += i.value.slice(leftParen); 262 | break; 263 | } 264 | entry.notes.push(i.value.slice(leftParen + 1, rightParen)); 265 | leftParen = i.value.indexOf("(", rightParen); 266 | } 267 | } 268 | } 269 | } 270 | } 271 | return entry; 272 | } 273 | 274 | /** 275 | * Determines the language a certain file is based on the format 276 | * from the FreeEbookFoundation GitHub page 277 | * @param {String} filename A filename in the format kept by all markdown files on the FreeProgrammingBooks Github 278 | * @returns {String} The language the file is 279 | */ 280 | function getLangFromFilename(filename) { 281 | const dash = filename.lastIndexOf("-"); 282 | const dot = filename.lastIndexOf("."); 283 | let lang = filename.slice(dash + 1, dot).replace(/_/, "-"); 284 | let isSubject = false; 285 | if (!languages.hasOwnProperty(lang)) { 286 | if (/^[a-z]{2}$/.test(lang) || /^[a-z]{2}-[A-Z]{2}$/.test(lang)) { 287 | return ""; 288 | } 289 | // console.log(lang); 290 | if (lang === "subjects") { 291 | isSubject = true; 292 | } 293 | lang = "en"; 294 | } 295 | return { lang: lang, isSubject: isSubject }; 296 | } 297 | 298 | /** 299 | * Gets all markdown files in a directory, 300 | * @param {String} dir - A directory path 301 | * @returns A list of all md files in a directory, excluding those in the excludes array 302 | */ 303 | function getFilesFromDir(dir) { 304 | return fs 305 | .readdirSync(dir) 306 | .filter( 307 | (file) => path.extname(file) === ".md" && excludes.indexOf(file) === -1 308 | ) 309 | .map((file) => path.join(dir, file)); 310 | } 311 | 312 | /** 313 | * Retrieves the folder name from a string representing a directory and file 314 | * @param {String} str - A string representing a path directory alike in the format "./directory/file" 315 | * @returns {String} The extracted directory name 316 | */ 317 | function getMediaTypeFromDirectoryPath(str) { 318 | str = path.resolve(str); // sanatize and expand (OS independent) 319 | let type; 320 | if (fs.lstatSync(str).isDirectory()) { 321 | // if path is itself a directory, use it name as result 322 | type = path.basename(str); 323 | } else { 324 | // if not... parent/previous slug is always a directory; extract this part 325 | // path.sep: Windows -> "\", Unix -> "/" 326 | type = str.split(path.sep).slice(-2, -1).join(path.sep); 327 | } 328 | return type; 329 | } 330 | 331 | /** 332 | * Turns a single markdown file into the json structure needed 333 | * @param {path} doc - a single file path to a markdown file 334 | * @returns {object} Json object of entries in the md file 335 | */ 336 | function parseMarkdown(doc) { 337 | let tree = remark.parse(doc).children; 338 | let sections = []; // This will go into root object later 339 | let errors = []; 340 | let currentDepth = 3; // used to determine if the last heading was an h4 or h3 341 | 342 | // find where Index ends 343 | // probably could be done better, review later 344 | let i = 0, 345 | count = 0; 346 | for (i; i < tree.length; i++) { 347 | if (tree[i].type == "heading" && tree[i].depth == "3") count++; 348 | if (count == 2) break; 349 | } 350 | 351 | tree.slice(i).forEach((item) => { 352 | // Start iterating after Index 353 | try { 354 | if (item.type == "heading") { 355 | const sectionName = getSectionNameFromHeadingContent(item.children); 356 | if (sectionName == "Index") return; 357 | if (item.depth == 3) { 358 | // Heading is an h3 359 | currentDepth = 3; 360 | // create section record 361 | let newSection = { 362 | section: sectionName, 363 | entries: [], 364 | subsections: [], 365 | }; 366 | // Push the section to the output array 367 | sections.push(newSection); 368 | } else if (item.depth == 4) { 369 | // Heading is an h4 370 | currentDepth = 4; 371 | // create subsection record 372 | let newSubsection = { 373 | section: sectionName, 374 | entries: [], 375 | }; 376 | // Add to subsection array of most recent h3 377 | sections[sections.length - 1].subsections.push(newSubsection); 378 | } 379 | } else if (item.type == "list") { 380 | item.children.forEach((listItem) => { 381 | let content = listItem.children[0].children; // gets array containing a remark-link and a remark-paragraph 382 | // if(content[0].type !== 'link'){ // SKIPS OVER bad formatting 383 | // return; 384 | // } 385 | if (currentDepth == 3) { 386 | let contentJson = parseListItem(content); 387 | sections[sections.length - 1].entries.push(contentJson); // add the entry to most recent h3 388 | } else if (currentDepth == 4) { 389 | let lastSection = sections.length - 1; 390 | let lastSubSec = sections[lastSection].subsections.length - 1; 391 | let contentJson = parseListItem(content); 392 | sections[lastSection].subsections[lastSubSec].entries.push( 393 | contentJson 394 | ); // add entry to most recent h4 395 | } 396 | }); 397 | } 398 | } catch (e) { 399 | // if there was an error while parsing, print the error to an error log 400 | // looks really ugly, maybe try to refine output later 401 | let errStart = JSON.stringify(item.position.start.line); 402 | let errEnd = JSON.stringify(item.position.end.line); 403 | str = `Error at line ${errStart} - line ${errEnd}.`; 404 | errors.push(str); 405 | } 406 | }); 407 | return { sections: sections, errors: errors }; 408 | } 409 | 410 | /** 411 | * Parses a single directory's md files and converts them into usable json 412 | * @param {String} directory A string pointing to a directory 413 | * @returns {Object} An object containing two values, dirJson and dirErrors. 414 | * dirJson contains all data that was successfully parsed from 415 | * the markdown files. dirErrors contains all entries that had 416 | * an error occur while parsing. 417 | */ 418 | function parseDirectory(directory) { 419 | let dirChildren = []; // this will hold the output each markdown doc 420 | let dirErrors = []; //contains error for a given directory 421 | 422 | let mediaType = getMediaTypeFromDirectoryPath(directory); 423 | const filenames = getFilesFromDir(path.resolve(directory)); 424 | filenames.forEach((filename) => { 425 | const doc = fs.readFileSync(filename); 426 | let { sections, errors } = parseMarkdown(doc); // parse the markdown document 427 | const { lang, isSubject } = getLangFromFilename(filename); 428 | 429 | // Entries 430 | let docJson = { 431 | language: { 432 | code: lang, 433 | name: languages[lang], 434 | }, 435 | index: {}, 436 | sections: sections, 437 | }; 438 | if (lang === "en") docJson.language.isSubject = isSubject; 439 | dirChildren.push(docJson); 440 | 441 | // Errors 442 | if (errors.length !== 0) { 443 | let docErrors = { 444 | file: path.basename(filename), 445 | errors: errors, 446 | }; 447 | dirErrors.push(docErrors); 448 | } 449 | }); 450 | 451 | // File entries 452 | let dirJson = { 453 | type: mediaType, 454 | index: {}, 455 | children: dirChildren, 456 | }; 457 | 458 | // Errors 459 | 460 | return { dirJson: dirJson, dirErrors: dirErrors }; 461 | } 462 | 463 | /** 464 | * Reads all given directories for markdown files and prints the parsed json in the output directory 465 | * 466 | * @param {Array} directories A list of strings of directories to scan for markdown files 467 | * @param {String} output A string for the path that the output should be placed in 468 | */ 469 | function parseAll(directories, output) { 470 | let rootChildren = []; // this will hold the output of each directory 471 | let rootErrors = []; 472 | 473 | directories.forEach((directory) => { 474 | let { dirJson, dirErrors } = parseDirectory(directory); 475 | rootChildren.push(dirJson); 476 | if (dirErrors.length !== 0) { 477 | rootErrors.push({ 478 | directory: path.basename(directory), 479 | files: dirErrors, 480 | }); 481 | } 482 | }); 483 | 484 | // ALl entries 485 | let rootJson = { 486 | type: "root", 487 | children: rootChildren, 488 | }; 489 | 490 | // Errors 491 | let allErrors = { 492 | type: "root", 493 | directories: rootErrors, 494 | }; 495 | fs.writeFileSync(output, JSON.stringify(rootJson, null, 3), function (err) { 496 | if (err) { 497 | console.log(err); 498 | } 499 | }); 500 | // fs.writeFileSync( 501 | // "./parser/fpb.log", 502 | // JSON.stringify(allErrors, null, 3), 503 | // function (err) { 504 | // if (err) { 505 | // console.log(err); 506 | // } 507 | // } 508 | // ); 509 | } 510 | 511 | let { input, output } = commandLineArgs(optionDefinitions); 512 | parseAll(input, output); 513 | --------------------------------------------------------------------------------