├── .gitignore ├── .npmignore ├── README.md ├── bin └── gitbook-convert.js ├── package.json ├── scripts └── resources.sh └── src ├── converters ├── docx.js ├── html-base.js ├── html.js ├── index.js ├── markdown-filters.js ├── odt.js ├── types │ ├── chapter.js │ └── readme.js ├── utils.js └── xml.js └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | resources/ 3 | test_docs/ 4 | lib/ 5 | 6 | .eslintrc 7 | *.docx 8 | *.xml 9 | *.html 10 | *.css 11 | *.md 12 | *.odt 13 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | !resources/ 2 | !lib/ 3 | src/ 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gitbook-convert 2 | 3 | [![NPM version](https://badge.fury.io/js/gitbook-convert.svg)](http://badge.fury.io/js/gitbook-convert.svg) 4 | 5 | > CLI to convert an existing document to a GitBook. 6 | 7 | ## Install 8 | 9 | Install this globally and you'll have access to the `gitbook-convert` command anywhere on your system. 
10 | 11 | ```shell 12 | $ npm install gitbook-convert -g 13 | ``` 14 | 15 | ## Use 16 | 17 | ```shell 18 | $ gitbook-convert [options] [export-directory] 19 | ``` 20 | 21 | ### Options 22 | 23 | | Short | Long | Description | Type | Default | 24 | | ----- | ---- | ----------- | ---- | ------- | 25 | | -t | --document-title | Name used for the main document title | string | null | 26 | | -a | --assets-dir | Name of the document's assets export directory | string | assets | 27 | | -m | --max-depth | Maximum title depth to use to split your original document into sub-chapters | integer | 2 | 28 | | -p | --prefix | Prefix filenames by an incremental counter | flag | false | 29 | | -d | --debug | Log stack trace when an error occurs | flag | false | 30 | 31 | After converting your document, the corresponding GitBook files will be placed in the provided `export-directory` folder. The folder is created during conversion. 32 | 33 | If `export-directory` is not provided, a new `export` folder is created in the current working directory. The GitBook files are then placed here. 34 | 35 | If the `--document-title` argument is not passed, the filename without the file extension will be used as the main document title. 36 | 37 | ### Currently accepted formats 38 | 39 | | Type | Extension | 40 | | ---- | --------- | 41 | | Microsoft Office Open XML Document | .docx | 42 | | OpenOffice / Open Document Format | .odt | 43 | | Docbook Markup Language | .xml | 44 | | HyperText Markup Language | .html | 45 | 46 | ## Output 47 | 48 | This version of `gitbook-convert` generates markdown files only. Support for asciidoc might be added later. 49 | 50 | ### Document processing 51 | 52 | `gitbook-convert` divides your original document into chapters and sub-chapters, if any, one per output file. To do this, `gitbook-convert` automatically detects the headers in your document and uses the `-m` flag to split it into sub-chapters. 
53 | 54 | When converting a Docbook file though, the depth is always detected automatically. 55 | 56 | Thus, converting the following document named **History of modern computers.docx** with the default `--max-depth` flag: 57 | > # Chapter 1 58 | > What the world used to be. 59 | > ## The beginning 60 | > At the beginning was the big bang... 61 | > ## The following 62 | > Strange creatures called “humans” had trouble living in peace... 63 | > # Chapter 2 64 | > What the world is now. 65 | > ## The awakening 66 | > Computers came to rule the world... 67 | > ## The end 68 | > The power supply went disconnected. 69 | 70 | will produce the following output: 71 | ```shell 72 | user @ cwd/export/history_of_modern_computers 73 | README.md 74 | SUMMARY.md 75 | assets/ 76 | chapter_1/ 77 | README.md 78 | the_beginning.md 79 | the_following.md 80 | chapter_2/ 81 | README.md 82 | the_awakening.md 83 | the_end.md 84 | ``` 85 | 86 | While using `1` for `--max-depth` would produce: 87 | ```shell 88 | user @ cwd/export/history_of_modern_computers 89 | chapter_1.md 90 | chapter_2.md 91 | README.md 92 | SUMMARY.md 93 | assets/ 94 | ``` 95 | 96 | ### Summary 97 | 98 | The `SUMMARY.md` file is created automatically. 99 | 100 | For our first example: 101 | 102 | ```markdown 103 | # Summary 104 | 105 | * [Introduction](README.md) 106 | * [Chapter 1](chapter_1/README.md) 107 | * [The beginning](chapter_1/the_beginning.md) 108 | * [The following](chapter_1/the_following.md) 109 | * [Chapter 2](chapter_2/README.md) 110 | * [The awakening](chapter_2/the_awakening.md) 111 | * [The end](chapter_2/the_end.md) 112 | ``` 113 | 114 | With `--max-depth` set to `1`: 115 | 116 | ```markdown 117 | # Summary 118 | 119 | * [Introduction](README.md) 120 | * [Chapter 1](chapter_1.md) 121 | * [Chapter 2](chapter_2.md) 122 | ``` 123 | 124 | ### README 125 | 126 | The content of the `README.md` file depends on your document structure. 
Anyways, the filename of your original document will be used as the main title here. 127 | 128 | ##### Original document starts with a main header 129 | 130 | `gitbook-convert` creates the default GitBook `README.md` file: 131 | 132 | ```markdown 133 | # History of modern computers 134 | 135 | This file serves as your book's preface, a great place to describe your book's content and ideas. 136 | ``` 137 | 138 | ##### Original document has an introduction 139 | Otherwise, everything before the first main header is used as the `README.md` content. If we modify our example to be: 140 | 141 | > A short history of modern computers. 142 | > # Chapter 1 143 | > ## The beginning 144 | > At the beginning was the big bang... 145 | > ## The following 146 | > ... 147 | 148 | The content of the `README.md` file will be: 149 | 150 | ```markdown 151 | # History of modern computers 152 | 153 | A short history of modern computers. 154 | ``` 155 | 156 | The behavior is the same when `--max-depth` is set to higher levels. Each `README.md` in the sub-chapters folders will contain the preface for the current chapter. 157 | 158 | ## Converters 159 | 160 | The appropriate converter for a document type is deduced from its extension. 161 | 162 | For now, the converters should: 163 | * be placed in `lib/converters`, 164 | * with its filename being the document-type extension, for example `/lib/converters/docx.js`, 165 | * added to the `lib/converters/index.js` file for reference and use. 166 | 167 | ### docx 168 | 169 | The `.docx` converter uses mwilliamson's [mammoth.js](https://github.com/mwilliamson/mammoth.js) to convert your document to HTML before generating the output. 170 | 171 | `gitbook-convert` will try to export your inline images in the `/assets` folder, using the image title as the image filename if provided. 
172 | 173 | ### odt 174 | 175 | The `.odt` converter uses [odt2html](https://github.com/GitbookIO/odt2html) to convert your document to HTML before generating the output. Because there was no node module out there to convert OpenOffice documents to HTML, we built our own. 176 | 177 | `gitbook-convert` will try to export your inline images in the `/assets` folder, using the image name in the document as the image filename if provided. 178 | 179 | ### docbook 180 | 181 | `gitbook-convert` requires [**xsltproc**](http://xmlsoft.org/XSLT/xsltproc.html) to be installed to process a Docbook. If you are using macOS or a Linux distribution, it should be installed by default. 182 | 183 | You can test that **xsltproc** is installed using: 184 | ```shell 185 | $ which xsltproc 186 | ``` 187 | 188 | **xsltproc** uses the latest version of [docbook.xsl](http://sourceforge.net/projects/docbook/files/docbook-xsl/) to convert your Docbook to HTML first. Since the [Docbook XML markup is very large](http://www.docbook.org/tdg5/en/html/chunk-part-d64e8789.html), `gitbook-convert` will try to convert the meta-data as well as possible. Extended conversion might be added to the tool based on user requests. 189 | 190 | When you install `gitbook-convert` using [npm](https://www.npmjs.com), the [docbook.xsl](http://sourceforge.net/projects/docbook/files/docbook-xsl/) stylesheets are downloaded and installed along with the app. 191 | 192 | We recommend using the tool with Docbook version 5. [Here is a walk-through](http://doccookbook.sourceforge.net/html/en/dbc.structure.db4-to-db5.html) for converting an existing Docbook in version 4 to version 5. -------------------------------------------------------------------------------- /bin/gitbook-convert.js: -------------------------------------------------------------------------------- 1 | #! 
#! /usr/bin/env node
/* eslint-disable no-console */

// Command-line entry point: parses the CLI arguments, picks the converter
// matching the input document and launches the conversion to a GitBook.

var _ = require('lodash');
var program = require('commander');

var gitbookConvert = require('../lib/index');

var ALLOWED_FORMATS = require('../lib/converters').ALLOWED_FORMATS;
var pkg = require('../package.json');

// Describe program options
program
    .version(pkg.version)
    // Advertise both positional arguments that are read from program.args below
    .usage('[options] <file> [export-directory]')
    .option('-t, --document-title [string]', 'Name used for the main document title', null)
    .option('-a, --assets-dir [dirname]', 'Name of the document\'s assets export directory', 'assets')
    .option('-m, --max-depth [integer]', 'Maximum title depth to use to split your original document into sub-chapters', 2)
    .option('-p, --prefix', 'Prefix filenames by an incremental counter')
    .option('-d, --debug', 'Log stack trace when an error occurs');

// Customize --help flag
program.on('--help', function() {
    console.log(' gitbook-convert accepts the following formats:');
    console.log('');
    ALLOWED_FORMATS.forEach(function(format) {
        console.log(' .' + format.ext + ': ' + format.description);
    });
    console.log('');
    console.log(' After converting your document, the corresponding GitBook files will be placed in ./export//.');
});

// Parse passed arguments once; the result is available on `program`
program.parse(process.argv);

// Fallback to help if no args
if (_.isEmpty(program.args) && process.argv.length === 2) {
    program.help();
}

// Validate --max-depth up front: a non-numeric value would otherwise become
// NaN and silently disable the depth limit used to split the document
var titleDepth = parseInt(program.maxDepth, 10);
if (isNaN(titleDepth) || titleDepth < 0) {
    console.log('Invalid --max-depth value: ' + program.maxDepth + ' (expected a non-negative integer)');
    process.exit(1);
}

// Construct converters options
var opts = {
    filename:        program.args[0],
    exportDir:       program.args[1] || 'export',
    documentTitle:   program.documentTitle,
    assetsDirectory: program.assetsDir,
    titleDepth:      titleDepth,
    prefix:          program.prefix,
    debug:           program.debug
};

// Get a converter based on filename
var converter;
try {
    converter = gitbookConvert.pickConverter(opts);
}
catch (err) {
    console.log(err.message);
    if (program.debug) {
        console.log(err.stack);
    }

    process.exit(1);
}

// Launch conversion to a GitBook
converter.convert();
{ 28 | "babel-polyfill": "^6.13.0", 29 | "brightml": "^3.0.7", 30 | "cheerio": "git://github.com/cheeriojs/cheerio.git#70c5608113d3efaf584efd29edafe173b74e106f", 31 | "commander": "^2.9.0", 32 | "lodash": "^3.10.1", 33 | "mammoth": "^0.3.29", 34 | "normall": "^0.2.2", 35 | "odt2html": "1.0.1", 36 | "q": "^1.4.1", 37 | "to-markdown": "git://github.com/jpreynat/to-markdown.git#c88526cffbb62cd27faedcddcf279a946bc6de6e" 38 | }, 39 | "contributors": [ 40 | { 41 | "name": "Johan Preynat", 42 | "email": "johan@gitbook.com" 43 | } 44 | ], 45 | "devDependencies": { 46 | "babel-cli": "^6.14.0", 47 | "babel-preset-es2015": "^6.14.0", 48 | "eslint": "^3.4.0" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /scripts/resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RESOURCES_DIR="./resources" 4 | DOCBOOK_DIR="./docbook" 5 | TAR_FILE="docbook.tar.bz2" 6 | 7 | # Create resources directory 8 | echo "Creating resources directory..." 9 | mkdir $RESOURCES_DIR 10 | cd $RESOURCES_DIR 11 | 12 | # Create docbook directory 13 | echo "Creating docbook stylesheets directory..." 14 | mkdir $DOCBOOK_DIR 15 | cd $DOCBOOK_DIR 16 | 17 | # Download latest docbook.xsl 18 | echo "Downloading docbook.xsl stylesheets..." 19 | URL="http://sourceforge.net/projects/docbook/files/docbook-xsl/1.79.1/docbook-xsl-1.79.1.tar.bz2" 20 | wget -O $TAR_FILE -q --show-progress $URL 21 | 22 | # Inflate and delete zip 23 | echo "Inflating $TAR_FILE..." 24 | tar -xjf $TAR_FILE --strip-components=1 25 | echo "Deleting $TAR_FILE..." 26 | rm $TAR_FILE 27 | 28 | echo "Done." 
/**
 * Converter for .docx documents.
 * Implements toHTML() on top of HTMLBaseConverter using mammoth.
 */
class DocxConverter extends HTMLBaseConverter {
    /**
     * Convert the original document to HTML and store the result in this._html.
     * Inline images are written to the assets folder while converting.
     *
     * @return {Promise} Resolved once this._html is set, rejected with mammoth's error
     */
    toHTML() {
        // counter for default name (altText unavailable)
        let imgCounter = 0;

        // imgExporter exports inline images to the assets folder and apply src attribute to HTML correctly
        const imgExporter = mammoth.images.inline((element) => {
            return element.read()
            .then((imageBuffer) => {
                // Set image file name
                let imgFilename = '';

                // Use altText for image name
                if (element.altText) {
                    imgFilename = element.altText;

                    // Remove extension in altText if is equal to contentType
                    const contentType = `image/${path.extname(imgFilename).slice(1)}`;
                    if (element.contentType === contentType) {
                        imgFilename = imgFilename.split('.').slice(0, -1).join('.');
                    }

                    // Shorten if too long
                    imgFilename = imgFilename.slice(0, 35).trim();

                    // Normalize filename; only done when there is an altText so that
                    // the normalizer is never handed an undefined value
                    imgFilename = normall.filename(imgFilename);
                }

                // Or use default name -> img-NN.ext
                if (!imgFilename) {
                    imgFilename = `img-${imgCounter}`;
                    imgCounter++;
                }

                // Add extension
                imgFilename = `${imgFilename}.${element.contentType.split('/')[1]}`;
                // Create path
                const imgPath = path.join(this._assetsFolder, imgFilename);

                // Write on disk; the outcome is only logged, conversion goes on either way
                fs.writeFile(imgPath, imageBuffer, (err) => {
                    if (err) {
                        logger.log(`Unable to save image ${imgPath}`);
                    }
                    else {
                        logger.log(`Successfully exported image ${imgPath}`);
                    }
                });

                // Return correct HTML src attribute
                return {
                    src: path.resolve(this._projectFolder, imgPath)
                };
            });
        });

        // Set mammoth options
        const mammothOpts = {
            convertImage: imgExporter
        };

        // Convert to HTML
        logger.log('Converting docx file to HTML...');
        // Chain on mammoth's promise instead of resolving a deferred by hand,
        // so that an exception thrown in the handler rejects the returned promise
        return Promise(mammoth.convertToHtml({
            path: this.originalDoc.path
        }, mammothOpts))
        .then((result) => {
            logger.log('Done.');

            // The generated HTML
            this._html = result.value;
            // Any messages, such as warnings during conversion
            if (this.debug) {
                logger.log(result.messages);
            }
        });
    }
}
// Set working directories; 28 | this._projectFolder = path.resolve(process.cwd(), opts.exportDir); 29 | 30 | this.assetsDirectory = opts.assetsDirectory; 31 | this._assetsFolder = path.join(this._projectFolder, this.assetsDirectory); 32 | 33 | this._summaryFile = path.join(this._projectFolder, 'SUMMARY.md'); 34 | 35 | // Other options 36 | this.titleDepth = opts.titleDepth; 37 | this.debug = opts.debug; 38 | this.prefixFilenames = opts.prefix; 39 | } 40 | 41 | /** 42 | * Launch conversion 43 | * @return {Promise} 44 | */ 45 | convert() { 46 | // Check that file exists 47 | return Promise.nfcall(fs.stat, this.originalDoc.path) 48 | // Create folders 49 | .then(() => this.createDirectories()) 50 | // Actually convert to HTML 51 | .fin(() => this.toHTML()) 52 | // Manipulate HTML 53 | .then(() => this.extractFootnotes()) 54 | .then(() => this.parseChapters()) 55 | .then(() => this.processChapters()) 56 | .then(() => this.toMarkdown()) 57 | .then(() => this.writeSummary()) 58 | .then(() => this.writeFiles()) 59 | .then(() => logger.log('Done.')) 60 | .fail(this.handleError); 61 | } 62 | 63 | /** 64 | * Create project directories 65 | * @return {Promise} 66 | */ 67 | createDirectories() { 68 | logger.log('Creating export folder...'); 69 | return Promise.nfcall(fs.mkdir, this._projectFolder) 70 | .then( 71 | () => {}, 72 | (err) => {} 73 | ) 74 | .then(() => { 75 | logger.log('Creating assets folder...'); 76 | return Promise.nfcall(fs.mkdir, this._assetsFolder) 77 | .then( 78 | () => {}, 79 | (err) => {} 80 | ); 81 | }) 82 | .then(() => { 83 | logger.log('Creating summary file...'); 84 | return Promise.nfcall(fs.writeFile, this._summaryFile, '# Summary\n\n') 85 | .then( 86 | () => {}, 87 | (err) => {} 88 | ); 89 | }) 90 | .then(() => logger.log('Done.')); 91 | } 92 | 93 | /** 94 | * Extract footnotes from HTML and store in Converter.footnotes 95 | */ 96 | extractFootnotes() { 97 | logger.log('Extracting footnotes...'); 98 | this.footnotes = {}; 99 | 100 | const $ = 
cheerio.load(this._html); 101 | $('a').each((i, link) => { 102 | // Ensure tag is the only child 103 | const $parent = $(link).parent(); 104 | if (!$parent.length) { 105 | return; 106 | } 107 | 108 | if (!$parent.is('sup')) { 109 | return; 110 | } 111 | 112 | if ($parent.contents().length !== 1) { 113 | return; 114 | } 115 | 116 | // Get origin id and href attributes 117 | const originHref = $(link).attr('href'); 118 | let originId = $(link).attr('id'); 119 | // originId could also be set on parent tag 120 | if (!originId) { 121 | originId = $parent.attr('id'); 122 | } 123 | 124 | // Both id and href must be set in a footnote origin link 125 | if (!originHref || !originId) { 126 | return; 127 | } 128 | 129 | // Check if href is an id-like link 130 | if (_.startsWith(originHref, '#')) { 131 | // Get referenced element 132 | const referencedId = utils.idFromRef(originHref); 133 | const $referencedTag = $(`*[id="${referencedId}"]`).first(); 134 | if (!$referencedTag.length) { 135 | return; 136 | } 137 | 138 | // Check that referred element has a link back to origin 139 | const $linkToOrigin = $(`a[href="#${originId}"]`); 140 | if (!$referencedTag.has($linkToOrigin)) { 141 | return; 142 | } 143 | 144 | // Change referred element to a

tag 145 | let $replacement; 146 | if ($referencedTag.children().length === 1 && $referencedTag.children().first().is('p')) { 147 | $replacement = $referencedTag.children().first(); 148 | } 149 | else { 150 | $replacement = $(`

${$referencedTag.html()}

`); 151 | } 152 | 153 | // Wrap content in a tag if not already and prepend content with origin link text 154 | let prefix; 155 | let content; 156 | if ($replacement.children().first().is('sup')) { 157 | content = $replacement.children().first().html().trim(); 158 | prefix = _.startsWith(content, $(link).text()) ? '' : $(link).text(); 159 | content = `${prefix} ${content}`.trim(); 160 | 161 | $replacement.children().first().html(content); 162 | } 163 | else { 164 | content = $replacement.html().trim(); 165 | prefix = _.startsWith(content, $(link).text()) ? '' : $(link).text(); 166 | content = `${prefix} ${content}`.trim(); 167 | 168 | $replacement.html(`${content}`); 169 | } 170 | 171 | // Copy attributes 172 | const referencedTagAttributes = getTagAttributes($referencedTag); 173 | for (const attr in referencedTagAttributes) { 174 | $replacement.children().first().attr(attr, referencedTagAttributes[attr]); 175 | } 176 | 177 | // Save footnote by reference and remove from DOM 178 | this.footnotes[originHref] = $replacement.html(); 179 | $referencedTag.remove(); 180 | } 181 | }); 182 | 183 | this._html = $.html(); 184 | } 185 | 186 | /** 187 | * Generate the list of chapters from HTML 188 | */ 189 | parseChapters() { 190 | logger.log('Parsing chapters...'); 191 | // Detect maximum title tags depth to split content 192 | this.detectTitleTags(); 193 | 194 | // Create README file 195 | const readme = new Readme('md', this.documentTitle ,this._projectFolder); 196 | 197 | // Actually parse chapters 198 | this.chapters = this.parseHTML(this._html, 0, readme); 199 | 200 | // If no chapters are created, the README should contain the whole document 201 | if (!this.chapters.length) { 202 | readme.content = this._html; 203 | } 204 | 205 | // Flatten list of chapters 206 | this.chapters = _.chain(this.chapters) 207 | .map((chapter, i) => [chapter].concat(chapter.getChildrenDeep())) 208 | .flatten(true) 209 | .value(); 210 | 211 | // Generate chapters filenames 212 | 
this.chapters.forEach(chapter => chapter.generateFilename('md', this.prefixFilenames)); 213 | 214 | // Insert Readme at the beginning of chapters array 215 | readme.addTitleToContent(); 216 | this.chapters.unshift(readme); 217 | } 218 | 219 | /** 220 | * Get the list of header tags that should be parsed as chapters 221 | */ 222 | detectTitleTags() { 223 | const tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']; 224 | const titleTags = []; 225 | 226 | const $ = cheerio.load(this._html); 227 | tags.forEach((tag) => { 228 | // Maximum depth reached 229 | if (titleTags.length === this.titleDepth) { 230 | return; 231 | } 232 | 233 | // Found at list a title for current 234 | if (Boolean($(tag).length)) { 235 | titleTags.push(tag); 236 | } 237 | }); 238 | 239 | this.titleTags = titleTags; 240 | } 241 | 242 | /** 243 | * Parse HTML content and render the chapters tree recursively 244 | * 245 | * @param {String} html HTML content string 246 | * @param {Number} level Current tree level 247 | * @param {Chapter} parent Chapter to use as parent of parsed chapters 248 | * @return {Array} 249 | */ 250 | parseHTML(html, level, parent) { 251 | // Call recursively based on titleTags length 252 | const chapters = []; 253 | if (level + 1 > this.titleTags.length) { 254 | return chapters; 255 | } 256 | 257 | const tag = this.titleTags[level]; 258 | const tagLevel = tag.slice(-1); 259 | const tagSplitter = new RegExp(`(\\<${tag}.*?${tag}\\>)`, 'g'); 260 | const tagDetector = new RegExp(`\\<${tag}`); 261 | 262 | const parts = html.split(tagSplitter); 263 | 264 | // Grab first part if not a title as content of parent 265 | if (!tagDetector.test(parts[0])) { 266 | let preface = parts.shift(); 267 | preface = preface.trim(); 268 | 269 | if (Boolean(preface)) { 270 | parent.content = preface; 271 | } 272 | } 273 | 274 | let chapter; 275 | parts.forEach((part) => { 276 | // Match a current level title 277 | if (tagDetector.test(part)) { 278 | // Create a new chapter 279 | chapter = new 
Chapter(this._projectFolder); 280 | 281 | const info = parseTitleInfo(part, tagLevel); 282 | 283 | chapter.level = level; 284 | chapter.title = info.title; 285 | chapter.titleId = info.titleId; 286 | 287 | chapter.parent = (level > 0) ? parent : null; 288 | chapter.titleHTML = part; 289 | } 290 | // Match a current level content 291 | else { 292 | // Get subchapters 293 | chapter.children = this.parseHTML(part, level + 1, chapter); 294 | if (!chapter.children.length) { 295 | chapter.content = part; 296 | } 297 | 298 | chapter.content = chapter.titleHTML + chapter.content; 299 | chapter.content = chapter.content.trim(); 300 | chapter.num = chapters.length + 1; 301 | 302 | delete chapter.titleHTML; 303 | chapters.push(chapter); 304 | } 305 | }); 306 | 307 | return chapters; 308 | } 309 | 310 | /** 311 | * Format parsed HTML 312 | */ 313 | processChapters() { 314 | logger.log('Processing chapters...'); 315 | // Set titles and footnotes 316 | this.chapters.forEach((chapter, index) => { 317 | // Don't erase footnotes for other chapters 318 | const footnotes = _.cloneDeep(this.footnotes); 319 | // Reset footnotes in each correct chapter 320 | chapter.setFootnotes(footnotes); 321 | }); 322 | 323 | // Clean and resolve links 324 | this.chapters.forEach((chapter) => { 325 | // Clean HTML 326 | try { 327 | chapter.cleanHTML(); 328 | } 329 | catch (err) { 330 | this.handleError(err); 331 | } 332 | }); 333 | 334 | // Normalize titles id 335 | this.chapters.forEach((chapter, index) => { 336 | const siblings = this.chapters.filter((c, pos) => pos !== index); 337 | chapter.normalizeTitlesId(siblings); 338 | }); 339 | 340 | // Resolve links 341 | this.chapters.forEach((chapter, index) => { 342 | const siblings = this.chapters.filter((c, pos) => pos !== index); 343 | 344 | chapter.resolveLinks(siblings); 345 | chapter.resolveAssetsLinks(); 346 | }); 347 | } 348 | 349 | /** 350 | * Convert HTML to markdown 351 | */ 352 | toMarkdown() { 353 | logger.log('Converting chapters to 
markdown...'); 354 | this.chapters.forEach(chapter => chapter.toMarkdown()); 355 | } 356 | 357 | /** 358 | * Add each chapter to SUMMARY 359 | * @return {Promise} 360 | */ 361 | writeSummary() { 362 | logger.log('Writing summary...'); 363 | 364 | return this.chapters.reduce((prev, chapter) => { 365 | return prev.then(() => { 366 | // Create padding for subchapters 367 | let padding = ''; 368 | while (padding.length < chapter.level * 2) { 369 | padding += ' '; 370 | } 371 | 372 | // Add summary entry 373 | const entry = `${padding}* [${chapter.title}](${chapter.summaryPath})\n`; 374 | 375 | return Promise.nfcall(fs.appendFile, this._summaryFile, entry) 376 | .fail(this.handleError); 377 | }); 378 | }, Promise()); 379 | } 380 | 381 | /** 382 | * Write each chapter to a file on FS 383 | * @return {Promise} 384 | */ 385 | writeFiles() { 386 | // Create a file for each book part 387 | return this.chapters.reduce((prev, chapter) => { 388 | return prev.then(() => { 389 | logger.log(`Writing file: ${chapter.filepath}`); 390 | // Try to create directory 391 | return Promise.nfcall(fs.stat, chapter.path) 392 | .fail((err) => Promise.nfcall(fs.mkdir, chapter.path)) 393 | // Directory exists 394 | .fail((err) => {}) 395 | .then(() => { 396 | // Write converted file 397 | return Promise.nfcall(fs.writeFile, chapter.filepath, chapter.markdown) 398 | .fail(this.handleError); 399 | }); 400 | }); 401 | }, Promise()); 402 | } 403 | 404 | handleError(err) { 405 | logger.error(err.message); 406 | logger.error(err.stack); 407 | 408 | process.exit(1); 409 | } 410 | } 411 | 412 | /** 413 | * Extract the text and id from a header tag 414 | * and returns as an object: 415 | * - titleId: id attribute to use 416 | * - title: text for this title 417 | * 418 | * @param {String} title HTML string for the title 419 | * @param {String} level Header tag level (i.e. 1 for

/**
 * Converter for .html documents.
 * Implements toHTML() on top of HTMLBaseConverter.
 */
class HTMLConverter extends HTMLBaseConverter {
    /**
     * Read the original document and store the HTML to convert in this._html:
     * the content of the body tag if there is one, the whole file otherwise.
     *
     * @return {Promise} Rejected when the file cannot be read or parsed
     */
    toHTML() {
        // Chaining on the read promise stops processing on a read error and
        // turns an exception thrown while parsing into a rejection
        return Promise.nfcall(fs.readFile, this.originalDoc.path, { encoding: 'utf-8' })
        .then((data) => {
            const $ = cheerio.load(data);
            const $body = $('body');

            this._html = $body.length ? $body.html() : data;
        });
    }
}
description: 'Microsoft Office Open XML Document', 10 | ext: 'docx' 11 | }, 12 | { 13 | description: 'HyperText Markup Language', 14 | ext: 'html' 15 | }, 16 | { 17 | description: 'Docbook Markup Language', 18 | ext: 'xml' 19 | }, 20 | { 21 | description: 'OpenOffice / Open Document Format', 22 | ext: 'odt' 23 | } 24 | ], 25 | docx, 26 | html, 27 | xml, 28 | odt 29 | }; 30 | -------------------------------------------------------------------------------- /src/converters/markdown-filters.js: -------------------------------------------------------------------------------- 1 | module.exports = [ 2 | // Handle titles links 3 | { 4 | filter: ['h1', 'h2', 'h3', 'h4','h5', 'h6'], 5 | replacement(content, node) { 6 | const hLevel = node.nodeName.charAt(1); 7 | 8 | let hPrefix = ''; 9 | for (let i = 0; i < hLevel; i++) { 10 | hPrefix += '#'; 11 | } 12 | 13 | let id = ''; 14 | if (Boolean(node.id)) { 15 | id = ` {#${node.id}}`; 16 | } 17 | 18 | return '\n\n' + `${hPrefix} ${content}${id}` + '\n\n'; 19 | } 20 | }, 21 | // Handle footnotes 22 | { 23 | filter: 'sup', 24 | replacement(content, node) { 25 | let reference; 26 | // Origin only contains an tag 27 | if (/A/.test(node.firstChild.tagName) && node.children.length === 1) { 28 | // Reference is the content of the tag 29 | reference = node.firstChild.textContent; 30 | reference = reference.replace(/[^a-zA-Z\d]/g, ''); 31 | 32 | return `[^${reference}]`; 33 | } 34 | else { 35 | // No id attribute, keep as-is 36 | if (!node.id) { 37 | return node.outerHTML; 38 | } 39 | 40 | // Delete back-to-origin link from tag 41 | content = content.replace(/\[[^\]]*\]\(.*\)\s*$/, ''); 42 | // In footnotes, reference is the first "word" 43 | content = content.split(' '); 44 | reference = content.shift(); 45 | reference = reference.replace(/[^a-zA-Z\d]/g, ''); 46 | 47 | return `[^${reference}]: ${content.join(' ').trim()}`; 48 | } 49 | } 50 | }, 51 | { 52 | filter: ['section', 'div', 'span'], 53 | replacement(content, node) { 54 | return 
content; 55 | } 56 | }, 57 | // Treat
as